diff --git a/Makefile b/Makefile
index fc021a9..c33edd9 100644
--- a/Makefile
+++ b/Makefile
@@ -158,18 +158,18 @@ tests : shared
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
	touch $(LIBNAME)
ifndef NO_FBLAS
-	$(MAKE) -C test FC="$(FC)" CC="$(CC)" COMMON_OPT="$(COMMON_OPT)" FCOMMON_OPT="$(FCOMMON_OPT)" all
+	$(MAKE) -C test all
endif
endif
ifneq ($(ONLY_CBLAS), 1)
-	$(MAKE) -C utest FC="$(FC)" CC="$(CC)" COMMON_OPT="$(COMMON_OPT)" FCOMMON_OPT="$(FCOMMON_OPT)" all
+	#$(MAKE) -C utest all
endif
ifneq ($(NO_CBLAS), 1)
ifneq ($(ONLY_CBLAS), 1)
-	$(MAKE) -C ctest FC="$(FC)" CC="$(CC)" COMMON_OPT="$(COMMON_OPT)" FCOMMON_OPT="$(FCOMMON_OPT)" all
+	$(MAKE) -C ctest all
endif
ifeq ($(CPP_THREAD_SAFETY_TEST), 1)
-	$(MAKE) -C cpp_thread_test FC="$(FC)" CC="$(CC)" COMMON_OPT="$(COMMON_OPT)" FCOMMON_OPT="$(FCOMMON_OPT)" all
+	$(MAKE) -C cpp_thread_test all
endif
endif

diff --git a/Makefile.sw_64 b/Makefile.sw_64
new file mode 100644
index 0000000..b4542ce
--- /dev/null
+++ b/Makefile.sw_64
@@ -0,0 +1,35 @@
+CPP = $(CC) -E
+RANLIB = ranlib
+
+ifeq ($(LIBSUBARCH), SW6)
+LIBNAME = $(LIBPREFIX)_sw6.a
+LIBNAME_P = $(LIBPREFIX)_sw6_p.a
+endif
+
+ifneq ($(COMPILER), NATIVE)
+# GCC User
+ifeq ($(LIBSUBARCH), SW6)
+OPTION += -DSW6 -mcpu=sw6
+endif
+else
+# Compaq Compiler User
+ifeq ($(LIBSUBARCH), SW6)
+OPTION += -DSW6 -tune sw6 -arch sw6
+endif
+endif
+
+ifeq ($(F_COMPILER), GFORTRAN)
+FCOMMON_OPT += -mieee
+endif
+
+ifeq ($(F_COMPILER), G77)
+FCOMMON_OPT += -mieee
+endif
+
+ifndef SMP
+LIBCXML = -lcxml -lots -lm
+LIBATLAS = -L/usr/lib/atlas3.7.8 -lf77blas -latlas -lm
+else
+LIBCXML = -lcxmlp -lots -lm
+LIBATLAS = -L/usr/lib/atlas3.7.8p -llapack -lptcblas -lptf77blas -latlas -lpthread -lm
+endif
diff --git a/Makefile.system b/Makefile.system
index 3be47c6..ae90af3 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -42,6 +42,8 @@ else ifeq ($(ARCH), mips64el)
override ARCH=mips64
else ifeq ($(ARCH), zarch)
override ARCH=zarch
+else ifeq ($(ARCH), sw_64)
+override ARCH=sw_64
endif

NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib
@@ -809,6 +811,11 @@ NO_BINARY_MODE = 1
BINARY_DEFINED = 1
endif

+ifeq ($(ARCH), sw_64)
+NO_BINARY_MODE = 1
+BINARY_DEFINED = 1
+endif
+
ifeq ($(ARCH), arm)
NO_BINARY_MODE = 1
BINARY_DEFINED = 1
diff --git a/Makefile.system.libname b/Makefile.system.libname
|
|
deleted file mode 100644
|
|
index 1b84195..0000000
|
|
--- a/Makefile.system.libname
|
|
+++ /dev/null
|
|
@@ -1,1860 +0,0 @@
|
|
-#
|
|
-# Include user definition
|
|
-#
|
|
-
|
|
-# TO suppress recursive includes
|
|
-INCLUDED = 1
|
|
-
|
|
-ifndef TOPDIR
|
|
-TOPDIR = .
|
|
-endif
|
|
-
|
|
-ifndef RELAPACK_REPLACE
|
|
-RELAPACK_REPLACE=0
|
|
-endif
|
|
-
|
|
-# we need to use the host system's architecture for getarch compile options even especially when cross-compiling
|
|
-HOSTARCH := $(shell uname -m)
|
|
-ifeq ($(HOSTARCH), amd64)
|
|
-HOSTARCH=x86_64
|
|
-endif
|
|
-
|
|
-# Catch conflicting usage of ARCH in some BSD environments
|
|
-ifeq ($(ARCH), amd64)
|
|
-override ARCH=x86_64
|
|
-else ifeq ($(ARCH), powerpc64)
|
|
-override ARCH=power
|
|
-else ifeq ($(ARCH), powerpc64le)
|
|
-override ARCH=power
|
|
-else ifeq ($(ARCH), powerpc)
|
|
-override ARCH=power
|
|
-else ifeq ($(ARCH), i386)
|
|
-override ARCH=x86
|
|
-else ifeq ($(ARCH), armv6)
|
|
-override ARCH=arm
|
|
-else ifeq ($(ARCH), armv7)
|
|
-override ARCH=arm
|
|
-else ifeq ($(ARCH), aarch64)
|
|
-override ARCH=arm64
|
|
-else ifeq ($(ARCH), mipsel)
|
|
-override ARCH=mips
|
|
-else ifeq ($(ARCH), mips64el)
|
|
-override ARCH=mips64
|
|
-else ifeq ($(ARCH), zarch)
|
|
-override ARCH=zarch
|
|
-endif
|
|
-
|
|
-NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib
|
|
-
|
|
-# Default C compiler
|
|
-# - Only set if not specified on the command line or inherited from the environment.
|
|
-# - CC is an implicit variable so neither '?=' or 'ifndef' can be used.
|
|
-# http://stackoverflow.com/questions/4029274/mingw-and-make-variables
|
|
-# - Default value is 'cc' which is not always a valid command (e.g. MinGW).
|
|
-ifeq ($(origin CC),default)
|
|
-
|
|
-# Check if $(CC) refers to a valid command and set the value to gcc if not
|
|
-ifneq ($(findstring cmd.exe,$(SHELL)),)
|
|
-ifeq ($(shell where $(CC) 2>NUL),)
|
|
-CC = gcc
|
|
-endif
|
|
-else # POSIX-ish
|
|
-ifeq ($(shell command -v $(CC) 2>/dev/null),)
|
|
-ifeq ($(shell uname -s),Darwin)
|
|
-CC = clang
|
|
-# EXTRALIB += -Wl,-no_compact_unwind
|
|
-else
|
|
-CC = gcc
|
|
-endif # Darwin
|
|
-endif # CC exists
|
|
-endif # Shell is sane
|
|
-
|
|
-endif # CC is set to default
|
|
-
|
|
-# Default Fortran compiler (FC) is selected by f_check.
|
|
-
|
|
-ifndef MAKEFILE_RULE
|
|
-include $(TOPDIR)/Makefile.rule
|
|
-else
|
|
-include $(TOPDIR)/$(MAKEFILE_RULE)
|
|
-endif
|
|
-
|
|
-#
|
|
-# Beginning of system configuration
|
|
-#
|
|
-ifneq ($(BUILD_SINGLE),1)
|
|
-ifneq ($(BUILD_DOUBLE),1)
|
|
-ifneq ($(BUILD_COMPLEX),1)
|
|
-ifneq ($(BUILD_COMPLEX16),1)
|
|
-override BUILD_SINGLE=1
|
|
-override BUILD_DOUBLE=1
|
|
-override BUILD_COMPLEX=1
|
|
-override BUILD_COMPLEX16=1
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifndef HOSTCC
|
|
-HOSTCC = $(CC)
|
|
-endif
|
|
-
|
|
-ifdef TARGET
|
|
-GETARCH_FLAGS := -DFORCE_$(TARGET)
|
|
-GETARCH_FLAGS += -DUSER_TARGET
|
|
-ifeq ($(TARGET), GENERIC)
|
|
-ifeq ($(DYNAMIC_ARCH), 1)
|
|
-override NO_EXPRECISION=1
|
|
-export NO_EXPRECISION
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-
|
|
-# Force fallbacks for 32bit
|
|
-
|
|
-ifeq ($(BINARY), 32)
|
|
-ifeq ($(TARGET), HASWELL)
|
|
-GETARCH_FLAGS := -DFORCE_NEHALEM
|
|
-endif
|
|
-ifeq ($(TARGET), SKYLAKEX)
|
|
-GETARCH_FLAGS := -DFORCE_NEHALEM
|
|
-endif
|
|
-ifeq ($(TARGET), COOPERLAKE)
|
|
-GETARCH_FLAGS := -DFORCE_NEHALEM
|
|
-endif
|
|
-ifeq ($(TARGET), SAPPHIRERAPIDS)
|
|
-GETARCH_FLAGS := -DFORCE_NEHALEM
|
|
-endif
|
|
-ifeq ($(TARGET), SANDYBRIDGE)
|
|
-GETARCH_FLAGS := -DFORCE_NEHALEM
|
|
-endif
|
|
-ifeq ($(TARGET), BULLDOZER)
|
|
-GETARCH_FLAGS := -DFORCE_BARCELONA
|
|
-endif
|
|
-ifeq ($(TARGET), PILEDRIVER)
|
|
-GETARCH_FLAGS := -DFORCE_BARCELONA
|
|
-endif
|
|
-ifeq ($(TARGET), STEAMROLLER)
|
|
-GETARCH_FLAGS := -DFORCE_BARCELONA
|
|
-endif
|
|
-ifeq ($(TARGET), EXCAVATOR)
|
|
-GETARCH_FLAGS := -DFORCE_BARCELONA
|
|
-endif
|
|
-ifeq ($(TARGET), ZEN)
|
|
-GETARCH_FLAGS := -DFORCE_BARCELONA
|
|
-endif
|
|
-ifeq ($(TARGET), ARMV8)
|
|
-GETARCH_FLAGS := -DFORCE_ARMV7
|
|
-endif
|
|
-ifeq ($(TARGET), POWER8)
|
|
-GETARCH_FLAGS := -DFORCE_POWER6
|
|
-endif
|
|
-ifeq ($(TARGET), POWER9)
|
|
-GETARCH_FLAGS := -DFORCE_POWER6
|
|
-endif
|
|
-ifeq ($(TARGET), POWER10)
|
|
-GETARCH_FLAGS := -DFORCE_POWER6
|
|
-endif
|
|
-endif
|
|
-
|
|
-#TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1.
|
|
-#
|
|
-ifdef TARGET_CORE
|
|
-GETARCH_FLAGS := -DFORCE_$(TARGET_CORE)
|
|
-endif
|
|
-
|
|
-# Force fallbacks for 32bit
|
|
-
|
|
-ifeq ($(BINARY), 32)
|
|
-ifeq ($(TARGET_CORE), HASWELL)
|
|
-GETARCH_FLAGS := -DFORCE_NEHALEM
|
|
-endif
|
|
-ifeq ($(TARGET_CORE), SKYLAKEX)
|
|
-GETARCH_FLAGS := -DFORCE_NEHALEM
|
|
-endif
|
|
-ifeq ($(TARGET_CORE), COOPERLAKE)
|
|
-GETARCH_FLAGS := -DFORCE_NEHALEM
|
|
-endif
|
|
-ifeq ($(TARGET_CORE), SAPPHIRERAPIDS)
|
|
-GETARCH_FLAGS := -DFORCE_NEHALEM
|
|
-endif
|
|
-ifeq ($(TARGET_CORE), SANDYBRIDGE)
|
|
-GETARCH_FLAGS := -DFORCE_NEHALEM
|
|
-endif
|
|
-ifeq ($(TARGET_CORE), BULLDOZER)
|
|
-GETARCH_FLAGS := -DFORCE_BARCELONA
|
|
-endif
|
|
-ifeq ($(TARGET_CORE), PILEDRIVER)
|
|
-GETARCH_FLAGS := -DFORCE_BARCELONA
|
|
-endif
|
|
-ifeq ($(TARGET_CORE), STEAMROLLER)
|
|
-GETARCH_FLAGS := -DFORCE_BARCELONA
|
|
-endif
|
|
-ifeq ($(TARGET_CORE), EXCAVATOR)
|
|
-GETARCH_FLAGS := -DFORCE_BARCELONA
|
|
-endif
|
|
-ifeq ($(TARGET_CORE), ZEN)
|
|
-GETARCH_FLAGS := -DFORCE_BARCELONA
|
|
-endif
|
|
-endif
|
|
-
|
|
-
|
|
-# On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch.
|
|
-ifeq ($(HOSTARCH), x86_64)
|
|
-ifeq ($(findstring pgcc,$(HOSTCC))$(findstring nvc,$(HOSTCC)),)
|
|
-GETARCH_FLAGS += -march=native
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifdef INTERFACE64
|
|
-ifneq ($(INTERFACE64), 0)
|
|
-GETARCH_FLAGS += -DUSE64BITINT
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifndef GEMM_MULTITHREAD_THRESHOLD
|
|
-GEMM_MULTITHREAD_THRESHOLD=4
|
|
-endif
|
|
-GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD)
|
|
-
|
|
-ifeq ($(NO_AVX), 1)
|
|
-GETARCH_FLAGS += -DNO_AVX
|
|
-endif
|
|
-
|
|
-ifeq ($(BINARY), 32)
|
|
-GETARCH_FLAGS += -DNO_AVX -DNO_AVX2 -DNO_AVX512
|
|
-NO_AVX512 = 1
|
|
-endif
|
|
-
|
|
-ifeq ($(NO_AVX2), 1)
|
|
-GETARCH_FLAGS += -DNO_AVX2
|
|
-endif
|
|
-
|
|
-ifeq ($(NO_AVX512), 1)
|
|
-GETARCH_FLAGS += -DNO_AVX512
|
|
-endif
|
|
-
|
|
-ifeq ($(DEBUG), 1)
|
|
-GETARCH_FLAGS += -g
|
|
-endif
|
|
-
|
|
-ifeq ($(QUIET_MAKE), 1)
|
|
-MAKE += -s
|
|
-endif
|
|
-
|
|
-ifndef NO_PARALLEL_MAKE
|
|
-NO_PARALLEL_MAKE=0
|
|
-endif
|
|
-GETARCH_FLAGS += -DNO_PARALLEL_MAKE=$(NO_PARALLEL_MAKE)
|
|
-
|
|
-ifdef MAKE_NB_JOBS
|
|
-GETARCH_FLAGS += -DMAKE_NB_JOBS=$(MAKE_NB_JOBS)
|
|
-endif
|
|
-
|
|
-ifeq ($(HOSTCC), loongcc)
|
|
-GETARCH_FLAGS += -static
|
|
-endif
|
|
-
|
|
-#if don't use Fortran, it will only compile CBLAS.
|
|
-ifeq ($(ONLY_CBLAS), 1)
|
|
-NO_LAPACK = 1
|
|
-else
|
|
-ONLY_CBLAS = 0
|
|
-endif
|
|
-
|
|
-#For small matrix optimization
|
|
-ifeq ($(ARCH), x86_64)
|
|
-SMALL_MATRIX_OPT = 1
|
|
-else ifeq ($(ARCH), power)
|
|
-SMALL_MATRIX_OPT = 1
|
|
-BUILD_BFLOAT16 = 1
|
|
-endif
|
|
-ifeq ($(SMALL_MATRIX_OPT), 1)
|
|
-CCOMMON_OPT += -DSMALL_MATRIX_OPT
|
|
-endif
|
|
-
|
|
-# This operation is expensive, so execution should be once.
|
|
-ifndef GOTOBLAS_MAKEFILE
|
|
-export GOTOBLAS_MAKEFILE = 1
|
|
-
|
|
-# Generating Makefile.conf and config.h
|
|
-DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) DYNAMIC_ARCH=$(DYNAMIC_ARCH) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all)
|
|
-
|
|
-endif
|
|
-
|
|
-ifndef TARGET_CORE
|
|
--include $(TOPDIR)/Makefile.conf
|
|
-else
|
|
-HAVE_NEON=
|
|
-HAVE_VFP=
|
|
-HAVE_VFPV3=
|
|
-HAVE_VFPV4=
|
|
-HAVE_MMX=
|
|
-HAVE_SSE=
|
|
-HAVE_SSE2=
|
|
-HAVE_SSE3=
|
|
-HAVE_SSSE3=
|
|
-HAVE_SSE4_1=
|
|
-HAVE_SSE4_2=
|
|
-HAVE_SSE4A=
|
|
-HAVE_SSE5=
|
|
-HAVE_AVX=
|
|
-HAVE_AVX2=
|
|
-HAVE_FMA3=
|
|
-include $(TOPDIR)/Makefile_kernel.conf
|
|
-endif
|
|
-
|
|
-
|
|
-ifndef NUM_PARALLEL
|
|
-NUM_PARALLEL = 1
|
|
-endif
|
|
-
|
|
-ifndef NUM_THREADS
|
|
-NUM_THREADS = $(NUM_CORES)
|
|
-endif
|
|
-
|
|
-ifeq ($(NUM_THREADS), 1)
|
|
-override USE_THREAD = 0
|
|
-override USE_OPENMP = 0
|
|
-endif
|
|
-
|
|
-ifdef USE_THREAD
|
|
-ifeq ($(USE_THREAD), 0)
|
|
-SMP =
|
|
-else
|
|
-SMP = 1
|
|
-endif
|
|
-else
|
|
-ifeq ($(NUM_THREADS), 1)
|
|
-SMP =
|
|
-else
|
|
-SMP = 1
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifeq ($(SMP), 1)
|
|
-USE_LOCKING =
|
|
-endif
|
|
-
|
|
-ifndef NEED_PIC
|
|
-NEED_PIC = 1
|
|
-endif
|
|
-
|
|
-ARFLAGS =
|
|
-CPP = $(COMPILER) -E
|
|
-AR ?= $(CROSS_SUFFIX)ar
|
|
-AS ?= $(CROSS_SUFFIX)as
|
|
-LD ?= $(CROSS_SUFFIX)ld
|
|
-RANLIB ?= $(CROSS_SUFFIX)ranlib
|
|
-NM = $(CROSS_SUFFIX)nm
|
|
-DLLWRAP = $(CROSS_SUFFIX)dllwrap
|
|
-OBJCOPY = $(CROSS_SUFFIX)objcopy
|
|
-OBJCONV = $(CROSS_SUFFIX)objconv
|
|
-
|
|
-
|
|
-# When fortran support was either not detected or actively deselected, only build BLAS.
|
|
-ifeq ($(NOFORTRAN), 1)
|
|
-C_LAPACK = 1
|
|
-override FEXTRALIB =
|
|
-endif
|
|
-
|
|
-ifeq ($(C_COMPILER), GCC)
|
|
-GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
|
|
-GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
|
|
-GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
|
|
-GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7)
|
|
-GCCVERSIONGTEQ8 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 8)
|
|
-GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
|
|
-GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11)
|
|
-GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10)
|
|
-# Note that the behavior of -dumpversion is compile-time-configurable for
|
|
-# gcc-7.x and newer. Use -dumpfullversion there
|
|
-ifeq ($(GCCVERSIONGTEQ7),1)
|
|
- GCCDUMPVERSION_PARAM := -dumpfullversion
|
|
-else
|
|
- GCCDUMPVERSION_PARAM := -dumpversion
|
|
-endif
|
|
-GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 1)
|
|
-GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2)
|
|
-GCCMINORVERSIONGTEQ4 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 4)
|
|
-GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7)
|
|
-endif
|
|
-
|
|
-ifeq ($(C_COMPILER), CLANG)
|
|
-CLANGVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
|
|
-CLANGVERSIONGTEQ12 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 12)
|
|
-endif
|
|
-
|
|
-#
|
|
-# OS dependent settings
|
|
-#
|
|
-
|
|
-ifeq ($(OSNAME), Darwin)
|
|
-ifndef MACOSX_DEPLOYMENT_TARGET
|
|
-ifeq ($(ARCH), arm64)
|
|
-export MACOSX_DEPLOYMENT_TARGET=11.0
|
|
-ifeq ($(C_COMPILER), GCC)
|
|
-export NO_SVE = 1
|
|
-endif
|
|
-else
|
|
-export MACOSX_DEPLOYMENT_TARGET=10.8
|
|
-endif
|
|
-endif
|
|
-MD5SUM = md5 -r
|
|
-XCVER = $(shell pkgutil --pkg-info=com.apple.pkg.Xcode |awk '/version:/ {print $2}'|cut -d: -f2|cut -f1 -d.)
|
|
-ifeq (x$(XCVER)x,xx)
|
|
-XCVER = $(shell pkgutil --pkg-info=com.apple.pkg.CLTools_Executables |awk '/version:/ {print $2}'|cut -d: -f2|cut -f1 -d.)
|
|
-endif
|
|
-ifeq (x$(XCVER), x 15)
|
|
-CCOMMON_OPT += -Wl,-ld_classic
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifneq (,$(findstring $(OSNAME), FreeBSD OpenBSD DragonFly))
|
|
-MD5SUM = md5 -r
|
|
-endif
|
|
-
|
|
-ifeq ($(OSNAME), NetBSD)
|
|
-MD5SUM = md5 -n
|
|
-endif
|
|
-
|
|
-ifeq ($(OSNAME), Linux)
|
|
-EXTRALIB += -lm
|
|
-NO_EXPRECISION = 1
|
|
-endif
|
|
-
|
|
-ifeq ($(OSNAME), Android)
|
|
-EXTRALIB += -lm
|
|
-endif
|
|
-
|
|
-ifeq ($(OSNAME), AIX)
|
|
-EXTRALIB += -lm
|
|
-endif
|
|
-
|
|
-ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
|
|
-ifeq ($(ARCH), $(filter $(ARCH),arm arm64))
|
|
-EXTRALIB += -lm
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifeq ($(OSNAME), WINNT)
|
|
-NEED_PIC = 0
|
|
-NO_EXPRECISION = 1
|
|
-
|
|
-EXTRALIB += -defaultlib:advapi32
|
|
-
|
|
-SUFFIX = obj
|
|
-PSUFFIX = pobj
|
|
-LIBSUFFIX = a
|
|
-
|
|
-ifeq ($(C_COMPILER), CLANG)
|
|
-CCOMMON_OPT += -DMS_ABI
|
|
-endif
|
|
-
|
|
-#Version tests for supporting specific features (MS_ABI, POWER9 intrinsics)
|
|
-ifeq ($(GCCVERSIONGT4), 1)
|
|
-# GCC Major version > 4
|
|
-# It is compatible with MSVC ABI.
|
|
-CCOMMON_OPT += -DMS_ABI
|
|
-endif
|
|
-
|
|
-ifeq ($(GCCVERSIONGTEQ4), 1)
|
|
-ifeq ($(GCCMINORVERSIONGTEQ7), 1)
|
|
-# GCC Version >=4.7
|
|
-# It is compatible with MSVC ABI.
|
|
-CCOMMON_OPT += -DMS_ABI
|
|
-endif
|
|
-endif
|
|
-
|
|
-# Ensure the correct stack alignment on Win32
|
|
-# http://permalink.gmane.org/gmane.comp.lib.openblas.general/97
|
|
-ifeq ($(ARCH), x86)
|
|
-CCOMMON_OPT += -mincoming-stack-boundary=2
|
|
-FCOMMON_OPT += -mincoming-stack-boundary=2
|
|
-endif
|
|
-
|
|
-endif
|
|
-
|
|
-ifeq ($(OSNAME), Interix)
|
|
-NEED_PIC = 0
|
|
-NO_EXPRECISION = 1
|
|
-
|
|
-INTERIX_TOOL_DIR = /opt/gcc.3.3/i586-pc-interix3/bin
|
|
-endif
|
|
-
|
|
-ifeq ($(OSNAME), CYGWIN_NT)
|
|
-NEED_PIC = 0
|
|
-NO_EXPRECISION = 1
|
|
-OS_CYGWIN_NT = 1
|
|
-endif
|
|
-
|
|
-ifneq ($(OSNAME), WINNT)
|
|
-ifneq ($(OSNAME), CYGWIN_NT)
|
|
-ifneq ($(OSNAME), Interix)
|
|
-ifneq ($(OSNAME), Android)
|
|
-ifdef SMP
|
|
-EXTRALIB += -lpthread
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-
|
|
-# ifeq logical or
|
|
-ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT Interix))
|
|
-OS_WINDOWS=1
|
|
-endif
|
|
-
|
|
-ifdef QUAD_PRECISION
|
|
-CCOMMON_OPT += -DQUAD_PRECISION
|
|
-NO_EXPRECISION = 1
|
|
-endif
|
|
-
|
|
-ifneq ($(ARCH), x86)
|
|
-ifneq ($(ARCH), x86_64)
|
|
-NO_EXPRECISION = 1
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifdef UTEST_CHECK
|
|
-CCOMMON_OPT += -DUTEST_CHECK
|
|
-SANITY_CHECK = 1
|
|
-endif
|
|
-
|
|
-ifdef SANITY_CHECK
|
|
-CCOMMON_OPT += -DSANITY_CHECK -DREFNAME=$(*F)f$(BU)
|
|
-endif
|
|
-
|
|
-MAX_STACK_ALLOC ?= 2048
|
|
-ifneq ($(MAX_STACK_ALLOC), 0)
|
|
-CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC)
|
|
-endif
|
|
-
|
|
-ifdef USE_LOCKING
|
|
-ifneq ($(USE_LOCKING), 0)
|
|
-CCOMMON_OPT += -DUSE_LOCKING
|
|
-endif
|
|
-endif
|
|
-
|
|
-#
|
|
-# Architecture dependent settings
|
|
-#
|
|
-
|
|
-ifeq ($(ARCH), x86)
|
|
-ifndef BINARY
|
|
-NO_BINARY_MODE = 1
|
|
-endif
|
|
-
|
|
-ifeq ($(CORE), generic)
|
|
-NO_EXPRECISION = 1
|
|
-endif
|
|
-
|
|
-ifndef NO_EXPRECISION
|
|
-ifeq ($(F_COMPILER), GFORTRAN)
|
|
-# ifeq logical or. GCC or LSB
|
|
-ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB))
|
|
-EXPRECISION = 1
|
|
-CCOMMON_OPT += -DEXPRECISION -m128bit-long-double
|
|
-FCOMMON_OPT += -m128bit-long-double
|
|
-endif
|
|
-ifeq ($(C_COMPILER), CLANG)
|
|
-EXPRECISION = 1
|
|
-CCOMMON_OPT += -DEXPRECISION
|
|
-FCOMMON_OPT += -m128bit-long-double
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifeq ($(ARCH), x86_64)
|
|
-
|
|
-ifeq ($(CORE), generic)
|
|
-NO_EXPRECISION = 1
|
|
-endif
|
|
-
|
|
-ifndef NO_EXPRECISION
|
|
-ifeq ($(F_COMPILER), GFORTRAN)
|
|
-# ifeq logical or. GCC or LSB
|
|
-ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB))
|
|
-EXPRECISION = 1
|
|
-CCOMMON_OPT += -DEXPRECISION -m128bit-long-double
|
|
-FCOMMON_OPT += -m128bit-long-double
|
|
-endif
|
|
-ifeq ($(C_COMPILER), CLANG)
|
|
-EXPRECISION = 1
|
|
-CCOMMON_OPT += -DEXPRECISION
|
|
-FCOMMON_OPT += -m128bit-long-double
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifeq ($(C_COMPILER), INTEL)
|
|
-CCOMMON_OPT += -wd981
|
|
-endif
|
|
-
|
|
-
|
|
-ifeq ($(USE_OPENMP), 1)
|
|
-
|
|
-#check
|
|
-ifeq ($(USE_THREAD), 0)
|
|
-$(error OpenBLAS: Cannot set both USE_OPENMP=1 and USE_THREAD=0. The USE_THREAD=0 is only for building single thread version.)
|
|
-endif
|
|
-
|
|
-# ifeq logical or. GCC or LSB
|
|
-ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB))
|
|
-CCOMMON_OPT += -fopenmp
|
|
-endif
|
|
-
|
|
-ifeq ($(C_COMPILER), CLANG)
|
|
-CCOMMON_OPT += -fopenmp
|
|
-ifeq ($(F_COMPILER), GFORTRAN)
|
|
-FEXTRALIB := $(subst -lgomp,-lomp,$(FEXTRALIB))
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifeq ($(C_COMPILER), INTEL)
|
|
-CCOMMON_OPT += -fopenmp
|
|
-endif
|
|
-
|
|
-ifeq ($(C_COMPILER), PGI)
|
|
-CCOMMON_OPT += -mp
|
|
-endif
|
|
-
|
|
-ifeq ($(C_COMPILER), OPEN64)
|
|
-CCOMMON_OPT += -mp
|
|
-CEXTRALIB += -lstdc++
|
|
-endif
|
|
-
|
|
-ifeq ($(C_COMPILER), PATHSCALE)
|
|
-CCOMMON_OPT += -mp
|
|
-endif
|
|
-endif
|
|
-
|
|
-
|
|
-ifeq ($(DYNAMIC_ARCH), 1)
|
|
-ifeq ($(ARCH), x86)
|
|
-DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
|
|
- CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
|
|
-endif
|
|
-
|
|
-ifeq ($(ARCH), x86_64)
|
|
-DYNAMIC_CORE = PRESCOTT CORE2
|
|
-ifeq ($(DYNAMIC_OLDER), 1)
|
|
-DYNAMIC_CORE += PENRYN DUNNINGTON
|
|
-endif
|
|
-DYNAMIC_CORE += NEHALEM
|
|
-ifeq ($(DYNAMIC_OLDER), 1)
|
|
-DYNAMIC_CORE += OPTERON OPTERON_SSE3
|
|
-endif
|
|
-DYNAMIC_CORE += BARCELONA
|
|
-ifeq ($(DYNAMIC_OLDER), 1)
|
|
-DYNAMIC_CORE += BOBCAT ATOM NANO
|
|
-endif
|
|
-ifneq ($(NO_AVX), 1)
|
|
-DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR
|
|
-endif
|
|
-ifneq ($(NO_AVX2), 1)
|
|
-DYNAMIC_CORE += HASWELL ZEN
|
|
-endif
|
|
-ifneq ($(NO_AVX512), 1)
|
|
-ifneq ($(NO_AVX2), 1)
|
|
-DYNAMIC_CORE += SKYLAKEX COOPERLAKE SAPPHIRERAPIDS
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifdef DYNAMIC_LIST
|
|
-override DYNAMIC_CORE = PRESCOTT $(DYNAMIC_LIST)
|
|
-XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_PRESCOTT
|
|
-XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore))
|
|
-CCOMMON_OPT += $(XCCOMMON_OPT)
|
|
-#CCOMMON_OPT += -DDYNAMIC_LIST='$(DYNAMIC_LIST)'
|
|
-endif
|
|
-
|
|
-ifeq ($(ARCH), arm64)
|
|
-DYNAMIC_CORE = ARMV8
|
|
-DYNAMIC_CORE += CORTEXA53
|
|
-DYNAMIC_CORE += CORTEXA57
|
|
-DYNAMIC_CORE += CORTEXA72
|
|
-DYNAMIC_CORE += CORTEXA73
|
|
-DYNAMIC_CORE += NEOVERSEN1
|
|
-ifneq ($(NO_SVE), 1)
|
|
-DYNAMIC_CORE += NEOVERSEV1
|
|
-DYNAMIC_CORE += NEOVERSEN2
|
|
-DYNAMIC_CORE += ARMV8SVE
|
|
-endif
|
|
-DYNAMIC_CORE += CORTEXA55
|
|
-DYNAMIC_CORE += FALKOR
|
|
-DYNAMIC_CORE += THUNDERX
|
|
-DYNAMIC_CORE += THUNDERX2T99
|
|
-DYNAMIC_CORE += TSV110
|
|
-DYNAMIC_CORE += EMAG8180
|
|
-DYNAMIC_CORE += THUNDERX3T110
|
|
-ifdef DYNAMIC_LIST
|
|
-override DYNAMIC_CORE = ARMV8 $(DYNAMIC_LIST)
|
|
-XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_ARMV8
|
|
-XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore))
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifeq ($(ARCH), mips64)
|
|
-DYNAMIC_CORE = LOONGSON3R3 LOONGSON3R4 MIPS64_GENERIC
|
|
-ifdef DYNAMIC_LIST
|
|
-override DYNAMIC_CORE = MIPS64_GENERIC $(DYNAMIC_LIST)
|
|
-XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_MIPS64_GENERIC
|
|
-XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore))
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifeq ($(ARCH), loongarch64)
|
|
-DYNAMIC_CORE = LOONGSON3R5 LOONGSON2K1000 LOONGSONGENERIC
|
|
-endif
|
|
-
|
|
-ifeq ($(ARCH), zarch)
|
|
-DYNAMIC_CORE = ZARCH_GENERIC
|
|
-
|
|
-# if the compiler accepts -march=arch11 or -march=z13 and can compile a file
|
|
-# with z13-specific inline assembly, then we can include support for Z13.
|
|
-# note: -march=z13 is equivalent to -march=arch11 yet some compiler releases
|
|
-# only support one or the other.
|
|
-# note: LLVM version 6.x supported -march=z13 yet could not handle vector
|
|
-# registers in inline assembly, so the check for supporting the -march flag is
|
|
-# not enough.
|
|
-ZARCH_TEST_COMPILE=-c $(TOPDIR)/kernel/zarch/damin_z13.c -I$(TOPDIR) -o /dev/null > /dev/null 2> /dev/null
|
|
-ZARCH_CC_SUPPORTS_ARCH11=$(shell $(CC) -march=arch11 $(ZARCH_TEST_COMPILE) && echo 1)
|
|
-ZARCH_CC_SUPPORTS_Z13=$(shell $(CC) -march=z13 $(ZARCH_TEST_COMPILE) && echo 1)
|
|
-
|
|
-ifeq ($(or $(ZARCH_CC_SUPPORTS_ARCH11), $(ZARCH_CC_SUPPORTS_Z13)), 1)
|
|
-DYNAMIC_CORE += Z13
|
|
-CCOMMON_OPT += -DDYN_Z13
|
|
-else
|
|
-$(info OpenBLAS: Not building Z13 kernels because the compiler $(CC) does not support it)
|
|
-endif
|
|
-
|
|
-# as above for z13, check for -march=arch12 and z14 support in the compiler.
|
|
-ZARCH_CC_SUPPORTS_ARCH12=$(shell $(CC) -march=arch12 $(ZARCH_TEST_COMPILE) && echo 1)
|
|
-ZARCH_CC_SUPPORTS_Z14=$(shell $(CC) -march=z14 $(ZARCH_TEST_COMPILE) && echo 1)
|
|
-ifeq ($(or $(ZARCH_CC_SUPPORTS_ARCH12), $(ZARCH_CC_SUPPORTS_Z14)), 1)
|
|
-DYNAMIC_CORE += Z14
|
|
-CCOMMON_OPT += -DDYN_Z14
|
|
-else
|
|
-$(info OpenBLAS: Not building Z14 kernels because the compiler $(CC) does not support it)
|
|
-endif
|
|
-
|
|
-endif # ARCH zarch
|
|
-
|
|
-ifeq ($(ARCH), power)
|
|
-ifneq ($(C_COMPILER), PGI)
|
|
-DYNAMIC_CORE = POWER6
|
|
-DYNAMIC_CORE += POWER8
|
|
-ifneq ($(C_COMPILER), GCC)
|
|
-DYNAMIC_CORE += POWER9
|
|
-DYNAMIC_CORE += POWER10
|
|
-CCOMMON_OPT += -DHAVE_P10_SUPPORT
|
|
-endif
|
|
-ifeq ($(C_COMPILER), GCC)
|
|
-ifeq ($(GCCVERSIONGT5), 1)
|
|
-DYNAMIC_CORE += POWER9
|
|
-else
|
|
-$(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.)
|
|
-endif
|
|
-ifeq ($(OSNAME), AIX)
|
|
-LDVERSIONGTEQ35 := 1
|
|
-else
|
|
-LDVERSIONGTEQ35 := $(shell expr `$(CC) -Wl,--version 2> /dev/null | head -1 | cut -f2 -d "." | cut -f1 -d "-"` \>= 35)
|
|
-endif
|
|
-ifeq ($(GCCVERSIONGTEQ11)$(LDVERSIONGTEQ35), 11)
|
|
-DYNAMIC_CORE += POWER10
|
|
-CCOMMON_OPT += -DHAVE_P10_SUPPORT
|
|
-else ifeq ($(GCCVERSIONGTEQ10), 1)
|
|
-ifeq ($(GCCMINORVERSIONGTEQ2)$(LDVERSIONGTEQ35), 11)
|
|
-DYNAMIC_CORE += POWER10
|
|
-CCOMMON_OPT += -DHAVE_P10_SUPPORT
|
|
-endif
|
|
-else
|
|
-$(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.)
|
|
-endif
|
|
-endif
|
|
-else
|
|
-DYNAMIC_CORE = POWER8
|
|
-DYNAMIC_CORE += POWER9
|
|
-endif
|
|
-endif
|
|
-
|
|
-# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
|
|
-ifndef DYNAMIC_CORE
|
|
-override DYNAMIC_ARCH=
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifeq ($(ARCH), ia64)
|
|
-NO_BINARY_MODE = 1
|
|
-BINARY_DEFINED = 1
|
|
-
|
|
-ifeq ($(F_COMPILER), GFORTRAN)
|
|
-ifeq ($(C_COMPILER), GCC)
|
|
-# EXPRECISION = 1
|
|
-# CCOMMON_OPT += -DEXPRECISION
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifeq ($(ARCH), $(filter $(ARCH),mips64 mips))
|
|
-NO_BINARY_MODE = 1
|
|
-endif
|
|
-
|
|
-ifeq ($(ARCH), alpha)
|
|
-NO_BINARY_MODE = 1
|
|
-BINARY_DEFINED = 1
|
|
-endif
|
|
-
|
|
-ifeq ($(ARCH), arm)
|
|
-NO_BINARY_MODE = 1
|
|
-BINARY_DEFINED = 1
|
|
-
|
|
-CCOMMON_OPT += -marm
|
|
-FCOMMON_OPT += -marm
|
|
-
|
|
-# If softfp abi is mentioned on the command line, force it.
|
|
-ifeq ($(ARM_SOFTFP_ABI), 1)
|
|
-CCOMMON_OPT += -mfloat-abi=softfp
|
|
-FCOMMON_OPT += -mfloat-abi=softfp
|
|
-endif
|
|
-
|
|
-ifeq ($(OSNAME), Android)
|
|
-ifeq ($(ARM_SOFTFP_ABI), 1)
|
|
-EXTRALIB += -lm
|
|
-else
|
|
-EXTRALIB += -Wl,-lm_hard
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifeq ($(ARCH), arm64)
|
|
-NO_BINARY_MODE = 1
|
|
-BINARY_DEFINED = 1
|
|
-ifdef INTERFACE64
|
|
-ifneq ($(INTERFACE64), 0)
|
|
-ifeq ($(F_COMPILER), GFORTRAN)
|
|
-FCOMMON_OPT += -fdefault-integer-8
|
|
-endif
|
|
-ifeq ($(F_COMPILER), FLANG)
|
|
-FCOMMON_OPT += -i8
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifeq ($(ARCH), riscv64)
|
|
-NO_BINARY_MODE = 1
|
|
-BINARY_DEFINED = 1
|
|
-ifdef INTERFACE64
|
|
-ifneq ($(INTERFACE64), 0)
|
|
-ifeq ($(F_COMPILER), GFORTRAN)
|
|
-FCOMMON_OPT += -fdefault-integer-8
|
|
-endif
|
|
-ifeq ($(F_COMPILER), FLANG)
|
|
-FCOMMON_OPT += -i8
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifeq ($(ARCH), loongarch64)
|
|
-NO_BINARY_MODE = 1
|
|
-BINARY_DEFINED = 1
|
|
-ifdef INTERFACE64
|
|
-ifneq ($(INTERFACE64), 0)
|
|
-ifeq ($(F_COMPILER), GFORTRAN)
|
|
-FCOMMON_OPT += -fdefault-integer-8
|
|
-endif
|
|
-ifeq ($(F_COMPILER), FLANG)
|
|
-FCOMMON_OPT += -i8
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-
|
|
-#
|
|
-# C Compiler dependent settings
|
|
-#
|
|
-
|
|
-
|
|
-# ifeq logical or. GCC or CLANG or LSB
|
|
-# http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or
|
|
-ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC CLANG LSB))
|
|
-CCOMMON_OPT += -Wall
|
|
-COMMON_PROF += -fno-inline
|
|
-NO_UNINITIALIZED_WARN = -Wno-uninitialized
|
|
-
|
|
-ifeq ($(QUIET_MAKE), 1)
|
|
-CCOMMON_OPT += $(NO_UNINITIALIZED_WARN) -Wno-unused
|
|
-endif
|
|
-
|
|
-ifdef NO_BINARY_MODE
|
|
-
|
|
-ifeq ($(ARCH), $(filter $(ARCH),mips64))
|
|
-ifdef BINARY64
|
|
-CCOMMON_OPT += -mabi=64
|
|
-else
|
|
-CCOMMON_OPT += -mabi=n32
|
|
-endif
|
|
-BINARY_DEFINED = 1
|
|
-else ifeq ($(ARCH), $(filter $(ARCH),mips))
|
|
-CCOMMON_OPT += -mabi=32
|
|
-BINARY_DEFINED = 1
|
|
-endif
|
|
-
|
|
-ifneq (, $(filter $(CORE), MIPS64_GENERIC))
|
|
-CCOMMON_OPT += -DNO_MSA
|
|
-FCOMMON_OPT += -DNO_MSA
|
|
-endif
|
|
-
|
|
-ifneq (, $(filter $(CORE),LOONGSON3R3 LOONGSON3R4))
|
|
-CCOMMON_OPT += -march=loongson3a
|
|
-FCOMMON_OPT += -march=loongson3a
|
|
-endif
|
|
-
|
|
-ifeq ($(CORE), MIPS24K)
|
|
-CCOMMON_OPT += -mips32r2 -mtune=24kc $(MSA_FLAGS)
|
|
-FCOMMON_OPT += -mips32r2 -mtune=24kc $(MSA_FLAGS)
|
|
-endif
|
|
-
|
|
-ifeq ($(CORE), MIPS1004K)
|
|
-CCOMMON_OPT += -mips32r2 $(MSA_FLAGS)
|
|
-FCOMMON_OPT += -mips32r2 $(MSA_FLAGS)
|
|
-endif
|
|
-
|
|
-ifeq ($(CORE), P5600)
|
|
-CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
|
|
-FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
|
|
-endif
|
|
-
|
|
-ifeq ($(CORE), I6400)
|
|
-CCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS)
|
|
-FCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS)
|
|
-endif
|
|
-
|
|
-ifeq ($(CORE), P6600)
|
|
-CCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=p6600 $(MSA_FLAGS)
|
|
-FCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=p6600 $(MSA_FLAGS)
|
|
-endif
|
|
-
|
|
-ifeq ($(CORE), I6500)
|
|
-CCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS)
|
|
-FCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS)
|
|
-endif
|
|
-
|
|
-ifeq ($(OSNAME), AIX)
|
|
-BINARY_DEFINED = 1
|
|
-endif
|
|
-
|
|
-ifeq ($(ARCH), loongarch64)
|
|
-LA64_ABI=$(shell $(CC) -mabi=lp64d -c $(TOPDIR)/cpuid_loongarch64.c -o /dev/null > /dev/null 2> /dev/null && echo lp64d)
|
|
-ifneq ($(LA64_ABI), lp64d)
|
|
-LA64_ABI=lp64
|
|
-endif
|
|
-CCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI)
|
|
-FCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI)
|
|
-endif
|
|
-
|
|
-endif
|
|
-
|
|
-ifndef BINARY_DEFINED
|
|
-ifneq ($(OSNAME), AIX)
|
|
-ifdef BINARY64
|
|
-ifneq ($(ARCH), riscv64)
|
|
-CCOMMON_OPT += -m64
|
|
-endif
|
|
-else
|
|
-CCOMMON_OPT += -m32
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-
|
|
-endif
|
|
-
|
|
-ifeq ($(C_COMPILER), PGI)
|
|
-PGCVERSIONGT20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 20)
|
|
-PGCVERSIONEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` == 20)
|
|
-PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |cut -d "-" -f 1 |sed -e "s/[^0-9.]//g" |cut -c 4-5` \>= 11)
|
|
-PGCVERSIONCHECK := $(PGCVERSIONGT20)$(PGCVERSIONEQ20)$(PGCMINORVERSIONGE11)
|
|
-ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 100 101 011))
|
|
-NEWPGI := 1
|
|
-PGCVERSIONGT21 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 21)
|
|
-PGCVERSIONEQ21 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` == 21)
|
|
-PGCVERSIONCHECK2 := $(PGCVERSIONGT21)$(PGCVERSIONEQ21)$(PGCMINORVERSIONGE11)
|
|
-ifeq ($(PGCVERSIONCHECK2), $(filter $(PGCVERSIONCHECK2), 100 101 011))
|
|
-NEWPGI2 := 1
|
|
-endif
|
|
-endif
|
|
-ifdef BINARY64
|
|
-ifeq ($(ARCH), x86_64)
|
|
-ifeq (,$(findstring tp,$(CFLAGS)))
|
|
-ifneq ($(NEWPGI2),1)
|
|
-CCOMMON_OPT += -tp p7-64
|
|
-else
|
|
-CCOMMON_OPT += -tp px
|
|
-endif
|
|
-endif
|
|
-ifneq ($(NEWPGI),1)
|
|
-CCOMMON_OPT += -D__MMX__ -Mnollvm
|
|
-endif
|
|
-else
|
|
-ifeq ($(ARCH), power)
|
|
-ifeq (,$(findstring tp,$(CFLAGS)))
|
|
-ifeq ($(CORE), POWER8)
|
|
-CCOMMON_OPT += -tp pwr8
|
|
-endif
|
|
-ifeq ($(CORE), POWER9)
|
|
-CCOMMON_OPT += -tp pwr9
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-else
|
|
-ifneq ($(NEWPGI2),1)
|
|
-ifeq (,$(findstring tp,$(CFLAGS)))
|
|
-CCOMMON_OPT += -tp p7
|
|
-else
|
|
-CCOMMON_OPT += -tp px
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifeq ($(C_COMPILER), PATHSCALE)
|
|
-ifdef BINARY64
|
|
-CCOMMON_OPT += -m64
|
|
-else
|
|
-CCOMMON_OPT += -m32
|
|
-endif
|
|
-endif
|
|
-
|
|
-#
|
|
-# Fortran Compiler dependent settings
|
|
-#
|
|
-
|
|
-ifeq ($(F_COMPILER), NAG)
|
|
-FCOMMON_OPT += -dcfuns -recursive -ieee=full -w=obs -thread_safe
|
|
-ifdef INTERFACE64
|
|
-ifneq ($(INTERFACE64), 0)
|
|
-FCOMMON_OPT += -i8
|
|
-endif
|
|
-endif
|
|
-ifeq ($(USE_OPENMP), 1)
|
|
-FCOMMON_OPT += -openmp
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifeq ($(F_COMPILER), FLANG)
|
|
-CCOMMON_OPT += -DF_INTERFACE_FLANG
|
|
-FCOMMON_OPT += -Mrecursive -Kieee
|
|
-ifeq ($(OSNAME), Linux)
|
|
-ifeq ($(ARCH), x86_64)
|
|
-FLANG_VENDOR := $(shell $(FC) --version|head -1 |cut -f 1 -d " ")
|
|
-ifeq ($(FLANG_VENDOR), AMD)
|
|
-FCOMMON_OPT += -fno-unroll-loops
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-ifdef BINARY64
|
|
-ifdef INTERFACE64
|
|
-ifneq ($(INTERFACE64), 0)
|
|
-FCOMMON_OPT += -i8
|
|
-endif
|
|
-endif
|
|
-FCOMMON_OPT += -Wall
|
|
-else
|
|
-FCOMMON_OPT += -Wall
|
|
-endif
|
|
-ifeq ($(USE_OPENMP), 1)
|
|
-FCOMMON_OPT += -fopenmp
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifeq ($(F_COMPILER), G77)
|
|
-CCOMMON_OPT += -DF_INTERFACE_G77
|
|
-FCOMMON_OPT += -Wall
|
|
-ifndef NO_BINARY_MODE
|
|
-ifneq ($(OSNAME), AIX)
|
|
-ifdef BINARY64
|
|
-FCOMMON_OPT += -m64
|
|
-else
|
|
-FCOMMON_OPT += -m32
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifeq ($(F_COMPILER), G95)
|
|
-CCOMMON_OPT += -DF_INTERFACE_G95
|
|
-FCOMMON_OPT += -Wall
|
|
-ifneq ($(OSNAME), AIX)
|
|
-ifndef NO_BINARY_MODE
|
|
-ifdef BINARY64
|
|
-FCOMMON_OPT += -m64
|
|
-else
|
|
-FCOMMON_OPT += -m32
|
|
-endif
|
|
-endif
|
|
-ifneq ($(NO_LAPACKE), 1)
|
|
-FCOMMON_OPT += -fno-second-underscore
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifeq ($(F_COMPILER), $(filter $(F_COMPILER),GFORTRAN FLANGNEW))
|
|
-CCOMMON_OPT += -DF_INTERFACE_GFORT
|
|
-ifeq ($(F_COMPILER), GFORTRAN)
|
|
-FCOMMON_OPT += -Wall
|
|
-# make single-threaded LAPACK calls thread-safe #1847
|
|
-FCOMMON_OPT += -frecursive
|
|
-# work around ABI problem with passing single-character arguments
|
|
-FCOMMON_OPT += -fno-optimize-sibling-calls
|
|
-#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
|
|
-ifneq ($(NOFORTRAN), 1)
|
|
-ifneq ($(NOFORTRAN), 2)
|
|
-ifneq ($(NO_LAPACK), 1)
|
|
-EXTRALIB += -lgfortran
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-ifdef NO_BINARY_MODE
|
|
-ifeq ($(ARCH), $(filter $(ARCH),mips64))
|
|
-ifdef BINARY64
|
|
-FCOMMON_OPT += -mabi=64
|
|
-else
|
|
-FCOMMON_OPT += -mabi=n32
|
|
-endif
|
|
-else ifeq ($(ARCH), $(filter $(ARCH),mips))
|
|
-FCOMMON_OPT += -mabi=32
|
|
-endif
|
|
-else
|
|
-ifdef BINARY64
|
|
-ifneq ($(OSNAME), AIX)
|
|
-ifneq ($(ARCH), riscv64)
|
|
-FCOMMON_OPT += -m64
|
|
-endif
|
|
-endif
|
|
-ifdef INTERFACE64
|
|
-ifneq ($(INTERFACE64), 0)
|
|
-FCOMMON_OPT += -fdefault-integer-8
|
|
-endif
|
|
-endif
|
|
-else
|
|
-ifneq ($(OSNAME), AIX)
|
|
-FCOMMON_OPT += -m32
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-ifeq ($(USE_OPENMP), 1)
|
|
-FCOMMON_OPT += -fopenmp
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifeq ($(F_COMPILER), INTEL)
|
|
-CCOMMON_OPT += -DF_INTERFACE_INTEL
|
|
-ifdef INTERFACE64
|
|
-ifneq ($(INTERFACE64), 0)
|
|
-FCOMMON_OPT += -i8
|
|
-endif
|
|
-endif
|
|
-FCOMMON_OPT += -recursive -fp-model strict -assume protect-parens
|
|
-ifeq ($(USE_OPENMP), 1)
|
|
-FCOMMON_OPT += -fopenmp
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifeq ($(F_COMPILER), FUJITSU)
|
|
-CCOMMON_OPT += -DF_INTERFACE_FUJITSU
|
|
-ifeq ($(USE_OPENMP), 1)
|
|
-FCOMMON_OPT += -openmp
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifeq ($(F_COMPILER), IBM)
|
|
-CCOMMON_OPT += -DF_INTERFACE_IBM
|
|
-FEXTRALIB += -lxlf90
|
|
-ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC CLANG))
|
|
-FCOMMON_OPT += -qextname
|
|
-endif
|
|
-# FCOMMON_OPT += -qarch=440
|
|
-ifdef BINARY64
|
|
-FCOMMON_OPT += -q64
|
|
-ifdef INTERFACE64
|
|
-ifneq ($(INTERFACE64), 0)
|
|
-FCOMMON_OPT += -qintsize=8
|
|
-endif
|
|
-endif
|
|
-else
|
|
-FCOMMON_OPT += -q32
|
|
-endif
|
|
-ifeq ($(USE_OPENMP), 1)
|
|
-FCOMMON_OPT += -openmp
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifeq ($(F_COMPILER), PGI)
|
|
-CCOMMON_OPT += -DF_INTERFACE_PGI
|
|
-COMMON_PROF += -DPGICOMPILER
|
|
-ifdef BINARY64
|
|
-ifdef INTERFACE64
|
|
-ifneq ($(INTERFACE64), 0)
|
|
-FCOMMON_OPT += -i8
|
|
-endif
|
|
-endif
|
|
-ifeq ($(ARCH), x86_64)
|
|
-ifneq ($(NEWPGI2),1)
|
|
-FCOMMON_OPT += -tp p7-64
|
|
-else
|
|
-FCOMMON_OPT += -tp px
|
|
-endif
|
|
-else
|
|
-ifeq ($(ARCH), power)
|
|
-ifeq ($(CORE), POWER6)
|
|
-$(warning NVIDIA HPC compilers do not support POWER6.)
|
|
-endif
|
|
-ifeq ($(CORE), POWER8)
|
|
-FCOMMON_OPT += -tp pwr8
|
|
-endif
|
|
-ifeq ($(CORE), POWER9)
|
|
-FCOMMON_OPT += -tp pwr9
|
|
-endif
|
|
-ifeq ($(CORE), POWER10)
|
|
-$(warning NVIDIA HPC compilers do not support POWER10.)
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-else
|
|
-FCOMMON_OPT += -tp p7
|
|
-endif
|
|
-FCOMMON_OPT += -Mrecursive -Kieee
|
|
-ifeq ($(USE_OPENMP), 1)
|
|
-FCOMMON_OPT += -mp
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifeq ($(F_COMPILER), PATHSCALE)
|
|
-CCOMMON_OPT += -DF_INTERFACE_PATHSCALE
|
|
-ifdef BINARY64
|
|
-ifdef INTERFACE64
|
|
-ifneq ($(INTERFACE64), 0)
|
|
-FCOMMON_OPT += -i8
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifeq ($(USE_OPENMP), 1)
|
|
-FCOMMON_OPT += -mp
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifeq ($(F_COMPILER), OPEN64)
|
|
-CCOMMON_OPT += -DF_INTERFACE_OPEN64
|
|
-ifdef BINARY64
|
|
-ifdef INTERFACE64
|
|
-ifneq ($(INTERFACE64), 0)
|
|
-FCOMMON_OPT += -i8
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-ifeq ($(ARCH), $(filter $(ARCH),mips64 mips))
|
|
-ifndef BINARY64
|
|
-FCOMMON_OPT += -n32
|
|
-else
|
|
-FCOMMON_OPT += -n64
|
|
-endif
|
|
-ifeq ($(CORE), LOONGSON3R3)
|
|
-FCOMMON_OPT += -loongson3 -static
|
|
-endif
|
|
-ifeq ($(CORE), LOONGSON3R4)
|
|
-FCOMMON_OPT += -loongson3 -static
|
|
-endif
|
|
-else
|
|
-ifndef BINARY64
|
|
-FCOMMON_OPT += -m32
|
|
-else
|
|
-FCOMMON_OPT += -m64
|
|
-endif
|
|
-endif
|
|
-ifeq ($(USE_OPENMP), 1)
|
|
-FEXTRALIB += -lstdc++
|
|
-FCOMMON_OPT += -mp
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifeq ($(C_COMPILER), OPEN64)
|
|
-ifeq ($(ARCH), $(filter $(ARCH),mips64 mips))
|
|
-ifndef BINARY64
|
|
-CCOMMON_OPT += -n32
|
|
-else
|
|
-CCOMMON_OPT += -n64
|
|
-endif
|
|
-ifeq ($(CORE), LOONGSON3R3)
|
|
-CCOMMON_OPT += -loongson3 -static
|
|
-endif
|
|
-ifeq ($(CORE), LOONGSON3R4)
|
|
-CCOMMON_OPT += -loongson3 -static
|
|
-endif
|
|
-else
|
|
-ifndef BINARY64
|
|
-CCOMMON_OPT += -m32
|
|
-else
|
|
-CCOMMON_OPT += -m64
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifeq ($(C_COMPILER), SUN)
|
|
-CCOMMON_OPT += -w
|
|
-ifeq ($(ARCH), x86)
|
|
-CCOMMON_OPT += -m32
|
|
-else
|
|
-ifdef BINARY64
|
|
-CCOMMON_OPT += -m64
|
|
-else
|
|
-CCOMMON_OPT += -m32
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifeq ($(F_COMPILER), SUN)
|
|
-CCOMMON_OPT += -DF_INTERFACE_SUN
|
|
-FCOMMON_OPT += -ftrap=%none -xrecursive
|
|
-ifeq ($(ARCH), x86)
|
|
-FCOMMON_OPT += -m32
|
|
-else
|
|
-ifdef BINARY64
|
|
-FCOMMON_OPT += -m64
|
|
-else
|
|
-FCOMMON_OPT += -m32
|
|
-endif
|
|
-endif
|
|
-ifeq ($(USE_OPENMP), 1)
|
|
-FCOMMON_OPT += -xopenmp=parallel
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifeq ($(F_COMPILER), COMPAQ)
|
|
-CCOMMON_OPT += -DF_INTERFACE_COMPAQ
|
|
-ifeq ($(USE_OPENMP), 1)
|
|
-FCOMMON_OPT += -openmp
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifeq ($(F_COMPILER), CRAY)
|
|
-CCOMMON_OPT += -DF_INTERFACE_INTEL
|
|
-FCOMMON_OPT += -hnopattern
|
|
-ifdef INTERFACE64
|
|
-ifneq ($(INTERFACE64), 0)
|
|
-FCOMMON_OPT += -s integer64
|
|
-endif
|
|
-endif
|
|
-ifneq ($(USE_OPENMP), 1)
|
|
-FCOMMON_OPT += -O noomp
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifdef BINARY64
|
|
-ifdef INTERFACE64
|
|
-ifneq ($(INTERFACE64), 0)
|
|
-CCOMMON_OPT +=
|
|
-#-DUSE64BITINT
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifeq ($(NEED_PIC), 1)
|
|
-ifeq ($(C_COMPILER), IBM)
|
|
-CCOMMON_OPT += -qpic=large
|
|
-else
|
|
-CCOMMON_OPT += -fPIC
|
|
-endif
|
|
-ifeq ($(F_COMPILER), SUN)
|
|
-FCOMMON_OPT += -pic
|
|
-else ifeq ($(F_COMPILER), NAG)
|
|
-FCOMMON_OPT += -PIC
|
|
-else ifeq ($(F_COMPILER), IBM)
|
|
-FCOMMON_OPT += -qpic=large
|
|
-else
|
|
-FCOMMON_OPT += -fPIC
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifeq ($(DYNAMIC_ARCH), 1)
|
|
-CCOMMON_OPT += -DDYNAMIC_ARCH
|
|
-endif
|
|
-
|
|
-ifeq ($(DYNAMIC_OLDER), 1)
|
|
-CCOMMON_OPT += -DDYNAMIC_OLDER
|
|
-endif
|
|
-
|
|
-ifeq ($(C_LAPACK), 1)
|
|
-CCOMMON_OPT += -DC_LAPACK
|
|
-endif
|
|
-
|
|
-ifeq ($(NO_LAPACK), 1)
|
|
-CCOMMON_OPT += -DNO_LAPACK
|
|
-#Disable LAPACK C interface
|
|
-NO_LAPACKE = 1
|
|
-endif
|
|
-
|
|
-ifeq ($(NO_LAPACKE), 1)
|
|
-CCOMMON_OPT += -DNO_LAPACKE
|
|
-endif
|
|
-
|
|
-ifeq ($(NO_AVX), 1)
|
|
-CCOMMON_OPT += -DNO_AVX
|
|
-endif
|
|
-
|
|
-ifeq ($(ARCH), x86)
|
|
-CCOMMON_OPT += -DNO_AVX
|
|
-endif
|
|
-
|
|
-ifeq ($(NO_AVX2), 1)
|
|
-CCOMMON_OPT += -DNO_AVX2
|
|
-endif
|
|
-
|
|
-ifeq ($(NO_AVX512), 1)
|
|
-CCOMMON_OPT += -DNO_AVX512
|
|
-endif
|
|
-
|
|
-ifeq ($(NO_SVE), 1)
|
|
-CCOMMON_OPT += -DNO_SVE
|
|
-endif
|
|
-
|
|
-ifdef SMP
|
|
-CCOMMON_OPT += -DSMP_SERVER
|
|
-
|
|
-ifeq ($(ARCH), mips64)
|
|
-USE_SIMPLE_THREADED_LEVEL3 = 1
|
|
-endif
|
|
-
|
|
-ifeq ($(USE_OPENMP), 1)
|
|
-# USE_SIMPLE_THREADED_LEVEL3 = 1
|
|
-# NO_AFFINITY = 1
|
|
-CCOMMON_OPT += -DUSE_OPENMP
|
|
-endif
|
|
-
|
|
-ifeq ($(BIGNUMA), 1)
|
|
-CCOMMON_OPT += -DBIGNUMA
|
|
-endif
|
|
-
|
|
-endif
|
|
-
|
|
-ifeq ($(NO_WARMUP), 1)
|
|
-CCOMMON_OPT += -DNO_WARMUP
|
|
-endif
|
|
-
|
|
-ifeq ($(CONSISTENT_FPCSR), 1)
|
|
-CCOMMON_OPT += -DCONSISTENT_FPCSR
|
|
-endif
|
|
-
|
|
-# Only for development
|
|
-# CCOMMON_OPT += -DPARAMTEST
|
|
-# CCOMMON_OPT += -DPREFETCHTEST
|
|
-# CCOMMON_OPT += -DNO_SWITCHING
|
|
-# USE_PAPI = 1
|
|
-
|
|
-ifdef USE_PAPI
|
|
-CCOMMON_OPT += -DUSE_PAPI
|
|
-EXTRALIB += -lpapi -lperfctr
|
|
-endif
|
|
-
|
|
-ifdef BUFFERSIZE
|
|
-CCOMMON_OPT += -DBUFFERSIZE=$(BUFFERSIZE)
|
|
-endif
|
|
-
|
|
-ifdef DYNAMIC_THREADS
|
|
-CCOMMON_OPT += -DDYNAMIC_THREADS
|
|
-endif
|
|
-
|
|
-CCOMMON_OPT += -DMAX_CPU_NUMBER=$(NUM_THREADS)
|
|
-
|
|
-CCOMMON_OPT += -DMAX_PARALLEL_NUMBER=$(NUM_PARALLEL)
|
|
-
|
|
-ifdef USE_SIMPLE_THREADED_LEVEL3
|
|
-CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
|
|
-endif
|
|
-
|
|
-ifeq ($(USE_TLS), 1)
|
|
-CCOMMON_OPT += -DUSE_TLS
|
|
-endif
|
|
-
|
|
-ifeq ($(BUILD_BFLOAT16), 1)
|
|
-CCOMMON_OPT += -DBUILD_BFLOAT16
|
|
-endif
|
|
-ifeq ($(BUILD_SINGLE), 1)
|
|
-CCOMMON_OPT += -DBUILD_SINGLE=1
|
|
-endif
|
|
-ifeq ($(BUILD_DOUBLE), 1)
|
|
-CCOMMON_OPT += -DBUILD_DOUBLE=1
|
|
-endif
|
|
-ifeq ($(BUILD_COMPLEX), 1)
|
|
-CCOMMON_OPT += -DBUILD_COMPLEX=1
|
|
-endif
|
|
-ifeq ($(BUILD_COMPLEX16), 1)
|
|
-CCOMMON_OPT += -DBUILD_COMPLEX16=1
|
|
-endif
|
|
-
|
|
-CCOMMON_OPT += -DVERSION=\"$(VERSION)\"
|
|
-
|
|
-ifndef SYMBOLPREFIX
|
|
-SYMBOLPREFIX =
|
|
-endif
|
|
-
|
|
-ifndef SYMBOLSUFFIX
|
|
-SYMBOLSUFFIX =
|
|
-endif
|
|
-
|
|
-ifndef LIBSONAMEBASE
|
|
-LIBSONAMEBASE = openblas
|
|
-endif
|
|
-
|
|
-ifndef LIBNAMESUFFIX
|
|
-LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)
|
|
-else
|
|
-LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)_$(LIBNAMESUFFIX)
|
|
-endif
|
|
-
|
|
-ifeq ($(OSNAME), CYGWIN_NT)
|
|
-LIBPREFIX = cyg$(LIBNAMEBASE)
|
|
-else
|
|
-LIBPREFIX = lib$(LIBNAMEBASE)
|
|
-endif
|
|
-
|
|
-KERNELDIR = $(TOPDIR)/kernel/$(ARCH)
|
|
-
|
|
-include $(TOPDIR)/Makefile.$(ARCH)
|
|
-
|
|
-ifneq ($(C_COMPILER), PGI)
|
|
-ifneq ($(C_COMPILER), SUN)
|
|
-CCOMMON_OPT += -UASMNAME -UASMFNAME -UNAME -UCNAME -UCHAR_NAME -UCHAR_CNAME
|
|
-endif
|
|
-endif
|
|
-CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\"
|
|
-
|
|
-ifeq ($(CORE), PPC440)
|
|
-CCOMMON_OPT += -DALLOC_QALLOC
|
|
-endif
|
|
-
|
|
-ifeq ($(CORE), PPC440FP2)
|
|
-STATIC_ALLOCATION = 1
|
|
-endif
|
|
-
|
|
-ifneq ($(OSNAME), Linux)
|
|
-NO_AFFINITY = 1
|
|
-endif
|
|
-
|
|
-ifneq ($(ARCH), x86_64)
|
|
-ifneq ($(ARCH), x86)
|
|
-NO_AFFINITY = 1
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifdef NO_AFFINITY
|
|
-ifeq ($(NO_AFFINITY), 0)
|
|
-override undefine NO_AFFINITY
|
|
-else
|
|
-CCOMMON_OPT += -DNO_AFFINITY
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifdef FUNCTION_PROFILE
|
|
-CCOMMON_OPT += -DFUNCTION_PROFILE
|
|
-endif
|
|
-
|
|
-ifdef HUGETLB_ALLOCATION
|
|
-CCOMMON_OPT += -DALLOC_HUGETLB
|
|
-endif
|
|
-
|
|
-ifdef HUGETLBFILE_ALLOCATION
|
|
-CCOMMON_OPT += -DALLOC_HUGETLBFILE -DHUGETLB_FILE_NAME=$(HUGETLBFILE_ALLOCATION)
|
|
-endif
|
|
-
|
|
-ifdef STATIC_ALLOCATION
|
|
-CCOMMON_OPT += -DALLOC_STATIC
|
|
-endif
|
|
-
|
|
-ifdef DEVICEDRIVER_ALLOCATION
|
|
-CCOMMON_OPT += -DALLOC_DEVICEDRIVER -DDEVICEDRIVER_NAME=\"/dev/mapper\"
|
|
-endif
|
|
-
|
|
-ifdef MIXED_MEMORY_ALLOCATION
|
|
-CCOMMON_OPT += -DMIXED_MEMORY_ALLOCATION
|
|
-endif
|
|
-
|
|
-ifeq ($(OSNAME), SunOS)
|
|
-TAR = gtar
|
|
-PATCH = gpatch
|
|
-GREP = ggrep
|
|
-AWK = nawk
|
|
-else
|
|
-TAR = tar
|
|
-PATCH = patch
|
|
-GREP = grep
|
|
-AWK = awk
|
|
-endif
|
|
-
|
|
-ifndef MD5SUM
|
|
-MD5SUM = md5sum
|
|
-endif
|
|
-
|
|
-
|
|
-REVISION = -r$(VERSION)
|
|
-MAJOR_VERSION = $(word 1,$(subst ., ,$(VERSION)))
|
|
-
|
|
-ifeq ($(DEBUG), 1)
|
|
-COMMON_OPT += -g
|
|
-endif
|
|
-
|
|
-ifeq ($(DEBUG), 1)
|
|
-FCOMMON_OPT += -g
|
|
-endif
|
|
-
|
|
-ifndef COMMON_OPT
|
|
-COMMON_OPT = -O2
|
|
-endif
|
|
-
|
|
-ifndef FCOMMON_OPT
|
|
-FCOMMON_OPT = -O2 -frecursive
|
|
-endif
|
|
-
|
|
-override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR)
|
|
-override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF)
|
|
-override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT)
|
|
-override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF)
|
|
-#MAKEOVERRIDES =
|
|
-
|
|
-ifeq ($(NEED_PIC), 1)
|
|
-ifeq (,$(findstring PIC,$(FFLAGS)))
|
|
-ifneq ($(F_COMPILER),IBM)
|
|
-override FFLAGS += -fPIC
|
|
-endif
|
|
-endif
|
|
-endif
|
|
-
|
|
-#For LAPACK Fortran codes.
|
|
-#Disable -fopenmp for LAPACK Fortran codes on Windows.
|
|
-ifdef OS_WINDOWS
|
|
-LAPACK_FFLAGS := $(filter-out -fopenmp -mp -openmp -xopenmp=parallel,$(FFLAGS))
|
|
-LAPACK_FPFLAGS := $(filter-out -fopenmp -mp -openmp -xopenmp=parallel,$(FPFLAGS))
|
|
-else
|
|
-LAPACK_FFLAGS := $(FFLAGS)
|
|
-LAPACK_FPFLAGS := $(FPFLAGS)
|
|
-endif
|
|
-
|
|
-ifeq ($(F_COMPILER),NAG)
|
|
-LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
|
|
-override FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
|
|
-endif
|
|
-ifeq ($(F_COMPILER),CRAY)
|
|
-LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
|
|
-override FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
|
|
-endif
|
|
-
|
|
-LAPACK_CFLAGS = $(CFLAGS)
|
|
-LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H
|
|
-ifdef INTERFACE64
|
|
-ifneq ($(INTERFACE64), 0)
|
|
-LAPACK_CFLAGS += -DLAPACK_ILP64
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifdef OS_WINDOWS
|
|
-LAPACK_CFLAGS += -DOPENBLAS_OS_WINDOWS
|
|
-LAPACK_CFLAGS += -DLAPACK_COMPLEX_STRUCTURE
|
|
-endif
|
|
-ifeq ($(C_COMPILER), LSB)
|
|
-LAPACK_CFLAGS += -DLAPACK_COMPLEX_STRUCTURE
|
|
-endif
|
|
-
|
|
-ifndef SUFFIX
|
|
-SUFFIX = o
|
|
-endif
|
|
-
|
|
-ifndef PSUFFIX
|
|
-PSUFFIX = po
|
|
-endif
|
|
-
|
|
-ifndef LIBSUFFIX
|
|
-LIBSUFFIX = a
|
|
-endif
|
|
-
|
|
-ifneq ($(DYNAMIC_ARCH), 1)
|
|
-ifndef SMP
|
|
-LIBNAME = $(LIBPREFIX)_$(LIBCORE)$(REVISION).$(LIBSUFFIX)
|
|
-LIBNAME_P = $(LIBPREFIX)_$(LIBCORE)$(REVISION)_p.$(LIBSUFFIX)
|
|
-else
|
|
-LIBNAME = $(LIBPREFIX)_$(LIBCORE)p$(REVISION).$(LIBSUFFIX)
|
|
-LIBNAME_P = $(LIBPREFIX)_$(LIBCORE)p$(REVISION)_p.$(LIBSUFFIX)
|
|
-endif
|
|
-else
|
|
-ifndef SMP
|
|
-LIBNAME = $(LIBPREFIX)$(REVISION).$(LIBSUFFIX)
|
|
-LIBNAME_P = $(LIBPREFIX)$(REVISION)_p.$(LIBSUFFIX)
|
|
-else
|
|
-LIBNAME = $(LIBPREFIX)p$(REVISION).$(LIBSUFFIX)
|
|
-LIBNAME_P = $(LIBPREFIX)p$(REVISION)_p.$(LIBSUFFIX)
|
|
-endif
|
|
-endif
|
|
-
|
|
-
|
|
-LIBDLLNAME = $(LIBPREFIX).dll
|
|
-IMPLIBNAME = lib$(LIBNAMEBASE).dll.a
|
|
-ifneq ($(OSNAME), AIX)
|
|
-LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so)
|
|
-else
|
|
-LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.a)
|
|
-endif
|
|
-LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib)
|
|
-LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def)
|
|
-LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp)
|
|
-LIBZIPNAME = $(LIBNAME:.$(LIBSUFFIX)=.zip)
|
|
-
|
|
-LIBS = $(TOPDIR)/$(LIBNAME)
|
|
-LIBS_P = $(TOPDIR)/$(LIBNAME_P)
|
|
-
|
|
-
|
|
-LIB_COMPONENTS = BLAS
|
|
-ifneq ($(NO_CBLAS), 1)
|
|
-LIB_COMPONENTS += CBLAS
|
|
-endif
|
|
-
|
|
-ifneq ($(NO_LAPACK), 1)
|
|
-LIB_COMPONENTS += LAPACK
|
|
-ifneq ($(NO_LAPACKE), 1)
|
|
-LIB_COMPONENTS += LAPACKE
|
|
-endif
|
|
-ifeq ($(BUILD_RELAPACK), 1)
|
|
-LIB_COMPONENTS += ReLAPACK
|
|
-endif
|
|
-endif
|
|
-
|
|
-ifeq ($(ONLY_CBLAS), 1)
|
|
-LIB_COMPONENTS = CBLAS
|
|
-endif
|
|
-
|
|
-export OSNAME
|
|
-export ARCH
|
|
-export CORE
|
|
-export LIBCORE
|
|
-export __BYTE_ORDER__
|
|
-export ELF_VERSION
|
|
-export PGCPATH
|
|
-export CONFIG
|
|
-export CC
|
|
-export FC
|
|
-export BU
|
|
-export FU
|
|
-export NEED2UNDERSCORES
|
|
-export USE_THREAD
|
|
-export NUM_THREADS
|
|
-export NUM_CORES
|
|
-export SMP
|
|
-export MAKEFILE_RULE
|
|
-export NEED_PIC
|
|
-export BINARY
|
|
-export BINARY32
|
|
-export BINARY64
|
|
-export F_COMPILER
|
|
-export C_COMPILER
|
|
-export USE_OPENMP
|
|
-export CROSS
|
|
-export CROSS_SUFFIX
|
|
-export NOFORTRAN
|
|
-export C_LAPACK
|
|
-export NO_FBLAS
|
|
-export EXTRALIB
|
|
-export CEXTRALIB
|
|
-export FEXTRALIB
|
|
-export HAVE_SSE
|
|
-export HAVE_SSE2
|
|
-export HAVE_SSE3
|
|
-export HAVE_SSSE3
|
|
-export HAVE_SSE4_1
|
|
-export HAVE_SSE4_2
|
|
-export HAVE_SSE4A
|
|
-export HAVE_SSE5
|
|
-export HAVE_AVX
|
|
-export HAVE_AVX2
|
|
-export HAVE_FMA3
|
|
-export HAVE_VFP
|
|
-export HAVE_VFPV3
|
|
-export HAVE_VFPV4
|
|
-export HAVE_NEON
|
|
-ifndef NO_MSA
|
|
- export HAVE_MSA
|
|
- export MSA_FLAGS
|
|
-endif
|
|
-export KERNELDIR
|
|
-export FUNCTION_PROFILE
|
|
-export TARGET_CORE
|
|
-export NO_AVX512
|
|
-export NO_AVX2
|
|
-export BUILD_BFLOAT16
|
|
-export NO_LSX
|
|
-export NO_LASX
|
|
-
|
|
-export SBGEMM_UNROLL_M
|
|
-export SBGEMM_UNROLL_N
|
|
-export SGEMM_UNROLL_M
|
|
-export SGEMM_UNROLL_N
|
|
-export DGEMM_UNROLL_M
|
|
-export DGEMM_UNROLL_N
|
|
-export QGEMM_UNROLL_M
|
|
-export QGEMM_UNROLL_N
|
|
-export CGEMM_UNROLL_M
|
|
-export CGEMM_UNROLL_N
|
|
-export ZGEMM_UNROLL_M
|
|
-export ZGEMM_UNROLL_N
|
|
-export XGEMM_UNROLL_M
|
|
-export XGEMM_UNROLL_N
|
|
-export CGEMM3M_UNROLL_M
|
|
-export CGEMM3M_UNROLL_N
|
|
-export ZGEMM3M_UNROLL_M
|
|
-export ZGEMM3M_UNROLL_N
|
|
-export XGEMM3M_UNROLL_M
|
|
-export XGEMM3M_UNROLL_N
|
|
-
|
|
-
|
|
-ifdef USE_CUDA
|
|
-export CUDADIR
|
|
-export CUCC
|
|
-export CUFLAGS
|
|
-export CULIB
|
|
-endif
|
|
-
|
|
-.SUFFIXES: .$(PSUFFIX) .$(SUFFIX) .f
|
|
-
|
|
-.f.$(SUFFIX):
|
|
- $(FC) $(FFLAGS) -c $< -o $(@F)
|
|
-
|
|
-.f.$(PSUFFIX):
|
|
- $(FC) $(FPFLAGS) -pg -c $< -o $(@F)
|
|
-
|
|
-
|
|
-ifdef BINARY64
|
|
-PATHSCALEPATH = /opt/pathscale/lib/3.1
|
|
-PGIPATH = /opt/pgi/linux86-64/7.1-5/lib
|
|
-else
|
|
-PATHSCALEPATH = /opt/pathscale/lib/3.1/32
|
|
-PGIPATH = /opt/pgi/linux86/7.1-5/lib
|
|
-endif
|
|
-
|
|
-ACMLPATH = /opt/acml/4.3.0
|
|
-ifneq ($(OSNAME), Darwin)
|
|
-MKLPATH = /opt/intel/mkl/10.2.2.025/lib
|
|
-else
|
|
-MKLPATH = /Library/Frameworks/Intel_MKL.framework/Versions/10.0.1.014/lib
|
|
-endif
|
|
-ATLASPATH = /opt/atlas/3.9.17/opteron
|
|
-FLAMEPATH = $(HOME)/flame/lib
|
|
-ifneq ($(OSNAME), SunOS)
|
|
-SUNPATH = /opt/sunstudio12.1
|
|
-else
|
|
-SUNPATH = /opt/SUNWspro
|
|
-endif
|
|
diff --git a/Makefile.tail b/Makefile.tail
index 54ba649..f73a86d 100644
--- a/Makefile.tail
+++ b/Makefile.tail
@@ -583,7 +583,7 @@ gen_insn_flash.c :
	echo 'int i;' >> gen_insn_flash.c
	echo '#ifdef __alpha' >> gen_insn_flash.c
	echo 'printf(".set noat;.set noreorder;\n");' >> gen_insn_flash.c
-	echo 'printf(".arch ev6;.text;.align 5\n");' >> gen_insn_flash.c
+	echo 'printf(".arch sw6;.text;.align 5\n");' >> gen_insn_flash.c
	echo 'printf(".globl insn_flash\n");' >> gen_insn_flash.c
	echo 'printf(".ent insn_flash\n");' >> gen_insn_flash.c
	echo 'printf("insn_flash:\n");' >> gen_insn_flash.c
diff --git a/Makefile.tests b/Makefile.tests
|
|
deleted file mode 100644
|
|
index b344abc..0000000
|
|
--- a/Makefile.tests
|
|
+++ /dev/null
|
|
@@ -1,435 +0,0 @@
|
|
-TOPDIR = .
|
|
-include ./Makefile.system
|
|
-
|
|
-BLASDIRS = interface driver/level2 driver/level3 driver/others
|
|
-
|
|
-ifneq ($(DYNAMIC_ARCH), 1)
|
|
-BLASDIRS += kernel
|
|
-endif
|
|
-
|
|
-ifdef SANITY_CHECK
|
|
-BLASDIRS += reference
|
|
-endif
|
|
-
|
|
-SUBDIRS = $(BLASDIRS)
|
|
-ifneq ($(NO_LAPACK), 1)
|
|
-SUBDIRS += lapack
|
|
-endif
|
|
-
|
|
-RELA =
|
|
-ifeq ($(BUILD_RELAPACK), 1)
|
|
-RELA = re_lapack
|
|
-endif
|
|
-
|
|
-ifeq ($(NO_FORTRAN), 1)
|
|
-define NOFORTRAN
|
|
-1
|
|
-endef
|
|
-ifneq ($(NO_LAPACK), 1)
|
|
-define C_LAPACK
|
|
-1
|
|
-endef
|
|
-endif
|
|
-export NOFORTRAN
|
|
-export NO_LAPACK
|
|
-export C_LAPACK
|
|
-endif
|
|
-
|
|
-ifeq ($(F_COMPILER),CRAY)
|
|
-LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -Og -Os,$(LAPACK_FFLAGS))
|
|
-else
|
|
-LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS))
|
|
-endif
|
|
-
|
|
-SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test
|
|
-
|
|
-.PHONY : all libs netlib $(RELA) test ctest shared install
|
|
-.NOTPARALLEL : shared
|
|
-
|
|
-all :: tests
|
|
- @echo
|
|
- @echo " OpenBLAS build complete. ($(LIB_COMPONENTS))"
|
|
- @echo
|
|
- @echo " OS ... $(OSNAME) "
|
|
- @echo " Architecture ... $(ARCH) "
|
|
-ifndef BINARY64
|
|
- @echo " BINARY ... 32bit "
|
|
-else
|
|
- @echo " BINARY ... 64bit "
|
|
-endif
|
|
-
|
|
-ifdef INTERFACE64
|
|
-ifneq ($(INTERFACE64), 0)
|
|
- @echo " Use 64 bits int (equivalent to \"-i8\" in Fortran) "
|
|
-endif
|
|
-endif
|
|
- @$(CC) --version > /dev/null 2>&1;\
|
|
- if [ $$? -eq 0 ]; then \
|
|
- cverinfo=`$(CC) --version | sed -n '1p'`; \
|
|
- if [ -z "$${cverinfo}" ]; then \
|
|
- cverinfo=`$(CC) --version | sed -n '2p'`; \
|
|
- fi; \
|
|
- echo " C compiler ... $(C_COMPILER) (cmd & version : $${cverinfo})";\
|
|
- else \
|
|
- echo " C compiler ... $(C_COMPILER) (command line : $(CC))";\
|
|
- fi
|
|
-ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
|
- @$(FC) --version > /dev/null 2>&1;\
|
|
- if [ $$? -eq 0 ]; then \
|
|
- fverinfo=`$(FC) --version | sed -n '1p'`; \
|
|
- if [ -z "$${fverinfo}" ]; then \
- fverinfo=`$(FC) --version | sed -n '2p'`; \
- fi; \
- echo " Fortran compiler ... $(F_COMPILER) (cmd & version : $${fverinfo})";\
- else \
- echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))";\
- fi
-endif
-ifneq ($(OSNAME), AIX)
- @echo -n " Library Name ... $(LIBNAME)"
-else
- @echo " Library Name ... $(LIBNAME)"
-endif
-
-ifndef SMP
- @echo " (Single-threading) "
-else
- @echo " (Multi-threading; Max num-threads is $(NUM_THREADS))"
-endif
-
-ifeq ($(DYNAMIC_ARCH), 1)
- @echo " Supporting multiple $(ARCH) cpu models with minimum requirement for the common code being $(CORE)"
-endif
-
-ifeq ($(USE_OPENMP), 1)
- @echo
- @echo " Use OpenMP in the multithreading. Because of ignoring OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS flags, "
- @echo " you should use OMP_NUM_THREADS environment variable to control the number of threads."
- @echo
-endif
-
-ifeq ($(OSNAME), Darwin)
- @echo "WARNING: If you plan to use the dynamic library $(LIBDYNNAME), you must run:"
- @echo
- @echo "\"make PREFIX=/your_installation_path/ install\"."
- @echo
- @echo "(or set PREFIX in Makefile.rule and run make install."
- @echo
- @echo "Note that any flags passed to make during build should also be passed to make install"
- @echo "to circumvent any install errors."
- @echo
- @echo "If you want to move the .dylib to a new location later, make sure you change"
- @echo "the internal name of the dylib with:"
- @echo
- @echo "install_name_tool -id /new/absolute/path/to/$(LIBDYNNAME) $(LIBDYNNAME)"
-endif
- @echo
- @echo "To install the library, you can run \"make PREFIX=/path/to/your/installation install\"."
- @echo
- @echo "Note that any flags passed to make during build should also be passed to make install"
- @echo "to circumvent any install errors."
- @echo
-
-shared : libs netlib $(RELA)
-ifneq ($(NO_SHARED), 1)
-ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly))
- @$(MAKE) -C exports so
- @ln -fs $(LIBSONAME) $(LIBPREFIX).so
- @ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
-endif
-ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD))
- @$(MAKE) -C exports so
- @ln -fs $(LIBSONAME) $(LIBPREFIX).so
-endif
-ifeq ($(OSNAME), Darwin)
- @$(MAKE) -C exports dyn
- @ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
- @ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
-endif
-ifeq ($(OSNAME), WINNT)
- @$(MAKE) -C exports dll
-endif
-ifeq ($(OSNAME), CYGWIN_NT)
- @$(MAKE) -C exports dll
-endif
-endif
-
-tests : shared
-ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
- touch $(LIBNAME)
-ifndef NO_FBLAS
- $(MAKE) -C test all
-endif
-endif
-ifneq ($(ONLY_CBLAS), 1)
- $(MAKE) -C utest all
-endif
-ifneq ($(NO_CBLAS), 1)
-ifneq ($(ONLY_CBLAS), 1)
- $(MAKE) -C ctest all
-endif
-ifeq ($(CPP_THREAD_SAFETY_TEST), 1)
- $(MAKE) -C cpp_thread_test all
-endif
-endif
-
-libs :
-ifeq ($(CORE), UNKNOWN)
- $(error OpenBLAS: Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for the detail.)
-endif
-ifeq ($(NOFORTRAN), 1)
- $(info OpenBLAS: Detecting fortran compiler failed. Can only compile BLAS and f2c-converted LAPACK.)
-endif
-ifeq ($(NO_STATIC), 1)
-ifeq ($(NO_SHARED), 1)
- $(error OpenBLAS: neither static nor shared are enabled.)
-endif
-endif
- @for d in $(SUBDIRS) ; \
- do if test -d $$d; then \
- $(MAKE) -C $$d $(@F) || exit 1 ; \
- fi; \
- done
-#Save the config files for installation
- @cp Makefile.conf Makefile.conf_last
- @cp config.h config_last.h
-ifdef QUAD_PRECISION
- @echo "#define QUAD_PRECISION">> config_last.h
-endif
-ifeq ($(EXPRECISION), 1)
- @echo "#define EXPRECISION">> config_last.h
-endif
-##
-ifeq ($(DYNAMIC_ARCH), 1)
- @$(MAKE) -C kernel commonlibs || exit 1
- @for d in $(DYNAMIC_CORE) ; \
- do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\
- done
- @echo DYNAMIC_ARCH=1 >> Makefile.conf_last
-ifeq ($(DYNAMIC_OLDER), 1)
- @echo DYNAMIC_OLDER=1 >> Makefile.conf_last
-endif
-endif
- @echo TARGET=$(CORE) >> Makefile.conf_last
-ifdef USE_THREAD
- @echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last
-endif
-ifdef SMP
-ifdef NUM_THREADS
- @echo NUM_THREADS=$(NUM_THREADS) >> Makefile.conf_last
-else
- @echo NUM_THREADS=$(NUM_CORES) >> Makefile.conf_last
-endif
-endif
-ifeq ($(USE_OPENMP),1)
- @echo USE_OPENMP=1 >> Makefile.conf_last
-endif
-ifeq ($(INTERFACE64),1)
- @echo INTERFACE64=1 >> Makefile.conf_last
-endif
- @echo THELIBNAME=$(LIBNAME) >> Makefile.conf_last
- @echo THELIBSONAME=$(LIBSONAME) >> Makefile.conf_last
- @-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
- @touch lib.grd
-
-prof : prof_blas prof_lapack
-
-prof_blas :
- ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX)
- for d in $(SUBDIRS) ; \
- do if test -d $$d; then \
- $(MAKE) -C $$d prof || exit 1 ; \
- fi; \
- done
-ifeq ($(DYNAMIC_ARCH), 1)
- $(MAKE) -C kernel commonprof || exit 1
-endif
-
-blas :
- ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
- for d in $(BLASDIRS) ; \
- do if test -d $$d; then \
- $(MAKE) -C $$d libs || exit 1 ; \
- fi; \
- done
-
-hpl :
- ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
- for d in $(BLASDIRS) ../laswp exports ; \
- do if test -d $$d; then \
- $(MAKE) -C $$d $(@F) || exit 1 ; \
- fi; \
- done
-ifeq ($(DYNAMIC_ARCH), 1)
- $(MAKE) -C kernel commonlibs || exit 1
- for d in $(DYNAMIC_CORE) ; \
- do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\
- done
-endif
-
-hpl_p :
- ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX)
- for d in $(SUBDIRS) ../laswp exports ; \
- do if test -d $$d; then \
- $(MAKE) -C $$d $(@F) || exit 1 ; \
- fi; \
- done
-
-netlib : lapack_prebuild
-ifneq ($(NO_LAPACK), 1)
- @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib
- @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib
-endif
-ifneq ($(NO_LAPACKE), 1)
- @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib
-endif
-
-ifeq ($(NO_LAPACK), 1)
-re_lapack :
-
-else
-re_lapack :
- @$(MAKE) -C relapack
-endif
-
-prof_lapack : lapack_prebuild
- @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof
-
-lapack_prebuild :
-ifeq ($(NO_LAPACK), $(filter 0,$(NO_LAPACK)))
- -@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
-ifeq ($(F_COMPILER), GFORTRAN)
- -@echo "override FFLAGS = $(LAPACK_FFLAGS) -fno-tree-vectorize" >> $(NETLIB_LAPACK_DIR)/make.inc
-else
- -@echo "override FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-endif
- -@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
- -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
- -@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
- -@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
-ifeq ($(C_COMPILER)$(F_COMPILER)$(USE_OPENMP), CLANGGFORTRAN1)
- -@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB) -lomp" >> $(NETLIB_LAPACK_DIR)/make.inc
-else
- -@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
-endif
- -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
- -@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
- -@echo "AR = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
- -@echo "ARFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc
- -@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
- -@echo "LAPACKLIB = ../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
- -@echo "TMGLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
- -@echo "BLASLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
- -@echo "LAPACKELIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
- -@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> $(NETLIB_LAPACK_DIR)/make.inc
- -@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
- -@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
- -@echo "CEXTRALIB = $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
-ifeq ($(F_COMPILER), GFORTRAN)
- -@echo "TIMER = INT_ETIME" >> $(NETLIB_LAPACK_DIR)/make.inc
-ifdef SMP
-ifeq ($(OSNAME), WINNT)
- -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
-else ifeq ($(OSNAME), Haiku)
- -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
-else
- -@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc
-endif
-else
- -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
-endif
-else
- -@echo "TIMER = NONE" >> $(NETLIB_LAPACK_DIR)/make.inc
- -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
-endif
-ifeq ($(BUILD_LAPACK_DEPRECATED), 1)
- -@echo "BUILD_DEPRECATED = 1" >> $(NETLIB_LAPACK_DIR)/make.inc
-endif
-ifeq ($(BUILD_SINGLE), 1)
- -@echo "BUILD_SINGLE = 1" >> $(NETLIB_LAPACK_DIR)/make.inc
-endif
-ifeq ($(BUILD_DOUBLE), 1)
- -@echo "BUILD_DOUBLE = 1" >> $(NETLIB_LAPACK_DIR)/make.inc
-endif
-ifeq ($(BUILD_COMPLEX), 1)
- -@echo "BUILD_COMPLEX = 1" >> $(NETLIB_LAPACK_DIR)/make.inc
-endif
-ifeq ($(BUILD_COMPLEX16), 1)
- -@echo "BUILD_COMPLEX16 = 1" >> $(NETLIB_LAPACK_DIR)/make.inc
-endif
- -@echo "LAPACKE_WITH_TMG = 1" >> $(NETLIB_LAPACK_DIR)/make.inc
- -@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc
-endif
-
-large.tgz :
-ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
- if [ ! -a $< ]; then
- -wget http://www.netlib.org/lapack/timing/large.tgz;
- fi
-endif
-
-timing.tgz :
-ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
- if [ ! -a $< ]; then
- -wget http://www.netlib.org/lapack/timing/timing.tgz;
- fi
-endif
-
-lapack-timing : large.tgz timing.tgz
-ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
- (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING)
- (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz )
- $(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING
-endif
-
-
-lapack-test :
- (cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out)
- $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/EIG xeigtstc xeigtstd xeigtsts xeigtstz
- $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/LIN xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc
-ifneq ($(CROSS), 1)
- ( cd $(NETLIB_LAPACK_DIR)/INSTALL; $(MAKE) all; ./testlsame; ./testslamch; ./testdlamch; \
- ./testsecond; ./testdsecnd; ./testieee; ./testversion )
- (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING)
-endif
-
-lapack-runtest: lapack-test
- ( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \
- ./testsecond; ./testdsecnd; ./testieee; ./testversion )
- (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING )
-
-
-blas-test:
- (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && rm -f x* *.out)
- $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing
- (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && cat *.out)
-
-
-dummy :
-
-install :
- $(MAKE) -f Makefile.install install
-
-clean ::
- @for d in $(SUBDIRS_ALL) ; \
- do if test -d $$d; then \
- $(MAKE) -C $$d $(@F) || exit 1 ; \
- fi; \
- done
-#ifdef DYNAMIC_ARCH
- @$(MAKE) -C kernel clean
-#endif
- @$(MAKE) -C reference clean
- @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h *.so.renamed *.a.renamed *.so.0
-ifeq ($(OSNAME), Darwin)
- @rm -rf getarch.dSYM getarch_2nd.dSYM
-endif
- @rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib
- @rm -f cblas.tmp cblas.tmp2
- @touch $(NETLIB_LAPACK_DIR)/make.inc
- @$(MAKE) -C $(NETLIB_LAPACK_DIR) clean
- @rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h
- @$(MAKE) -C relapack clean
- @rm -f *.grd Makefile.conf_last config_last.h
- @(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt)
- @echo Done.
diff --git a/c_check b/c_check
index b018c10..13a7086 100755
--- a/c_check
+++ b/c_check
@@ -84,6 +84,7 @@ case "$data" in
   *ARCH_MIPS64*) architecture=mips64 ;;
   *ARCH_MIPS*) architecture=mips ;;
   *ARCH_ALPHA*) architecture=alpha ;;
+  *ARCH_SW_64*) architecture=sw_64 ;;
   *ARCH_SPARC*) architecture=sparc ;;
   *ARCH_IA64*) architecture=ia64 ;;
   *ARCH_ARM64*) architecture=arm64 ;;
@@ -124,7 +125,7 @@ case "$architecture" in
     defined=1
     ;;
   arm|arm64) defined=1 ;;
-  zarch|e2k|alpha|ia64|riscv64|loonarch64)
+  zarch|e2k|alpha|ia64|riscv64|loonarch64|sw_64)
     defined=1
     BINARY=64
     ;;
@@ -232,6 +233,7 @@ case "$data" in
   *ARCH_MIPS64*) architecture=mips64 ;;
   *ARCH_MIPS*) architecture=mips ;;
   *ARCH_ALPHA*) architecture=alpha ;;
+  *ARCH_SW_64*) architecture=sw_64 ;;
   *ARCH_SPARC*) architecture=sparc ;;
   *ARCH_IA64*) architecture=ia64 ;;
   *ARCH_ARM64*) architecture=arm64 ;;
diff --git a/common.h b/common.h
index 4074df0..309c3f9 100644
--- a/common.h
+++ b/common.h
@@ -420,6 +420,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246
 #include "common_alpha.h"
 #endif
 
+#ifdef ARCH_SW_64
+#include "common_sw_64.h"
+#endif
+
 #if (defined(ARCH_X86) || defined(ARCH_X86_64)) && defined(__CET__) && defined(__has_include)
 #if __has_include(<cet.h>)
 #include <cet.h>
diff --git a/common_sw_64.h b/common_sw_64.h
new file mode 100644
index 0000000..e14268e
--- /dev/null
+++ b/common_sw_64.h
@@ -0,0 +1,200 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS; OR     */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/*    The views and conclusions contained in the software and        */
+/*    documentation are those of the authors and should not be       */
+/*    interpreted as representing official policies, either expressed */
+/*    or implied, of The University of Texas at Austin.              */
+/*********************************************************************/
+
+#ifndef COMMON_SW_64
+#define COMMON_SW_64
+
+#ifndef ASSEMBLER
+
+#define MB  asm("memb")
+#define WMB asm("memb")
+#define RMB asm("memb")
+
+static void __inline blas_lock(unsigned long *address){
+#ifndef __DECC
+  unsigned long tmp1, tmp2,tmp3;
+  asm volatile(
+    "1: ldl %1, %0\n"
+    "   bne %1, 2f\n"
+    "   ldi %3, %0 \n"
+    "   lldl %1, 0(%3)\n"
+    "   ldi %2, 1 \n"
+    "   wr_f %2 \n"
+    "   or  %1, 1, %2\n"
+    "   memb\n "
+    "   lstl %2, 0(%3)\n"
+    "   rd_f %2\n"
+    "   bne %1, 2f\n"
+    "   beq %2, 2f\n"
+    "   memb\n "
+    "   br $31, 3f\n"
+    "2: br $31, 1b\n"
+    "3:\n" : "=m"(*address), "=&r"(tmp1), "=&r"(tmp2),"=&r"(tmp3) : : "memory");
+#else
+  asm (
+    "10:"
+    "   ldl %t0, 0(%a0); "
+    "   bne %t0, 20f; "
+    "   ldi %t2, %a0"
+    "   lldl %t0, 0(%t2); "
+    "   ldi %t1, 1"
+    "   wr_f %t1"
+    "   or %t0, 1, %t1;"
+    "   memb; "
+    "   lstl %t1, 0(%t2); "
+    "   rd_f %t1"
+    "   bne %t0, 20f; "
+    "   beq %t1, 20f; "
+    "   memb; "
+    "   br %r31,30f; "
+    "20: "
+    "   br %r31,10b; "
+    "30:", address);
+#endif
+}
+#define BLAS_LOCK_DEFINED
+
+static __inline unsigned int rpcc(void){
+
+  unsigned int r0;
+
+#ifndef __DECC
+  asm __volatile__("rtc %0" : "=r"(r0) : : "memory");
+#else
+  r0 = asm("rtc %v0");
+#endif
+
+  return r0;
+}
+#define RPCC_DEFINED
+
+
+#define HALT ldl $0, 0($0)
+
+#ifndef __DECC
+#define GET_IMAGE(res) asm __volatile__("fmov $f1, %0" : "=f"(res) : : "memory")
+#else
+#define GET_IMAGE(res) res = dasm("fmov $f1, %f0")
+#endif
+
+#ifdef SMP
+#ifdef USE64BITINT
+static __inline long blas_quickdivide(long x, long y){
+  return x/y;
+}
+#else
+extern unsigned int blas_quick_divide_table[];
+
+static __inline int blas_quickdivide(unsigned int x, unsigned int y){
+  if (y <= 1) return x;
+  return (int)((x * (unsigned long)blas_quick_divide_table[y]) >> 32);
+}
+#endif
+#endif
+
+#define BASE_ADDRESS ((0x1b0UL << 33) | (0x1c0UL << 23) | (0x000UL << 13))
+
+#ifndef PAGESIZE
+#define PAGESIZE	( 8UL << 10)
+#define HUGE_PAGESIZE	( 4 << 20)
+#endif
+#define BUFFER_SIZE	(32UL << 20)
+
+#else
+
+#ifndef F_INTERFACE
+#define REALNAME ASMNAME
+#else
+#define REALNAME ASMFNAME
+#endif
+
+#define PROLOGUE \
+	.arch sw6; \
+	.set noat; \
+	.set noreorder; \
+.text; \
+	.align 5; \
+	.globl REALNAME; \
+	.ent REALNAME; \
+REALNAME:
+
+#ifdef PROFILE
+#define PROFCODE \
+	ldgp $gp, 0($27); \
+	ldi $28, _mcount; \
+	jsr $28, ($28), _mcount; \
+	.prologue 1
+#else
+#define PROFCODE .prologue 0
+#endif
+
+#if defined(__linux__) && defined(__ELF__)
+#define GNUSTACK .section .note.GNU-stack,"",@progbits
+#else
+#define GNUSTACK
+#endif
+
+#define EPILOGUE \
+	.end REALNAME; \
+	.ident VERSION; \
+	GNUSTACK
+
+#endif
+
+#ifdef DOUBLE
+#define SXADDQ	s8addl
+#define SXSUBL	s8subl
+#define LD	fldd
+#define ST	fstd
+#define STQ	stq
+#define ADD	faddd
+#define SUB	fsubd
+#define MUL	fmuld
+#define DIV	fdivd
+#else
+#define SXADDQ	s4addl
+#define SXSUBL	s4subl
+#define LD	flds
+#define ST	fsts
+#define STQ	stl
+#define ADD	fadds
+#define SUB	fsubs
+#define MUL	fmuls
+#define DIV	fdivs
+#endif
+#endif
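The 32-bit blas_quickdivide() in the header above replaces an integer division by a multiply with a precomputed reciprocal followed by a 32-bit right shift. The table itself is only declared extern here; its contents live elsewhere in OpenBLAS. The standalone C sketch below is not part of the patch: the table name, size, and fill formula (roughly ceil(2^32 / y)) are assumptions made for illustration, and the program simply checks that the multiply-and-shift matches plain integer division for the small operands (thread counts, partition sizes) this helper is used with.

#include <assert.h>
#include <stdio.h>

#define TABLE_MAX 256                        /* hypothetical size, >= largest divisor used */

static unsigned int quick_divide_table[TABLE_MAX];

/* same shape as blas_quickdivide() in common_sw_64.h above */
static unsigned int quickdivide(unsigned int x, unsigned int y) {
  if (y <= 1) return x;                      /* same guard as the header */
  return (unsigned int)(((unsigned long long)x * quick_divide_table[y]) >> 32);
}

int main(void) {
  /* assumed table contents: approximately ceil(2^32 / y) for each divisor y */
  for (unsigned int y = 2; y < TABLE_MAX; y++)
    quick_divide_table[y] = 0xffffffffU / y + 1;

  /* multiply + shift reproduces x / y exactly for these operand ranges */
  for (unsigned int y = 2; y < TABLE_MAX; y++)
    for (unsigned int x = 0; x < 4096; x++)
      assert(quickdivide(x, y) == x / y);

  puts("reciprocal-table division matches x / y");
  return 0;
}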
diff --git a/cpp_thread_test/Makefile b/cpp_thread_test/Makefile
index be8313e..1ab9bb8 100644
--- a/cpp_thread_test/Makefile
+++ b/cpp_thread_test/Makefile
@@ -1,14 +1,13 @@
-TOPDIR = ..
-include $(TOPDIR)/Makefile.system
+include ../Makefile.rule
 
 all :: dgemv_tester dgemm_tester
 
 dgemv_tester :
-	$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -o dgemv_tester
+	$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../libopenblas.a $(EXTRALIB) $(FEXTRALIB) -o dgemv_tester
 	./dgemv_tester
 
 dgemm_tester : dgemv_tester
-	$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -o dgemm_tester
+	$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a $(EXTRALIB) $(FEXTRALIB) -o dgemm_tester
 	./dgemm_tester
 
 clean ::
diff --git a/cpuid_sw_64.c b/cpuid_sw_64.c
new file mode 100644
index 0000000..61ed28a
--- /dev/null
+++ b/cpuid_sw_64.c
@@ -0,0 +1,105 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS; OR     */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/*    The views and conclusions contained in the software and        */
+/*    documentation are those of the authors and should not be       */
+/*    interpreted as representing official policies, either expressed */
+/*    or implied, of The University of Texas at Austin.              */
+/*********************************************************************/
+
+#if defined(__sw_64__) && defined(__DECC)
+#include <c_asm.h>
+#endif
+
+int implver(void){
+  int arch;
+
+#ifndef __DECC
+  asm __volatile__("implver %0" : "=r"(arch) : : "memory");
+#else
+  arch = asm("implver %v0");
+#endif
+  return arch;
+}
+
+void get_architecture(void){
+  printf("SW_64");
+}
+
+void get_subarchitecture(void){
+  printf("sw%d", implver() + 4);
+}
+
+void get_subdirname(void){
+  printf("sw_64");
+}
+
+char *get_corename(void){
+  return "sw_64";
+}
+
+void get_cpuconfig(void){
+  printf("#define SW%d\n", implver() + 4);
+
+  switch (implver()){
+  case 0:
+    printf("#define L1_DATA_SIZE 16384\n");
+    printf("#define L1_DATA_LINESIZE 32\n");
+    printf("#define L2_SIZE 2097152\n");
+    printf("#define L2_LINESIZE 32\n");
+    printf("#define DTB_DEFAULT_ENTRIES 32\n");
+    printf("#define DTB_SIZE 8192\n");
+    break;
+
+  case 1:
+    printf("#define L1_DATA_SIZE 16384\n");
+    printf("#define L1_DATA_LINESIZE 32\n");
+    printf("#define L2_SIZE 2097152\n");
+    printf("#define L2_LINESIZE 64\n");
+    printf("#define DTB_DEFAULT_ENTRIES 64\n");
+    printf("#define DTB_SIZE 8192\n");
+    break;
+
+  case 2:
+    printf("#define L1_DATA_SIZE 32768\n");
+    printf("#define L1_DATA_LINESIZE 64\n");
+    printf("#define L2_SIZE 4194304\n");
+    printf("#define L2_LINESIZE 64\n");
+    printf("#define DTB_DEFAULT_ENTRIES 64\n");
+    printf("#define DTB_SIZE 8192\n");
+    break;
+  }
+}
+
+void get_libname(void){
+  printf("sw%d\n", implver() + 4);
+}
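As a concrete reading of the probe above: on a part whose implver() returns 2, get_cpuconfig() selects the SW6 configuration, so the generated config header would contain roughly the following lines (reconstructed from the printf calls above, not captured from an actual build):

#define SW6
#define L1_DATA_SIZE 32768
#define L1_DATA_LINESIZE 64
#define L2_SIZE 4194304
#define L2_LINESIZE 64
#define DTB_DEFAULT_ENTRIES 64
#define DTB_SIZE 8192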
diff --git a/ctest.c b/ctest.c
index 2ccae8d..6b21d3a 100644
--- a/ctest.c
+++ b/ctest.c
@@ -137,6 +137,10 @@ ARCH_MIPS
 ARCH_ALPHA
 #endif
 
+#ifdef __sw_64__
+ARCH_SW_64
+#endif
+
 #if defined(__sparc) || defined(__sparc__)
 ARCH_SPARC
 #endif
diff --git a/getarch.c b/getarch.c
index 87384c0..306c389 100644
--- a/getarch.c
+++ b/getarch.c
@@ -1766,6 +1766,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define OPENBLAS_SUPPORTED
 #endif
 
+#ifdef __sw_64__
+#include "cpuid_sw_64.c"
+#define OPENBLAS_SUPPORTED
+#endif
 
 #ifndef OPENBLAS_SUPPORTED
 #error "This arch/CPU is not supported by OpenBLAS."
@@ -1831,7 +1835,7 @@ int main(int argc, char *argv[]){
 #ifdef FORCE
   printf("CORE=%s\n", CORENAME);
 #else
-#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__)
+#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__) || defined(__sw_64__)
   printf("CORE=%s\n", get_corename());
 #endif
 #endif
@@ -1979,7 +1983,7 @@ printf("ELF_VERSION=2\n");
 #ifdef FORCE
   printf("#define CHAR_CORENAME \"%s\"\n", CORENAME);
 #else
-#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv)
+#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__sw_64__)
   printf("#define CHAR_CORENAME \"%s\"\n", get_corename());
 #endif
 #endif
diff --git a/interface/gbmv.c b/interface/gbmv.c
index 1d58ba8..18aa50e 100644
--- a/interface/gbmv.c
+++ b/interface/gbmv.c
@@ -236,7 +236,12 @@ void CNAME(enum CBLAS_ORDER order,
 
 #ifdef SMP
   } else {
-
+//ZYX20220118
+#ifndef TRANSA
+    memset(buffer, 0, nthreads*m*sizeof(FLOAT));
+#else
+    memset(buffer, 0, nthreads*n*sizeof(FLOAT));
+#endif
     (gbmv_thread[(int)trans])(m, n, kl, ku, alpha, a, lda, x, incx, y, incy, buffer, nthreads);
 
   }
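The hunk above zeroes the multithreaded gbmv workspace before gbmv_thread is called: each of the nthreads workers accumulates a partial result of length m (no-transpose) or n (transpose) into its own slice of buffer, and those slices are summed afterwards. The toy C program below is not OpenBLAS code; it is a minimal illustration, under that assumption about the workspace layout, of why slices that a worker only partially writes must start at zero before the reduction.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef double FLOAT;                              /* stands in for OpenBLAS's FLOAT */

int main(void) {
  int m = 4, nthreads = 3;
  FLOAT y[4] = {0};
  FLOAT *buffer = malloc((size_t)nthreads * m * sizeof(FLOAT));

  memset(buffer, 0, (size_t)nthreads * m * sizeof(FLOAT));  /* the added zeroing */

  for (int tid = 0; tid < nthreads; tid++)         /* imagine these iterations run in parallel */
    buffer[(size_t)tid * m + tid] = 1.0;           /* each worker touches only part of its slice */

  for (int tid = 0; tid < nthreads; tid++)         /* reduction of the slices into y */
    for (int i = 0; i < m; i++)
      y[i] += buffer[(size_t)tid * m + i];

  printf("y = %g %g %g %g\n", y[0], y[1], y[2], y[3]);
  free(buffer);
  return 0;
}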
diff --git a/kernel/Makefile.L1 b/kernel/Makefile.L1
index 0933736..111924b 100644
--- a/kernel/Makefile.L1
+++ b/kernel/Makefile.L1
@@ -398,12 +398,16 @@ ifndef DSWAPKERNEL
 DSWAPKERNEL = swap.S
 endif
 
+#ZYX20220301
 ifndef CSWAPKERNEL
-CSWAPKERNEL = zswap.S
+CSWAPKERNEL = zswap.c
+#CSWAPKERNEL = zswap.S
 endif
 
+#ZYX20220301
 ifndef ZSWAPKERNEL
-ZSWAPKERNEL = zswap.S
+ZSWAPKERNEL = zswap.c
+#ZSWAPKERNEL = zswap.S
 endif
 
 ifndef QSWAPKERNEL
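The two hunks above switch the default C/Z swap kernels from the hand-written zswap.S to a C source (zswap.c, resolved from the target's kernel directory). Functionally, a complex SWAP only exchanges real/imaginary pairs of x and y with arbitrary strides. The loop below is a hypothetical sketch of that operation, assuming positive increments and ignoring the actual OpenBLAS kernel signature; it is illustration, not the zswap.c selected by this patch.

/* Exchange n complex (double) elements of x and y; incx/incy in complex units. */
void zswap_sketch(int n, double *x, int incx, double *y, int incy) {
  for (int i = 0; i < n; i++) {
    double re = x[2 * i * incx], im = x[2 * i * incx + 1];
    x[2 * i * incx]     = y[2 * i * incy];
    x[2 * i * incx + 1] = y[2 * i * incy + 1];
    y[2 * i * incy]     = re;
    y[2 * i * incy + 1] = im;
  }
}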
diff --git a/kernel/sw_64/KERNEL b/kernel/sw_64/KERNEL
new file mode 100644
index 0000000..d10504b
--- /dev/null
+++ b/kernel/sw_64/KERNEL
@@ -0,0 +1,176 @@
+ifndef SAMINKERNEL
+SAMINKERNEL = amax.S
+endif
+
+ifndef DAMINKERNEL
+DAMINKERNEL = amax.S
+endif
+
+ifndef CAMINKERNEL
+CAMINKERNEL = zamax.S
+endif
+
+ifndef ZAMINKERNEL
+ZAMINKERNEL = zamax.S
+endif
+
+ifndef SMINKERNEL
+SMINKERNEL = max.S
+endif
+
+ifndef DMINKERNEL
+DMINKERNEL = max.S
+endif
+
+ifndef ISAMINKERNEL
+ISAMINKERNEL = iamax.S
+endif
+
+ifndef IDAMINKERNEL
+IDAMINKERNEL = iamax.S
+endif
+
+ifndef ICAMINKERNEL
+ICAMINKERNEL = izamax.S
+endif
+
+ifndef IZAMINKERNEL
+IZAMINKERNEL = izamax.S
+endif
+
+#ZYX20220301
+ifndef LSAME_KERNEL
+LSAME_KERNEL = ../generic/lsame.c
+endif
+
+#ZYX20220120
+ifndef ISMINKERNEL
+ISMINKERNEL = amax.S
+#ISMINKERNEL = imin.c
+endif
+
+#ZYX20220120
+#ifndef ISMAXKERNEL
+#ISMAXKERNEL = imax.c
+#endif
+
+ifndef IDMINKERNEL
+IDMINKERNEL = amax.S
+endif
+
+ifndef CCOPYKERNEL
+CCOPYKERNEL = copy.S
+endif
+
+ifndef ZCOPYKERNEL
+ZCOPYKERNEL = copy.S
+endif
+
+ifndef SNRM2KERNEL
+SNRM2KERNEL = snrm2.S
+endif
+
+ifndef DNRM2KERNEL
+DNRM2KERNEL = dnrm2.S
+endif
+
+ifndef CNRM2KERNEL
+CNRM2KERNEL = cnrm2.S
+endif
+
+ifndef ZNRM2KERNEL
+ZNRM2KERNEL = znrm2.S
+endif
+
+ifndef SGEMMKERNEL
+SGEMMKERNEL = gemm_kernel_4x4.S
+SGEMM_BETA = gemm_beta.S
+SGEMMONCOPY = ../generic/gemm_ncopy_4.c
+SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
+SGEMMONCOPYOBJ = sgemm_oncopy.$(SUFFIX)
+SGEMMOTCOPYOBJ = sgemm_otcopy.$(SUFFIX)
+endif
+
+ifndef DGEMMKERNEL
+DGEMMKERNEL = gemm_kernel_4x4.S
+DGEMM_BETA = gemm_beta.S
+DGEMMONCOPY = ../generic/gemm_ncopy_4.c
+DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
+DGEMMONCOPYOBJ = dgemm_oncopy.$(SUFFIX)
+DGEMMOTCOPYOBJ = dgemm_otcopy.$(SUFFIX)
+endif
+
+ifndef CGEMMKERNEL
+CGEMMKERNEL = zgemm_kernel_2x2.S
+CGEMM_BETA = zgemm_beta.S
+CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
+CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
+CGEMMONCOPYOBJ = cgemm_oncopy.$(SUFFIX)
+CGEMMOTCOPYOBJ = cgemm_otcopy.$(SUFFIX)
+endif
+
+ifndef ZGEMMKERNEL
+ZGEMMKERNEL = zgemm_kernel_2x2.S
+ZGEMM_BETA = zgemm_beta.S
+ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
+ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
+ZGEMMONCOPYOBJ = zgemm_oncopy.$(SUFFIX)
+ZGEMMOTCOPYOBJ = zgemm_otcopy.$(SUFFIX)
+endif
+
+SGEMM_BETA = gemm_beta.S
+DGEMM_BETA = gemm_beta.S
+CGEMM_BETA = zgemm_beta.S
+ZGEMM_BETA = zgemm_beta.S
+
+ifndef STRSMKERNEL_LN
+STRSMKERNEL_LN = trsm_kernel_4x4_LN.S
+endif
+ifndef STRSMKERNEL_LT
+STRSMKERNEL_LT = trsm_kernel_4x4_LT.S
+endif
+ifndef STRSMKERNEL_RN
+STRSMKERNEL_RN = trsm_kernel_4x4_LT.S
+endif
+ifndef STRSMKERNEL_RT
+STRSMKERNEL_RT = trsm_kernel_4x4_RT.S
+endif
+
+ifndef DTRSMKERNEL_LN
+DTRSMKERNEL_LN = trsm_kernel_4x4_LN.S
+endif
+ifndef DTRSMKERNEL_LT
+DTRSMKERNEL_LT = trsm_kernel_4x4_LT.S
+endif
+ifndef DTRSMKERNEL_RN
+DTRSMKERNEL_RN = trsm_kernel_4x4_LT.S
+endif
+ifndef DTRSMKERNEL_RT
+DTRSMKERNEL_RT = trsm_kernel_4x4_RT.S
+endif
+
+ifndef CTRSMKERNEL_LN
+CTRSMKERNEL_LN = ztrsm_kernel_2x2_LN.S
+endif
+ifndef CTRSMKERNEL_LT
+CTRSMKERNEL_LT = ztrsm_kernel_2x2_LT.S
+endif
+ifndef CTRSMKERNEL_RN
+CTRSMKERNEL_RN = ztrsm_kernel_2x2_LT.S
+endif
+ifndef CTRSMKERNEL_RT
+CTRSMKERNEL_RT = ztrsm_kernel_2x2_RT.S
+endif
+
+ifndef ZTRSMKERNEL_LN
+ZTRSMKERNEL_LN = ztrsm_kernel_2x2_LN.S
+endif
+ifndef ZTRSMKERNEL_LT
+ZTRSMKERNEL_LT = ztrsm_kernel_2x2_LT.S
+endif
+ifndef ZTRSMKERNEL_RN
+ZTRSMKERNEL_RN = ztrsm_kernel_2x2_LT.S
+endif
+ifndef ZTRSMKERNEL_RT
+ZTRSMKERNEL_RT = ztrsm_kernel_2x2_RT.S
+endif
diff --git a/kernel/sw_64/Makefile b/kernel/sw_64/Makefile
new file mode 100644
index 0000000..efae70d
--- /dev/null
+++ b/kernel/sw_64/Makefile
@@ -0,0 +1,2 @@
+clean ::
+
diff --git a/kernel/sw_64/amax.S b/kernel/sw_64/amax.S
|
|
new file mode 100644
|
|
index 0000000..300a2f7
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/amax.S
|
|
@@ -0,0 +1,283 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+
|
|
+#ifndef USE_MIN
|
|
+#define CMPLT(a, b) fcmplt a, b
|
|
+#else
|
|
+#define CMPLT(a, b) fcmplt b, a
|
|
+#endif
|
|
+
|
|
+#define STACKSIZE 6 * 8
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+ .frame $sp, STACKSIZE, $26, 0
|
|
+
|
|
+ ldi $sp, -STACKSIZE($sp)
|
|
+ nop
|
|
+ .align 4
|
|
+
|
|
+ fstd $f2, 0($sp)
|
|
+ fclr $f16
|
|
+ cmplt $31, N, $2
|
|
+ unop
|
|
+
|
|
+ fstd $f3, 8($sp)
|
|
+ fclr $f17
|
|
+ cmplt $31, INCX, $3
|
|
+ unop
|
|
+
|
|
+ fstd $f4, 16($sp)
|
|
+ fclr $f18
|
|
+ SXADDQ INCX, $31, INCX
|
|
+ unop
|
|
+
|
|
+ fstd $f5, 24($sp)
|
|
+ fclr $f19
|
|
+ and $2, $3, $0
|
|
+ unop
|
|
+
|
|
+ fstd $f6, 32($sp)
|
|
+ fclr $f0
|
|
+ sra N, 3, $1
|
|
+ beq $0, $End # if (n <= 0) or (incx <= 0) return
|
|
+ .align 4
|
|
+
|
|
+ LD $f20, 0 * SIZE(X)
|
|
+ unop
|
|
+ fabs $f20, $f0
|
|
+ ble $1, $L15
|
|
+ .align 4
|
|
+
|
|
+ fabs $f20, $f1
|
|
+ unop
|
|
+ addl X, INCX, X
|
|
+ unop
|
|
+
|
|
+ LD $f21, 0 * SIZE(X)
|
|
+ fabs $f20, $f2
|
|
+ addl X, INCX, X
|
|
+ unop
|
|
+
|
|
+ LD $f22, 0 * SIZE(X)
|
|
+ fabs $f20, $f3
|
|
+ addl X, INCX, X
|
|
+ unop
|
|
+
|
|
+ LD $f23, 0 * SIZE(X)
|
|
+ fabs $f20, $f4
|
|
+ addl X, INCX, X
|
|
+ unop
|
|
+
|
|
+ LD $f24, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ fabs $f20, $f5
|
|
+ unop
|
|
+
|
|
+ LD $f25, 0 * SIZE(X)
|
|
+ fabs $f20, $f6
|
|
+ addl X, INCX, X
|
|
+ unop
|
|
+
|
|
+ LD $f26, 0 * SIZE(X)
|
|
+ fabs $f20, $f28
|
|
+ addl X, INCX, X
|
|
+ ldi $1, -1($1)
|
|
+
|
|
+ LD $f27, 0 * SIZE(X)
|
|
+ unop
|
|
+ addl X, INCX, X
|
|
+ ble $1, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ fselne $f16, $f12, $f4, $f4
|
|
+ unop
|
|
+ fabs $f20, $f29
|
|
+ fillcs 56 * SIZE(X)
|
|
+
|
|
+ fselne $f17, $f13, $f5, $f5
|
|
+ LD $f20, 0 * SIZE(X)
|
|
+ fabs $f21, $f30
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fselne $f18, $f14, $f6, $f6
|
|
+ LD $f21, 0 * SIZE(X)
|
|
+ fabs $f22, $f10
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fselne $f19, $f15, $f28, $f28
|
|
+ LD $f22, 0 * SIZE(X)
|
|
+ fabs $f23, $f11
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fabs $f24, $f12
|
|
+ LD $f23, 0 * SIZE(X)
|
|
+ CMPLT($f0, $f29), $f16
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fabs $f25, $f13
|
|
+ LD $f24, 0 * SIZE(X)
|
|
+ CMPLT($f1, $f30), $f17
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fabs $f26, $f14
|
|
+ LD $f25, 0 * SIZE(X)
|
|
+ CMPLT($f2, $f10), $f18
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fabs $f27, $f15
|
|
+ LD $f26, 0 * SIZE(X)
|
|
+ CMPLT($f3, $f11), $f19
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fselne $f16, $f29, $f0, $f0
|
|
+ LD $f27, 0 * SIZE(X)
|
|
+ CMPLT($f4, $f12), $f16
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fselne $f17, $f30, $f1, $f1
|
|
+ unop
|
|
+ CMPLT($f5, $f13), $f17
|
|
+ ldi $1, -1($1) # i --
|
|
+
|
|
+ fselne $f18, $f10, $f2, $f2
|
|
+ unop
|
|
+ CMPLT($f6, $f14), $f18
|
|
+ unop
|
|
+
|
|
+ fselne $f19, $f11, $f3, $f3
|
|
+ unop
|
|
+ CMPLT($f28, $f15), $f19
|
|
+ bgt $1,$L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ fselne $f16, $f12, $f4, $f4
|
|
+ fabs $f20, $f29
|
|
+ fselne $f17, $f13, $f5, $f5
|
|
+ fabs $f21, $f30
|
|
+
|
|
+ fselne $f18, $f14, $f6, $f6
|
|
+ fabs $f22, $f10
|
|
+ fselne $f19, $f15, $f28, $f28
|
|
+ fabs $f23, $f11
|
|
+
|
|
+ fabs $f24, $f12
|
|
+ CMPLT($f0, $f29), $f16
|
|
+ fabs $f25, $f13
|
|
+ CMPLT($f1, $f30), $f17
|
|
+
|
|
+ fabs $f26, $f14
|
|
+ CMPLT($f2, $f10), $f18
|
|
+ fabs $f27, $f15
|
|
+ CMPLT($f3, $f11), $f19
|
|
+
|
|
+ fselne $f16, $f29, $f0, $f0
|
|
+ CMPLT($f4, $f12), $f16
|
|
+ fselne $f17, $f30, $f1, $f1
|
|
+ CMPLT($f5, $f13), $f17
|
|
+
|
|
+ fselne $f18, $f10, $f2, $f2
|
|
+ CMPLT($f6, $f14), $f18
|
|
+ fselne $f19, $f11, $f3, $f3
|
|
+ CMPLT($f28, $f15), $f19
|
|
+
|
|
+ fselne $f16, $f12, $f4, $f4
|
|
+ CMPLT($f0, $f1), $f16
|
|
+ fselne $f17, $f13, $f5, $f5
|
|
+ CMPLT($f2, $f3), $f17
|
|
+
|
|
+ fselne $f18, $f14, $f6, $f6
|
|
+ CMPLT($f4, $f5), $f18
|
|
+ fselne $f19, $f15, $f28, $f28
|
|
+ CMPLT($f6, $f28), $f19
|
|
+
|
|
+ fselne $f16, $f1, $f0, $f0
|
|
+ fselne $f17, $f3, $f2, $f2
|
|
+ fselne $f18, $f5, $f4, $f4
|
|
+ fselne $f19, $f28, $f6, $f6
|
|
+
|
|
+ CMPLT($f0, $f2), $f16
|
|
+ CMPLT($f4, $f6), $f17
|
|
+
|
|
+ fselne $f16, $f2, $f0, $f0
|
|
+ fselne $f17, $f6, $f4, $f0
|
|
+
|
|
+ CMPLT($f0, $f4), $f16
|
|
+ fselne $f16, $f4, $f0, $f0
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and N, 7, $1
|
|
+ unop
|
|
+ unop
|
|
+ ble $1, $End
|
|
+ .align 4
|
|
+
|
|
+$L16:
|
|
+ LD $f20, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fabs $f20, $f29
|
|
+ CMPLT($f0, $f29), $f16
|
|
+ fselne $f16, $f29, $f0, $f0
|
|
+
|
|
+ ldi $1, -1($1) # i --
|
|
+ bgt $1, $L16
|
|
+ .align 4
|
|
+
|
|
+$End:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 8($sp)
|
|
+ fldd $f4, 16($sp)
|
|
+ fldd $f5, 24($sp)
|
|
+
|
|
+ fldd $f6, 32($sp)
|
|
+ ldi $sp, STACKSIZE($sp)
|
|
+ ret
|
|
+
|
|
+ EPILOGUE
|
|
diff --git a/kernel/sw_64/asum.S b/kernel/sw_64/asum.S
|
|
new file mode 100644
|
|
index 0000000..54e7fcb
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/asum.S
|
|
@@ -0,0 +1,230 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCHSIZE 88
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+#define I $19
|
|
+
|
|
+#define s0 $f0
|
|
+#define s1 $f1
|
|
+#define s2 $f10
|
|
+#define s3 $f11
|
|
+
|
|
+#define a0 $f12
|
|
+#define a1 $f13
|
|
+#define a2 $f14
|
|
+#define a3 $f15
|
|
+#define a4 $f16
|
|
+#define a5 $f17
|
|
+#define a6 $f18
|
|
+#define a7 $f19
|
|
+
|
|
+#define t0 $f20
|
|
+#define t1 $f21
|
|
+#define t2 $f22
|
|
+#define t3 $f23
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+
|
|
+ fclr s0
|
|
+ unop
|
|
+ fclr t0
|
|
+ ble N, $L999
|
|
+
|
|
+ sra N, 3, I
|
|
+ fclr s1
|
|
+ fclr s2
|
|
+ ble I, $L15
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ fclr t1
|
|
+ SXADDQ INCX, X, X
|
|
+ fclr t2
|
|
+
|
|
+ LD a1, 0 * SIZE(X)
|
|
+ fclr t3
|
|
+ SXADDQ INCX, X, X
|
|
+ fclr s3
|
|
+
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD a3, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD a5, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ ADD s0, t0, $f24
|
|
+ fmov $f24,s0
|
|
+ ldw $31, PREFETCHSIZE * 2 * SIZE(X)
|
|
+ fabs a0, t0
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ADD s1, t1, $f24
|
|
+ fmov $f24,s1
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ fabs a1, t1
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s2, t2,$f24
|
|
+ fmov $f24,s2
|
|
+ LD a7, 0 * SIZE(X)
|
|
+ fabs a2, t2
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s3, t3,$f24
|
|
+ fmov $f24,s3
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ fabs a3, t3
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s0, t0, $f24
|
|
+ fmov $f24,s0
|
|
+ LD a1, 0 * SIZE(X)
|
|
+ fabs a4, t0
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s1, t1, $f24
|
|
+ fmov $f24,s1
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ fabs a5, t1
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s2, t2,$f24
|
|
+ fmov $f24,s2
|
|
+ LD a3, 0 * SIZE(X)
|
|
+ fabs a6, t2
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s3, t3, $f24
|
|
+ fmov $f24,s3
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ fabs a7, t3
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD a5, 0 * SIZE(X)
|
|
+ unop
|
|
+ SXADDQ INCX, X, X
|
|
+ bne I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ ADD s0, t0,$f24
|
|
+ fmov $f24,s0
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ fabs a0, t0
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s1, t1,$f24
|
|
+ fmov $f24,s1
|
|
+ LD a7, 0 * SIZE(X)
|
|
+ fabs a1, t1
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s2, t2, $f24
|
|
+ fmov $f24,s2
|
|
+ fabs a2, t2
|
|
+ ADD s3, t3, $f24
|
|
+ fmov $f24,s3
|
|
+ fabs a3, t3
|
|
+
|
|
+ ADD s0, t0, $f24
|
|
+ fmov $f24,s0
|
|
+ fabs a4, t0
|
|
+ ADD s1, t1,$f24
|
|
+ fmov $f24,s1
|
|
+ fabs a5, t1
|
|
+ ADD s2, t2, $f24
|
|
+ fmov $f24,s2
|
|
+ fabs a6, t2
|
|
+ ADD s3, t3, $f24
|
|
+ fmov $f24,s3
|
|
+ fabs a7, t3
|
|
+
|
|
+ ADD s1, t1,$f24
|
|
+ fmov $f24,s1
|
|
+ ADD s2, t2, $f24
|
|
+ fmov $f24,s2
|
|
+ ADD s3, t3, $f24
|
|
+ fmov $f24,s3
|
|
+
|
|
+ ADD s0, s1, $f24
|
|
+ fmov $f24,s0
|
|
+ ADD s2, s3, $f24
|
|
+ fmov $f24,s2
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and N, 7, I
|
|
+ ADD s0, s2,$f24
|
|
+ fmov $f24,s0
|
|
+ unop
|
|
+ ble I, $L999
|
|
+ .align 4
|
|
+
|
|
+$L17:
|
|
+ ADD s0, t0, a0
|
|
+ fmov a0,s0
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ fabs a0, t0
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bne I, $L17
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ ADD s0, t0,$f24
|
|
+ fmov $f24,s0
|
|
+ ret
|
|
+ EPILOGUE
|
|
diff --git a/kernel/sw_64/asum.S.bak b/kernel/sw_64/asum.S.bak
|
|
new file mode 100644
|
|
index 0000000..faf7827
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/asum.S.bak
|
|
@@ -0,0 +1,206 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCHSIZE 88
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+#define I $19
|
|
+
|
|
+#define s0 $f0
|
|
+#define s1 $f1
|
|
+#define s2 $f10
|
|
+#define s3 $f11
|
|
+
|
|
+#define a0 $f12
|
|
+#define a1 $f13
|
|
+#define a2 $f14
|
|
+#define a3 $f15
|
|
+#define a4 $f16
|
|
+#define a5 $f17
|
|
+#define a6 $f18
|
|
+#define a7 $f19
|
|
+
|
|
+#define t0 $f20
|
|
+#define t1 $f21
|
|
+#define t2 $f22
|
|
+#define t3 $f23
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+
|
|
+ fclr s0
|
|
+ unop
|
|
+ fclr t0
|
|
+ ble N, $L999
|
|
+
|
|
+ sra N, 3, I
|
|
+ fclr s1
|
|
+ fclr s2
|
|
+ ble I, $L15
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ fclr t1
|
|
+ SXADDQ INCX, X, X
|
|
+ fclr t2
|
|
+
|
|
+ LD a1, 0 * SIZE(X)
|
|
+ fclr t3
|
|
+ SXADDQ INCX, X, X
|
|
+ fclr s3
|
|
+
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD a3, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD a5, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ ADD s0, t0, s0
|
|
+ fillcs PREFETCHSIZE * 2 * SIZE(X)
|
|
+ fabs a0, t0
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ fabs a1, t1
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ LD a7, 0 * SIZE(X)
|
|
+ fabs a2, t2
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ fabs a3, t3
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD a1, 0 * SIZE(X)
|
|
+ fabs a4, t0
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ fabs a5, t1
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ LD a3, 0 * SIZE(X)
|
|
+ fabs a6, t2
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ fabs a7, t3
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD a5, 0 * SIZE(X)
|
|
+ unop
|
|
+ SXADDQ INCX, X, X
|
|
+ bne I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ ADD s0, t0, s0
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ fabs a0, t0
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ LD a7, 0 * SIZE(X)
|
|
+ fabs a1, t1
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ fabs a2, t2
|
|
+ ADD s3, t3, s3
|
|
+ fabs a3, t3
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ fabs a4, t0
|
|
+ ADD s1, t1, s1
|
|
+ fabs a5, t1
|
|
+ ADD s2, t2, s2
|
|
+ fabs a6, t2
|
|
+ ADD s3, t3, s3
|
|
+ fabs a7, t3
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ ADD s2, t2, s2
|
|
+ ADD s3, t3, s3
|
|
+
|
|
+ ADD s0, s1, s0
|
|
+ ADD s2, s3, s2
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and N, 7, I
|
|
+ ADD s0, s2, s0
|
|
+ unop
|
|
+ ble I, $L999
|
|
+ .align 4
|
|
+
|
|
+$L17:
|
|
+ ADD s0, t0, s0
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ fabs a0, t0
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bne I, $L17
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ ADD s0, t0, s0
|
|
+ ret
|
|
+ EPILOGUE
|
|
diff --git a/kernel/sw_64/asum_simd.S b/kernel/sw_64/asum_simd.S
|
|
new file mode 100644
|
|
index 0000000..f9152ec
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/asum_simd.S
|
|
@@ -0,0 +1,342 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCHSIZE 88
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+#define I $19
|
|
+
|
|
+#define s0 $f0
|
|
+#define s1 $f1
|
|
+#define s2 $f10
|
|
+#define s3 $f11
|
|
+
|
|
+#define a0 $f12
|
|
+#define a1 $f13
|
|
+#define a2 $f14
|
|
+#define a3 $f15
|
|
+#define a4 $f16
|
|
+#define a5 $f17
|
|
+#define a6 $f18
|
|
+#define a7 $f19
|
|
+
|
|
+#define t0 $f20
|
|
+#define t1 $f21
|
|
+#define t2 $f22
|
|
+#define t3 $f23
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+
|
|
+ fclr s0
|
|
+ unop
|
|
+ fclr t0
|
|
+ ble N, $L999
|
|
+
|
|
+ cmpeq INCX, 1, $3
|
|
+ beq $3, $Sub
|
|
+ .align 4
|
|
+
|
|
+/*
|
|
+ Unloop 16
|
|
+*/
|
|
+
|
|
+/**
|
|
+ test the address of X
|
|
+**/
|
|
+ and X, (VEC_LEN*SIZE-1), $4
|
|
+ nop
|
|
+ nop
|
|
+ beq $4, $Align
|
|
+
|
|
+/**
|
|
+ process the unalign address of X
|
|
+**/
|
|
+
|
|
+/*if N is too small(less then unroll size), don't need process unalign X. Just jump to remain section.*/
|
|
+ sra N, 4, I
|
|
+ fclr s1
|
|
+ fclr s2
|
|
+ ble I, $Remain
|
|
+
|
|
+ sra $4, BASE_SHIFT, $4
|
|
+ ldi $3, VEC_LEN
|
|
+ subl $3, $4, $4
|
|
+ nop
|
|
+
|
|
+$UnAlign_X_Loop:
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ addl X, SIZE, X
|
|
+ fabs a0, t0
|
|
+ subl $4, 1, $4
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ subl N, 1, N
|
|
+ nop
|
|
+ bgt $4, $UnAlign_X_Loop
|
|
+
|
|
+$Align:
|
|
+ sra N, 4, I
|
|
+ fclr s1
|
|
+ fclr s2
|
|
+ ble I, $Remain
|
|
+
|
|
+ VLD a0, 0*VEC_LEN*SIZE(X)
|
|
+ vcpys $f31, $f31, t0
|
|
+ VLD a1, 1*VEC_LEN*SIZE(X)
|
|
+ vcpys $f31, $f31, t1
|
|
+
|
|
+ VLD a2, 2*VEC_LEN*SIZE(X)
|
|
+ vcpys $f31, $f31, t2
|
|
+ VLD a3, 3*VEC_LEN*SIZE(X)
|
|
+ vcpys $f31, $f31, t3
|
|
+
|
|
+ subl I, 1, I
|
|
+ addl X, 16*SIZE, X
|
|
+ unop
|
|
+ ble I, $MainLoopEnd
|
|
+
|
|
+$MainLoop:
|
|
+
|
|
+ vcpys $f31, a0, a4
|
|
+ VLD a0, 0*VEC_LEN*SIZE(X)
|
|
+ vcpys $f31, a1, a5
|
|
+ VLD a1, 1*VEC_LEN*SIZE(X)
|
|
+
|
|
+ vcpys $f31, a2, a6
|
|
+ VLD a2, 2*VEC_LEN*SIZE(X)
|
|
+ vcpys $f31, a3, a7
|
|
+ VLD a3, 3*VEC_LEN*SIZE(X)
|
|
+
|
|
+ VADD t0, a4, t0
|
|
+ subl I, 1, I
|
|
+ VADD t1, a5, t1
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+
|
|
+ VADD t2, a6, t2
|
|
+ addl X, 16*SIZE, X
|
|
+ VADD t3, a7, t3
|
|
+ bgt I, $MainLoop
|
|
+
|
|
+$MainLoopEnd:
|
|
+ /*fabs*/
|
|
+
|
|
+ vcpys $f31, a0, a4
|
|
+ vcpys $f31, a1, a5
|
|
+ vcpys $f31, a2, a6
|
|
+ vcpys $f31, a3, a7
|
|
+
|
|
+ VADD t0, a4, t0
|
|
+ VADD t1, a5, t1
|
|
+ VADD t2, a6, t2
|
|
+ VADD t3, a7, t3
|
|
+
|
|
+ VADD t0, t1, t0
|
|
+ VADD t2, t3, t2
|
|
+ VADD t0, t2, t0
|
|
+ nop
|
|
+
|
|
+ vextf t0, 1, s1
|
|
+ vextf t0, 2, s2
|
|
+ vextf t0, 3, s3
|
|
+ nop
|
|
+
|
|
+ /*sum*/
|
|
+ ADD t0, s1, t0
|
|
+ ADD s2, s3, s2
|
|
+ ADD s0, t0, s0
|
|
+ nop
|
|
+$Remain:
|
|
+ and N, 15, I
|
|
+ ADD s0, s2, s0
|
|
+ unop
|
|
+ ble I, $End
|
|
+ .align 4
|
|
+
|
|
+$RemainLoop:
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ fabs a0, t0
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ bne I, $RemainLoop
|
|
+ .align 4
|
|
+
|
|
+$End:
|
|
+ ret
|
|
+
|
|
+
|
|
+$Sub:
|
|
+ sra N, 3, I
|
|
+ fclr s1
|
|
+ fclr s2
|
|
+ ble I, $L15
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ fclr t1
|
|
+ SXADDQ INCX, X, X
|
|
+ fclr t2
|
|
+
|
|
+ LD a1, 0 * SIZE(X)
|
|
+ fclr t3
|
|
+ SXADDQ INCX, X, X
|
|
+ fclr s3
|
|
+
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD a3, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD a5, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ ADD s0, t0, s0
|
|
+ fillcs PREFETCHSIZE * 2 * SIZE(X)
|
|
+ fabs a0, t0
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ fabs a1, t1
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ LD a7, 0 * SIZE(X)
|
|
+ fabs a2, t2
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ fabs a3, t3
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD a1, 0 * SIZE(X)
|
|
+ fabs a4, t0
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ fabs a5, t1
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ LD a3, 0 * SIZE(X)
|
|
+ fabs a6, t2
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ fabs a7, t3
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD a5, 0 * SIZE(X)
|
|
+ unop
|
|
+ SXADDQ INCX, X, X
|
|
+ bne I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ ADD s0, t0, s0
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ fabs a0, t0
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ LD a7, 0 * SIZE(X)
|
|
+ fabs a1, t1
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ fabs a2, t2
|
|
+ ADD s3, t3, s3
|
|
+ fabs a3, t3
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ fabs a4, t0
|
|
+ ADD s1, t1, s1
|
|
+ fabs a5, t1
|
|
+ ADD s2, t2, s2
|
|
+ fabs a6, t2
|
|
+ ADD s3, t3, s3
|
|
+ fabs a7, t3
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ ADD s2, t2, s2
|
|
+ ADD s3, t3, s3
|
|
+
|
|
+ ADD s0, s1, s0
|
|
+ ADD s2, s3, s2
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and N, 7, I
|
|
+ ADD s0, s2, s0
|
|
+ unop
|
|
+ ble I, $L999
|
|
+ .align 4
|
|
+
|
|
+$L17:
|
|
+ ADD s0, t0, s0
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ fabs a0, t0
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bne I, $L17
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ ADD s0, t0, s0
|
|
+ ret
|
|
+ EPILOGUE
|
|
diff --git a/kernel/sw_64/axpy.S b/kernel/sw_64/axpy.S
|
|
new file mode 100644
|
|
index 0000000..70e97d6
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/axpy.S
|
|
@@ -0,0 +1,428 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCHSIZE 40
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+ .frame $sp, 16, $26, 0
|
|
+
|
|
+ ldl $24, 0($sp)
|
|
+ fmov $f19, $f30
|
|
+ ldl $23, 8($sp)
|
|
+ ldi $sp, -16($sp)
|
|
+#ifndef PROFILE
|
|
+ .prologue 0
|
|
+#else
|
|
+ .prologue 1
|
|
+#endif
|
|
+
|
|
+ nop
|
|
+ sra $16, 3, $1
|
|
+ fstd $f2, 0($sp)
|
|
+ cmpeq $21, 1, $3
|
|
+
|
|
+ fstd $f3, 8($sp)
|
|
+ cmpeq $23, 1, $4
|
|
+ and $16, 7, $2
|
|
+ ble $16, $End
|
|
+
|
|
+ and $3, $4, $3
|
|
+ fbeq $f30, $End
|
|
+
|
|
+ beq $3, $Sub
|
|
+ ble $1, $Remain
|
|
+ .align 4
|
|
+
|
|
+ LD $f10, 0*SIZE($20)
|
|
+ LD $f11, 1*SIZE($20)
|
|
+ LD $f12, 2*SIZE($20)
|
|
+ LD $f13, 3*SIZE($20)
|
|
+
|
|
+ LD $f18, 0*SIZE($24)
|
|
+ LD $f19, 1*SIZE($24)
|
|
+ LD $f20, 2*SIZE($24)
|
|
+ LD $f21, 3*SIZE($24)
|
|
+
|
|
+ LD $f14, 4*SIZE($20)
|
|
+ LD $f15, 5*SIZE($20)
|
|
+ LD $f16, 6*SIZE($20)
|
|
+ LD $f17, 7*SIZE($20)
|
|
+
|
|
+ LD $f22, 4*SIZE($24)
|
|
+ LD $f23, 5*SIZE($24)
|
|
+ LD $f24, 6*SIZE($24)
|
|
+ LD $f25, 7*SIZE($24)
|
|
+
|
|
+ subl $1, 1, $1
|
|
+ addl $20, 8*SIZE, $20
|
|
+ unop
|
|
+ ble $1, $LoopEnd
|
|
+ .align 4
|
|
+
|
|
+$Loop:
|
|
+ fillcs PREFETCHSIZE * SIZE($24)
|
|
+ fillcs PREFETCHSIZE * SIZE($20)
|
|
+
|
|
+ MUL $f30, $f10, $f26 # ctemp1 = da * atemp1
|
|
+ LD $f10, 0*SIZE($20)
|
|
+ MUL $f30, $f11, $f27
|
|
+ LD $f11, 1*SIZE($20)
|
|
+
|
|
+ MUL $f30, $f12, $f28
|
|
+ LD $f12, 2*SIZE($20)
|
|
+ MUL $f30, $f13, $f29
|
|
+ LD $f13, 3*SIZE($20)
|
|
+
|
|
+ ADD $f18, $f26, $f0
|
|
+ LD $f18, 8*SIZE($24)
|
|
+ MUL $f30, $f14, $f26 # ctemp1 = da * atemp1
|
|
+ LD $f14, 4*SIZE($20)
|
|
+
|
|
+ ADD $f19, $f27, $f1
|
|
+ LD $f19, 9*SIZE($24)
|
|
+ MUL $f30, $f15, $f27
|
|
+ LD $f15, 5*SIZE($20)
|
|
+
|
|
+ ADD $f20, $f28, $f2
|
|
+ LD $f20, 10*SIZE($24)
|
|
+ MUL $f30, $f16, $f28
|
|
+ LD $f16, 6*SIZE($20)
|
|
+
|
|
+ ADD $f21, $f29, $f3
|
|
+ LD $f21, 11*SIZE($24)
|
|
+ MUL $f30, $f17, $f29
|
|
+ LD $f17, 7*SIZE($20)
|
|
+
|
|
+ ST $f0, 0*SIZE($24)
|
|
+ ADD $f22, $f26, $f0
|
|
+ ST $f1, 1*SIZE($24)
|
|
+ ADD $f23, $f27, $f1
|
|
+
|
|
+ ST $f2, 2*SIZE($24)
|
|
+ ADD $f24, $f28, $f2
|
|
+ ST $f3, 3*SIZE($24)
|
|
+ ADD $f25, $f29, $f3
|
|
+
|
|
+ LD $f22, 12*SIZE($24)
|
|
+ LD $f23, 13*SIZE($24)
|
|
+ LD $f24, 14*SIZE($24)
|
|
+ LD $f25, 15*SIZE($24)
|
|
+
|
|
+ ST $f0, 4*SIZE($24)
|
|
+ ST $f1, 5*SIZE($24)
|
|
+ ST $f2, 6*SIZE($24)
|
|
+ ST $f3, 7*SIZE($24)
|
|
+
|
|
+ subl $1, 1, $1
|
|
+ addl $24, 8*SIZE, $24
|
|
+ addl $20, 8*SIZE, $20
|
|
+ bgt $1, $Loop
|
|
+ .align 4
|
|
+
|
|
+$LoopEnd:
|
|
+ MUL $f30, $f10, $f26 # ctemp1 = da * atemp1
|
|
+ MUL $f30, $f11, $f27
|
|
+ MUL $f30, $f12, $f28
|
|
+ MUL $f30, $f13, $f29
|
|
+
|
|
+ ADD $f18, $f26, $f0
|
|
+ MUL $f30, $f14, $f26 # ctemp1 = da * atemp1
|
|
+ ADD $f19, $f27, $f1
|
|
+ MUL $f30, $f15, $f27
|
|
+
|
|
+ ADD $f20, $f28, $f2
|
|
+ MUL $f30, $f16, $f28
|
|
+ ADD $f21, $f29, $f3
|
|
+ MUL $f30, $f17, $f29
|
|
+
|
|
+ ST $f0, 0*SIZE($24)
|
|
+ ADD $f22, $f26, $f0
|
|
+ ST $f1, 1*SIZE($24)
|
|
+ ADD $f23, $f27, $f1
|
|
+
|
|
+ ST $f2, 2*SIZE($24)
|
|
+ ADD $f24, $f28, $f2
|
|
+ ST $f3, 3*SIZE($24)
|
|
+ ADD $f25, $f29, $f3
|
|
+
|
|
+ ST $f0, 4*SIZE($24)
|
|
+ ST $f1, 5*SIZE($24)
|
|
+ ST $f2, 6*SIZE($24)
|
|
+ ST $f3, 7*SIZE($24)
|
|
+ addl $24, 8*SIZE, $24
|
|
+ .align 4
|
|
+
|
|
+$Remain:
|
|
+ ble $2, $End
|
|
+ .align 4
|
|
+
|
|
+$RemainLoop:
|
|
+ LD $f10, 0*SIZE($20)
|
|
+ LD $f11, 0*SIZE($24)
|
|
+ addl $20, SIZE, $20
|
|
+ addl $24, SIZE, $24
|
|
+
|
|
+ MUL $f30, $f10, $f12
|
|
+ subl $2, 1, $2
|
|
+ ADD $f11, $f12, $f13
|
|
+ ST $f13, -1*SIZE($24)
|
|
+ bgt $2, $RemainLoop
|
|
+ .align 4
|
|
+
|
|
+$End:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 8($sp)
|
|
+ ldi $sp, 16($sp)
|
|
+ ret
|
|
+ .align 4
|
|
+
|
|
+$Sub:
|
|
+ SXSUBL $16, SIZE, $22
|
|
+ subl $1, 1, $4
|
|
+ ble $1, $SubRemain
|
|
+ .align 4
|
|
+
|
|
+ LD $f10, 0($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+
|
|
+ LD $f11, 0($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+ LD $f12, 0($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+
|
|
+ LD $f13, 0($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+ LD $f18, 0($24)
|
|
+ SXADDQ $23, $24, $22
|
|
+
|
|
+ LD $f19, 0($22)
|
|
+ SXADDQ $23, $22, $22
|
|
+ LD $f20, 0($22)
|
|
+ SXADDQ $23, $22, $22
|
|
+
|
|
+ LD $f21, 0($22)
|
|
+ SXADDQ $23, $22, $22
|
|
+ LD $f14, 0($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+
|
|
+ LD $f15, 0($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+ LD $f16, 0($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+
|
|
+ LD $f17, 0($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+ LD $f22, 0($22)
|
|
+ SXADDQ $23, $22, $22
|
|
+
|
|
+ LD $f23, 0($22)
|
|
+ SXADDQ $23, $22, $22
|
|
+ LD $f24, 0($22)
|
|
+ SXADDQ $23, $22, $22
|
|
+
|
|
+ LD $f25, 0($22)
|
|
+ SXADDQ $23, $22, $22
|
|
+ unop
|
|
+ ble $4, $SubLoopEnd
|
|
+ .align 4
|
|
+
|
|
+$SubLoop:
|
|
+ MUL $f30, $f10, $f26 # ctemp1 = da * atemp1
|
|
+ LD $f10, 0($20)
|
|
+ unop
|
|
+ SXADDQ $21, $20, $20
|
|
+
|
|
+ MUL $f30, $f11, $f27
|
|
+ LD $f11, 0($20)
|
|
+ unop
|
|
+ SXADDQ $21, $20, $20
|
|
+
|
|
+ MUL $f30, $f12, $f28
|
|
+ LD $f12, 0($20)
|
|
+ unop
|
|
+ SXADDQ $21, $20, $20
|
|
+
|
|
+ MUL $f30, $f13, $f29
|
|
+ LD $f13, 0($20)
|
|
+ unop
|
|
+ SXADDQ $21, $20, $20
|
|
+
|
|
+ ADD $f18, $f26, $f0
|
|
+ MUL $f30, $f14, $f26 # ctemp1 = da * atemp1
|
|
+ LD $f14, 0($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+
|
|
+ ADD $f19, $f27, $f1
|
|
+ MUL $f30, $f15, $f27
|
|
+ LD $f15, 0($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+
|
|
+ ADD $f20, $f28, $f2
|
|
+ MUL $f30, $f16, $f28
|
|
+ LD $f16, 0($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+
|
|
+ ADD $f21, $f29, $f3
|
|
+ MUL $f30, $f17, $f29
|
|
+ LD $f17, 0($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+
|
|
+ ST $f0, 0($24)
|
|
+ SXADDQ $23, $24, $24
|
|
+ ADD $f22, $f26, $f0
|
|
+ unop
|
|
+
|
|
+ ST $f1, 0($24)
|
|
+ SXADDQ $23, $24, $24
|
|
+ ADD $f23, $f27, $f1
|
|
+ unop
|
|
+
|
|
+ ST $f2, 0($24)
|
|
+ SXADDQ $23, $24, $24
|
|
+ ADD $f24, $f28, $f2
|
|
+ unop
|
|
+
|
|
+ ST $f3, 0($24)
|
|
+ SXADDQ $23, $24, $24
|
|
+ ADD $f25, $f29, $f3
|
|
+ unop
|
|
+
|
|
+ LD $f18, 0($22)
|
|
+ SXADDQ $23, $22, $22
|
|
+ LD $f19, 0($22)
|
|
+ SXADDQ $23, $22, $22
|
|
+
|
|
+ LD $f20, 0($22)
|
|
+ SXADDQ $23, $22, $22
|
|
+ LD $f21, 0($22)
|
|
+ SXADDQ $23, $22, $22
|
|
+
|
|
+ LD $f22, 0($22)
|
|
+ SXADDQ $23, $22, $22
|
|
+ LD $f23, 0($22)
|
|
+ SXADDQ $23, $22, $22
|
|
+
|
|
+ LD $f24, 0($22)
|
|
+ SXADDQ $23, $22, $22
|
|
+ LD $f25, 0($22)
|
|
+ SXADDQ $23, $22, $22
|
|
+
|
|
+ ST $f0, 0($24)
|
|
+ SXADDQ $23, $24, $24
|
|
+ ST $f1, 0($24)
|
|
+ SXADDQ $23, $24, $24
|
|
+ ST $f2, 0($24)
|
|
+ SXADDQ $23, $24, $24
|
|
+ ST $f3, 0($24)
|
|
+ SXADDQ $23, $24, $24
|
|
+
|
|
+ subl $4, 1, $4
|
|
+ bgt $4, $SubLoop
|
|
+ .align 4
|
|
+
|
|
+$SubLoopEnd:
|
|
+ MUL $f30, $f10, $f26 # ctemp1 = da * atemp1
|
|
+ MUL $f30, $f11, $f27
|
|
+ MUL $f30, $f12, $f28
|
|
+ MUL $f30, $f13, $f29
|
|
+
|
|
+ ADD $f18, $f26, $f0
|
|
+ MUL $f30, $f14, $f26 # ctemp1 = da * atemp1
|
|
+ ADD $f19, $f27, $f1
|
|
+ MUL $f30, $f15, $f27
|
|
+
|
|
+ ADD $f20, $f28, $f2
|
|
+ MUL $f30, $f16, $f28
|
|
+ ADD $f21, $f29, $f3
|
|
+ MUL $f30, $f17, $f29
|
|
+
|
|
+ ST $f0, 0($24)
|
|
+ SXADDQ $23, $24, $24
|
|
+ ST $f1, 0($24)
|
|
+ SXADDQ $23, $24, $24
|
|
+
|
|
+ ST $f2, 0($24)
|
|
+ SXADDQ $23, $24, $24
|
|
+ ST $f3, 0($24)
|
|
+ SXADDQ $23, $24, $24
|
|
+
|
|
+ ADD $f22, $f26, $f0
|
|
+ ADD $f23, $f27, $f1
|
|
+ ADD $f24, $f28, $f2
|
|
+ ADD $f25, $f29, $f3
|
|
+
|
|
+ ST $f0, 0($24)
|
|
+ SXADDQ $23, $24, $24
|
|
+ ST $f1, 0($24)
|
|
+ SXADDQ $23, $24, $24
|
|
+
|
|
+ ST $f2, 0($24)
|
|
+ SXADDQ $23, $24, $24
|
|
+ ST $f3, 0($24)
|
|
+ SXADDQ $23, $24, $24
|
|
+ .align 4
|
|
+
|
|
+$SubRemain:
|
|
+ ble $2, $SubEnd
|
|
+ .align 4
|
|
+
|
|
+$SubRemainLoop:
|
|
+ LD $f10, 0($20)
|
|
+ LD $f11, 0($24)
|
|
+ SXADDQ $21, $20, $20
|
|
+
|
|
+ MUL $f30, $f10, $f12
|
|
+ subl $2, 1, $2
|
|
+ ADD $f11, $f12, $f13
|
|
+ ST $f13, 0($24)
|
|
+ SXADDQ $23, $24, $24
|
|
+
|
|
+ bgt $2, $SubRemainLoop
|
|
+ .align 4
|
|
+
|
|
+$SubEnd:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 8($sp)
|
|
+ ldi $sp, 16($sp)
|
|
+ ret
|
|
+ EPILOGUE
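axpy.S implements y := y + alpha * x with an 8-way unrolled unit-stride path, a $Sub path for general strides, and early exits when n <= 0 or alpha == 0 (the ble/fbeq tests in the prologue). A scalar C model of the computation, with illustrative names:

void axpy_ref(long n, double alpha, const double *x, long incx,
              double *y, long incy)
{
    if (n <= 0 || alpha == 0.0)          /* mirrors the ble/fbeq early exits */
        return;
    for (long i = 0; i < n; i++)
        y[i * incy] += alpha * x[i * incx];
}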
diff --git a/kernel/sw_64/axpy_simd.S b/kernel/sw_64/axpy_simd.S
|
|
new file mode 100644
|
|
index 0000000..3a2219c
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/axpy_simd.S
|
|
@@ -0,0 +1,655 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+
|
|
+#define PREFETCHSIZE 80
|
|
+// #define PREFETCH_DISTANCE_BYTES 384
|
|
+
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+ .frame $sp, 16, $26, 0
|
|
+
|
|
+ ldl $24, 0($sp)
|
|
+ fmov $f19, $f30
|
|
+ ldl $23, 8($sp)
|
|
+ ldi $sp, -16($sp)
|
|
+#ifndef PROFILE
|
|
+ .prologue 0
|
|
+#else
|
|
+ .prologue 1
|
|
+#endif
|
|
+
|
|
+ fstd $f2, 0($sp)
|
|
+ cmpeq $21, 1, $3
|
|
+ fstd $f3, 8($sp)
|
|
+ cmpeq $23, 1, $4
|
|
+
|
|
+ ble $16, $End
|
|
+ fbeq $f30, $End
|
|
+ and $3, $4, $3
|
|
+ beq $3, $Sub
|
|
+
|
|
+/**
|
|
+ test whether the address of Y is aligned to the vector width
|
|
+**/
|
|
+ and $24, (VEC_LEN*SIZE-1), $4
|
|
+ nop
|
|
+ nop
|
|
+ beq $4, $Align_Y_Access
|
|
+ .align 4
|
|
+/**
|
|
+ handle the leading unaligned elements of Y
|
|
+**/
|
|
+
|
|
+ sra $16, 4, $1
|
|
+ and $16, 15, $2
|
|
+ sra $4, BASE_SHIFT, $4
|
|
+	ble	$1, $Remain	/* if N is smaller than the unroll size there is no need to fix up the unaligned Y; jump straight to the remainder loop */
|
|
+
|
|
+ ldi $3, VEC_LEN
|
|
+ subl $3, $4, $4
|
|
+
|
|
+$UnAlign_Y_Loop:
|
|
+ LD $f10, 0*SIZE($20)
|
|
+ LD $f11, 0*SIZE($24)
|
|
+ addl $20, SIZE, $20
|
|
+ addl $24, SIZE, $24
|
|
+
|
|
+ MAD $f30, $f10, $f11, $f13
|
|
+ subl $4, 1, $4
|
|
+ subl $16, 1, $16
|
|
+ ST $f13, -1*SIZE($24)
|
|
+ bgt $4, $UnAlign_Y_Loop
|
|
+ .align 4
|
|
+
|
|
+
|
|
+$Align_Y_Access:
|
|
+
|
|
+ nop
|
|
+ sra $16, 4, $1
|
|
+ and $16, 15, $2
|
|
+ ble $1, $Remain
|
|
+
|
|
+/**
|
|
+ test whether the address of X is aligned to the vector width
|
|
+**/
|
|
+
|
|
+ and $20, (VEC_LEN*SIZE-1), $3
|
|
+ nop
|
|
+ nop
|
|
+ bne $3, $UnAlign_X_Access
|
|
+
|
|
+ .align 4
|
|
+$Align_Access:
|
|
+/***
|
|
+ broadcast alpha from $f30 into a 4-wide vector in $f13
+ unroll by 16
|
|
+***/
|
|
+ vcpyf $f30, $f13
|
|
+
|
|
+ VLD $f10, 0*VEC_LEN*SIZE($20)
|
|
+/*
|
|
+ LD $f10, 0*SIZE($20)
|
|
+ LD $f11, 1*SIZE($20)
|
|
+ LD $f12, 2*SIZE($20)
|
|
+ LD $f13, 3*SIZE($20)
|
|
+*/
|
|
+ VLD $f18, 0*VEC_LEN*SIZE($24)
|
|
+/*
|
|
+ LD $f18, 0*SIZE($24)
|
|
+ LD $f19, 1*SIZE($24)
|
|
+ LD $f20, 2*SIZE($24)
|
|
+ LD $f21, 3*SIZE($24)
|
|
+*/
|
|
+ VLD $f14, 1*VEC_LEN*SIZE($20)
|
|
+ VLD $f15, 2*VEC_LEN*SIZE($20)
|
|
+ VLD $f16, 3*VEC_LEN*SIZE($20)
|
|
+/*
|
|
+ LD $f14, 4*SIZE($20)
|
|
+ LD $f15, 5*SIZE($20)
|
|
+ LD $f16, 6*SIZE($20)
|
|
+ LD $f17, 7*SIZE($20)
|
|
+*/
|
|
+ VLD $f22, 1*VEC_LEN*SIZE($24)
|
|
+ VLD $f23, 2*VEC_LEN*SIZE($24)
|
|
+ VLD $f24, 3*VEC_LEN*SIZE($24)
|
|
+/*
|
|
+ LD $f22, 4*SIZE($24)
|
|
+ LD $f23, 5*SIZE($24)
|
|
+ LD $f24, 6*SIZE($24)
|
|
+ LD $f25, 7*SIZE($24)
|
|
+*/
|
|
+
|
|
+ subl $1, 1, $1
|
|
+ addl $20, 16*SIZE, $20
|
|
+ unop
|
|
+ ble $1, $LoopEnd
|
|
+ .align 4
|
|
+
|
|
+$Loop:
|
|
+
|
|
+ fillcs PREFETCHSIZE * SIZE($24)
|
|
+ fillcs PREFETCHSIZE * SIZE($20)
|
|
+/*
|
|
+ fillcs PREFETCH_DISTANCE_BYTES($24)
|
|
+ fillcs PREFETCH_DISTANCE_BYTES($20)
|
|
+*/
|
|
+
|
|
+ VMAD $f13, $f10, $f18, $f0
|
|
+ VLD $f10, 0*VEC_LEN*SIZE($20)
|
|
+ VLD $f18, 4*VEC_LEN*SIZE($24)
|
|
+/*
|
|
+ MAD $f30, $f10, $f18, $f0 # y += alpha * x
|
|
+ LD $f10, 0*SIZE($20)
|
|
+ MAD $f30, $f11, $f19, $f1
|
|
+ LD $f11, 1*SIZE($20)
|
|
+
|
|
+ MAD $f30, $f12, $f20, $f2
|
|
+ LD $f12, 2*SIZE($20)
|
|
+ MAD $f30, $f13, $f21, $f3
|
|
+ LD $f13, 3*SIZE($20)
|
|
+*/
|
|
+
|
|
+ VMAD $f13, $f14, $f22, $f26
|
|
+ VLD $f14, 1*VEC_LEN*SIZE($20)
|
|
+ VLD $f22, 5*VEC_LEN*SIZE($24)
|
|
+
|
|
+ VMAD $f13, $f15, $f23, $f27
|
|
+ VLD $f15, 2*VEC_LEN*SIZE($20)
|
|
+ VLD $f23, 6*VEC_LEN*SIZE($24)
|
|
+
|
|
+ VMAD $f13, $f16, $f24, $f28
|
|
+ VLD $f16, 3*VEC_LEN*SIZE($20)
|
|
+ VLD $f24, 7*VEC_LEN*SIZE($24)
|
|
+/*
|
|
+ MAD $f30, $f14, $f22, $f26 # y += alpha * x
|
|
+ LD $f14, 4*SIZE($20)
|
|
+ MAD $f30, $f15, $f23, $f27
|
|
+ LD $f15, 5*SIZE($20)
|
|
+
|
|
+ MAD $f30, $f16, $f24, $f28
|
|
+ LD $f16, 6*SIZE($20)
|
|
+ MAD $f30, $f17, $f25, $f29
|
|
+ LD $f17, 7*SIZE($20)
|
|
+*/
|
|
+
|
|
+/*
|
|
+ LD $f18, 8*SIZE($24)
|
|
+ LD $f19, 9*SIZE($24)
|
|
+ LD $f20, 10*SIZE($24)
|
|
+ LD $f21, 11*SIZE($24)
|
|
+
|
|
+ LD $f22, 12*SIZE($24)
|
|
+ LD $f23, 13*SIZE($24)
|
|
+ LD $f24, 14*SIZE($24)
|
|
+ LD $f25, 15*SIZE($24)
|
|
+*/
|
|
+
|
|
+
|
|
+
|
|
+ VST $f0, 0*VEC_LEN*SIZE($24)
|
|
+ VST $f26, 1*VEC_LEN*SIZE($24)
|
|
+ VST $f27, 2*VEC_LEN*SIZE($24)
|
|
+ VST $f28, 3*VEC_LEN*SIZE($24)
|
|
+/*
|
|
+ ST $f0, 0*SIZE($24)
|
|
+ ST $f1, 1*SIZE($24)
|
|
+ ST $f2, 2*SIZE($24)
|
|
+ ST $f3, 3*SIZE($24)
|
|
+
|
|
+ ST $f26, 4*SIZE($24)
|
|
+ ST $f27, 5*SIZE($24)
|
|
+ ST $f28, 6*SIZE($24)
|
|
+ ST $f29, 7*SIZE($24)
|
|
+*/
|
|
+ subl $1, 1, $1
|
|
+ addl $24, 16*SIZE, $24
|
|
+ addl $20, 16*SIZE, $20
|
|
+ bgt $1, $Loop
|
|
+ .align 4
|
|
+
|
|
+$LoopEnd:
|
|
+ VMAD $f13, $f10, $f18, $f0
|
|
+ VST $f0, 0*VEC_LEN*SIZE($24)
|
|
+ VMAD $f13, $f14, $f22, $f26
|
|
+ VST $f26, 1*VEC_LEN*SIZE($24)
|
|
+ VMAD $f13, $f15, $f23, $f27
|
|
+ VST $f27, 2*VEC_LEN*SIZE($24)
|
|
+ VMAD $f13, $f16, $f24, $f28
|
|
+ VST $f28, 3*VEC_LEN*SIZE($24)
|
|
+
|
|
+/*
|
|
+ MUL $f30, $f10, $f26 # ctemp1 = da * atemp1
|
|
+ MUL $f30, $f11, $f27
|
|
+ MUL $f30, $f12, $f28
|
|
+ MUL $f30, $f13, $f29
|
|
+
|
|
+ ADD $f18, $f26, $f0
|
|
+ MUL $f30, $f14, $f26 # ctemp1 = da * atemp1
|
|
+ ADD $f19, $f27, $f1
|
|
+ MUL $f30, $f15, $f27
|
|
+
|
|
+ ADD $f20, $f28, $f2
|
|
+ MUL $f30, $f16, $f28
|
|
+ ADD $f21, $f29, $f3
|
|
+ MUL $f30, $f17, $f29
|
|
+
|
|
+ ST $f0, 0*SIZE($24)
|
|
+ ADD $f22, $f26, $f0
|
|
+ ST $f1, 1*SIZE($24)
|
|
+ ADD $f23, $f27, $f1
|
|
+
|
|
+ ST $f2, 2*SIZE($24)
|
|
+ ADD $f24, $f28, $f2
|
|
+ ST $f3, 3*SIZE($24)
|
|
+ ADD $f25, $f29, $f3
|
|
+
|
|
+ ST $f0, 4*SIZE($24)
|
|
+ ST $f1, 5*SIZE($24)
|
|
+ ST $f2, 6*SIZE($24)
|
|
+ ST $f3, 7*SIZE($24)
|
|
+*/
|
|
+ addl $24, 16*SIZE, $24
|
|
+
|
|
+ .align 4
|
|
+
|
|
+$Remain:
|
|
+ ble $2, $End
|
|
+
|
|
+ .align 4
|
|
+
|
|
+$RemainLoop:
|
|
+ LD $f10, 0*SIZE($20)
|
|
+ LD $f11, 0*SIZE($24)
|
|
+ addl $20, SIZE, $20
|
|
+ addl $24, SIZE, $24
|
|
+
|
|
+ MAD $f30, $f10, $f11, $f13
|
|
+ subl $2, 1, $2
|
|
+ ST $f13, -1*SIZE($24)
|
|
+ bgt $2, $RemainLoop
|
|
+ .align 4
|
|
+
|
|
+$End:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 8($sp)
|
|
+ ldi $sp, 16($sp)
|
|
+ ret
|
|
+ .align 4
|
|
+
|
|
+$UnAlign_X_Access:
|
|
+/***
|
|
+ broadcast alpha from $f30 into a 4-wide vector in $f13
+ unroll by 16
+ X is accessed unaligned (merged via VLD_UL/VLD_UH)
+ Y is accessed aligned
|
|
+***/
|
|
+ vcpyf $f30, $f13
|
|
+ VLD_UL $f10, 0*VEC_LEN*SIZE($20)
|
|
+ VLD_UH $f2, 1*VEC_LEN*SIZE($20)
|
|
+
|
|
+ VLD_UL $f14, 1*VEC_LEN*SIZE($20)
|
|
+ VLD_UH $f3, 2*VEC_LEN*SIZE($20)
|
|
+
|
|
+ VLD_UL $f15, 2*VEC_LEN*SIZE($20)
|
|
+ VLD_UH $f11, 3*VEC_LEN*SIZE($20)
|
|
+
|
|
+ VLD_UL $f16, 3*VEC_LEN*SIZE($20)
|
|
+ VLD_UH $f12, 4*VEC_LEN*SIZE($20)
|
|
+
|
|
+ VLD $f18, 0*VEC_LEN*SIZE($24)
|
|
+ VLD $f22, 1*VEC_LEN*SIZE($24)
|
|
+ VLD $f23, 2*VEC_LEN*SIZE($24)
|
|
+ VLD $f24, 3*VEC_LEN*SIZE($24)
|
|
+
|
|
+ vbisw $f10, $f2, $f10
|
|
+ vbisw $f14, $f3, $f14
|
|
+ vbisw $f15, $f11, $f15
|
|
+ vbisw $f16, $f12, $f16
|
|
+
|
|
+
|
|
+ subl $1, 1, $1
|
|
+ addl $20, 16*SIZE, $20
|
|
+ unop
|
|
+ ble $1, $UnAlign_X_LoopEnd
|
|
+ .align 4
|
|
+
|
|
+$UnAlign_X_Loop:
|
|
+
|
|
+ fillcs PREFETCHSIZE * SIZE($24)
|
|
+ fillcs PREFETCHSIZE * SIZE($20)
|
|
+
|
|
+ VMAD $f13, $f10, $f18, $f0
|
|
+ VLD_UL $f10, 0*VEC_LEN*SIZE($20)
|
|
+ VLD_UH $f2, 1*VEC_LEN*SIZE($20)
|
|
+
|
|
+
|
|
+ VMAD $f13, $f14, $f22, $f26
|
|
+ VLD_UL $f14, 1*VEC_LEN*SIZE($20)
|
|
+ VLD_UH $f3, 2*VEC_LEN*SIZE($20)
|
|
+
|
|
+ VMAD $f13, $f15, $f23, $f27
|
|
+ VLD_UL $f15, 2*VEC_LEN*SIZE($20)
|
|
+ VLD_UH $f11, 3*VEC_LEN*SIZE($20)
|
|
+
|
|
+ VMAD $f13, $f16, $f24, $f28
|
|
+ VLD_UL $f16, 3*VEC_LEN*SIZE($20)
|
|
+ VLD_UH $f12, 4*VEC_LEN*SIZE($20)
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+ VLD $f18, 4*VEC_LEN*SIZE($24)
|
|
+ vbisw $f10, $f2, $f10
|
|
+ VLD $f22, 5*VEC_LEN*SIZE($24)
|
|
+ vbisw $f14, $f3, $f14
|
|
+ VLD $f23, 6*VEC_LEN*SIZE($24)
|
|
+ vbisw $f15, $f11, $f15
|
|
+ VLD $f24, 7*VEC_LEN*SIZE($24)
|
|
+ vbisw $f16, $f12, $f16
|
|
+
|
|
+
|
|
+ VST $f0, 0*VEC_LEN*SIZE($24)
|
|
+ VST $f26, 1*VEC_LEN*SIZE($24)
|
|
+ VST $f27, 2*VEC_LEN*SIZE($24)
|
|
+ VST $f28, 3*VEC_LEN*SIZE($24)
|
|
+
|
|
+
|
|
+ subl $1, 1, $1
|
|
+ addl $24, 16*SIZE, $24
|
|
+ addl $20, 16*SIZE, $20
|
|
+ bgt $1, $UnAlign_X_Loop
|
|
+ .align 4
|
|
+
|
|
+$UnAlign_X_LoopEnd:
|
|
+ VMAD $f13, $f10, $f18, $f0
|
|
+ VST $f0, 0*VEC_LEN*SIZE($24)
|
|
+ VMAD $f13, $f14, $f22, $f26
|
|
+ VST $f26, 1*VEC_LEN*SIZE($24)
|
|
+ VMAD $f13, $f15, $f23, $f27
|
|
+ VST $f27, 2*VEC_LEN*SIZE($24)
|
|
+ VMAD $f13, $f16, $f24, $f28
|
|
+ VST $f28, 3*VEC_LEN*SIZE($24)
|
|
+
|
|
+ addl $24, 16*SIZE, $24
|
|
+
|
|
+ .align 4
|
|
+
|
|
+$UnAlign_X_Remain:
|
|
+ ble $2, $UnAlign_X_End
|
|
+
|
|
+ .align 4
|
|
+
|
|
+$UnAlign_X_RemainLoop:
|
|
+ LD $f10, 0*SIZE($20)
|
|
+ LD $f11, 0*SIZE($24)
|
|
+ addl $20, SIZE, $20
|
|
+ addl $24, SIZE, $24
|
|
+
|
|
+ MAD $f30, $f10, $f11, $f13
|
|
+ subl $2, 1, $2
|
|
+ ST $f13, -1*SIZE($24)
|
|
+ bgt $2, $UnAlign_X_RemainLoop
|
|
+ .align 4
|
|
+
|
|
+$UnAlign_X_End:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 8($sp)
|
|
+ ldi $sp, 16($sp)
|
|
+ ret
|
|
+ .align 4
|
|
+
|
|
+
|
|
+$Sub:
|
|
+ sra $16, 3, $1
|
|
+ and $16, 7, $2
|
|
+ SXSUBL $16, SIZE, $22
|
|
+ subl $1, 1, $4
|
|
+
|
|
+ ble $1, $SubRemain
|
|
+ .align 4
|
|
+
|
|
+ LD $f10, 0($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+
|
|
+ LD $f11, 0($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+ LD $f12, 0($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+
|
|
+ LD $f13, 0($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+ LD $f18, 0($24)
|
|
+ SXADDQ $23, $24, $22
|
|
+
|
|
+ LD $f19, 0($22)
|
|
+ SXADDQ $23, $22, $22
|
|
+ LD $f20, 0($22)
|
|
+ SXADDQ $23, $22, $22
|
|
+
|
|
+ LD $f21, 0($22)
|
|
+ SXADDQ $23, $22, $22
|
|
+ LD $f14, 0($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+
|
|
+ LD $f15, 0($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+ LD $f16, 0($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+
|
|
+ LD $f17, 0($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+ LD $f22, 0($22)
|
|
+ SXADDQ $23, $22, $22
|
|
+
|
|
+ LD $f23, 0($22)
|
|
+ SXADDQ $23, $22, $22
|
|
+ LD $f24, 0($22)
|
|
+ SXADDQ $23, $22, $22
|
|
+
|
|
+ LD $f25, 0($22)
|
|
+ SXADDQ $23, $22, $22
|
|
+ unop
|
|
+ ble $4, $SubLoopEnd
|
|
+ .align 4
|
|
+
|
|
+$SubLoop:
|
|
+ MUL $f30, $f10, $f26 # ctemp1 = da * atemp1
|
|
+ LD $f10, 0($20)
|
|
+ unop
|
|
+ SXADDQ $21, $20, $20
|
|
+
|
|
+ MUL $f30, $f11, $f27
|
|
+ LD $f11, 0($20)
|
|
+ unop
|
|
+ SXADDQ $21, $20, $20
|
|
+
|
|
+ MUL $f30, $f12, $f28
|
|
+ LD $f12, 0($20)
|
|
+ unop
|
|
+ SXADDQ $21, $20, $20
|
|
+
|
|
+ MUL $f30, $f13, $f29
|
|
+ LD $f13, 0($20)
|
|
+ unop
|
|
+ SXADDQ $21, $20, $20
|
|
+
|
|
+ ADD $f18, $f26, $f0
|
|
+ MUL $f30, $f14, $f26 # ctemp1 = da * atemp1
|
|
+ LD $f14, 0($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+
|
|
+ ADD $f19, $f27, $f1
|
|
+ MUL $f30, $f15, $f27
|
|
+ LD $f15, 0($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+
|
|
+ ADD $f20, $f28, $f2
|
|
+ MUL $f30, $f16, $f28
|
|
+ LD $f16, 0($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+
|
|
+ ADD $f21, $f29, $f3
|
|
+ MUL $f30, $f17, $f29
|
|
+ LD $f17, 0($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+
|
|
+ ST $f0, 0($24)
|
|
+ SXADDQ $23, $24, $24
|
|
+ ADD $f22, $f26, $f0
|
|
+ unop
|
|
+
|
|
+ ST $f1, 0($24)
|
|
+ SXADDQ $23, $24, $24
|
|
+ ADD $f23, $f27, $f1
|
|
+ unop
|
|
+
|
|
+ ST $f2, 0($24)
|
|
+ SXADDQ $23, $24, $24
|
|
+ ADD $f24, $f28, $f2
|
|
+ unop
|
|
+
|
|
+ ST $f3, 0($24)
|
|
+ SXADDQ $23, $24, $24
|
|
+ ADD $f25, $f29, $f3
|
|
+ unop
|
|
+
|
|
+ LD $f18, 0($22)
|
|
+ SXADDQ $23, $22, $22
|
|
+ LD $f19, 0($22)
|
|
+ SXADDQ $23, $22, $22
|
|
+
|
|
+ LD $f20, 0($22)
|
|
+ SXADDQ $23, $22, $22
|
|
+ LD $f21, 0($22)
|
|
+ SXADDQ $23, $22, $22
|
|
+
|
|
+ LD $f22, 0($22)
|
|
+ SXADDQ $23, $22, $22
|
|
+ LD $f23, 0($22)
|
|
+ SXADDQ $23, $22, $22
|
|
+
|
|
+ LD $f24, 0($22)
|
|
+ SXADDQ $23, $22, $22
|
|
+ LD $f25, 0($22)
|
|
+ SXADDQ $23, $22, $22
|
|
+
|
|
+ ST $f0, 0($24)
|
|
+ SXADDQ $23, $24, $24
|
|
+ ST $f1, 0($24)
|
|
+ SXADDQ $23, $24, $24
|
|
+ ST $f2, 0($24)
|
|
+ SXADDQ $23, $24, $24
|
|
+ ST $f3, 0($24)
|
|
+ SXADDQ $23, $24, $24
|
|
+
|
|
+ subl $4, 1, $4
|
|
+ bgt $4, $SubLoop
|
|
+ .align 4
|
|
+
|
|
+$SubLoopEnd:
|
|
+ MUL $f30, $f10, $f26 # ctemp1 = da * atemp1
|
|
+ MUL $f30, $f11, $f27
|
|
+ MUL $f30, $f12, $f28
|
|
+ MUL $f30, $f13, $f29
|
|
+
|
|
+ ADD $f18, $f26, $f0
|
|
+ MUL $f30, $f14, $f26 # ctemp1 = da * atemp1
|
|
+ ADD $f19, $f27, $f1
|
|
+ MUL $f30, $f15, $f27
|
|
+
|
|
+ ADD $f20, $f28, $f2
|
|
+ MUL $f30, $f16, $f28
|
|
+ ADD $f21, $f29, $f3
|
|
+ MUL $f30, $f17, $f29
|
|
+
|
|
+ ST $f0, 0($24)
|
|
+ SXADDQ $23, $24, $24
|
|
+ ST $f1, 0($24)
|
|
+ SXADDQ $23, $24, $24
|
|
+
|
|
+ ST $f2, 0($24)
|
|
+ SXADDQ $23, $24, $24
|
|
+ ST $f3, 0($24)
|
|
+ SXADDQ $23, $24, $24
|
|
+
|
|
+ ADD $f22, $f26, $f0
|
|
+ ADD $f23, $f27, $f1
|
|
+ ADD $f24, $f28, $f2
|
|
+ ADD $f25, $f29, $f3
|
|
+
|
|
+ ST $f0, 0($24)
|
|
+ SXADDQ $23, $24, $24
|
|
+ ST $f1, 0($24)
|
|
+ SXADDQ $23, $24, $24
|
|
+
|
|
+ ST $f2, 0($24)
|
|
+ SXADDQ $23, $24, $24
|
|
+ ST $f3, 0($24)
|
|
+ SXADDQ $23, $24, $24
|
|
+ .align 4
|
|
+
|
|
+$SubRemain:
|
|
+ ble $2, $SubEnd
|
|
+ .align 4
|
|
+
|
|
+$SubRemainLoop:
|
|
+ LD $f10, 0($20)
|
|
+ LD $f11, 0($24)
|
|
+ SXADDQ $21, $20, $20
|
|
+
|
|
+ MUL $f30, $f10, $f12
|
|
+ subl $2, 1, $2
|
|
+ ADD $f11, $f12, $f13
|
|
+ ST $f13, 0($24)
|
|
+ SXADDQ $23, $24, $24
|
|
+
|
|
+ bgt $2, $SubRemainLoop
|
|
+ .align 4
|
|
+
|
|
+$SubEnd:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 8($sp)
|
|
+ ldi $sp, 16($sp)
|
|
+ ret
|
|
+ EPILOGUE
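axpy_simd.S vectorizes the same operation. When both strides are 1 it first peels scalar iterations until Y reaches a vector boundary, then runs a 16-element-per-pass body built from VMAD/VST (using the VLD_UL/VLD_UH pair to merge unaligned loads of X when needed), and finishes the tail with the scalar MAD loop. A C sketch of that alignment-peeling structure, assuming 4-double (32-byte) vectors and illustrative names:

#include <stdint.h>

void axpy_peel_sketch(long n, double alpha, const double *x, double *y)
{
    long i = 0;
    /* peel until y is aligned to the 32-byte vector width */
    while (i < n && ((uintptr_t)(y + i) & 31) != 0) {
        y[i] += alpha * x[i];
        i++;
    }
    long body = (n - i) & ~15L;           /* whole 16-element passes */
    for (long j = 0; j < body; j++)       /* the VMAD/VST vector body */
        y[i + j] += alpha * x[i + j];
    for (i += body; i < n; i++)           /* scalar remainder */
        y[i] += alpha * x[i];
}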
diff --git a/kernel/sw_64/cabs.S b/kernel/sw_64/cabs.S
|
|
new file mode 100644
|
|
index 0000000..3f9ed2c
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/cabs.S
|
|
@@ -0,0 +1,72 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+ .set noat
|
|
+ .set noreorder
|
|
+.text
|
|
+ .align 5
|
|
+ .globl NAME
|
|
+ .ent NAME
|
|
+NAME:
|
|
+ .frame $sp, 0, $26, 0
|
|
+
|
|
+#ifdef PROFILE
|
|
+ ldgp $gp, 0($27)
|
|
+ ldi $28, _mcount
|
|
+ jsr $28, ($28), _mcount
|
|
+#endif
|
|
+
|
|
+ LD $f10, 0($16)
|
|
+ LD $f11, SIZE($16)
|
|
+#ifndef PROFILE
|
|
+ .prologue 0
|
|
+#else
|
|
+ .prologue 1
|
|
+#endif
|
|
+
|
|
+ fabs $f10, $f12
|
|
+ fabs $f11, $f0
|
|
+ ADD $f12, $f0, $f29
|
|
+ fmov $f29, $f0
|
|
+ ret
|
|
+ .end NAME
|
|
+ .ident VERSION
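cabs.S returns |Re(z)| + |Im(z)|, the complex 1-norm rather than the Euclidean modulus: it loads the two parts, takes fabs of each and adds them. Equivalent C, with an illustrative name and double precision assumed:

#include <math.h>

double cabs1_ref(const double z[2])   /* z[0] = real part, z[1] = imaginary part */
{
    return fabs(z[0]) + fabs(z[1]);
}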
diff --git a/kernel/sw_64/cabs.S.bak b/kernel/sw_64/cabs.S.bak
|
|
new file mode 100644
|
|
index 0000000..5fa27af
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/cabs.S.bak
|
|
@@ -0,0 +1,71 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+ .set noat
|
|
+ .set noreorder
|
|
+.text
|
|
+ .align 5
|
|
+ .globl NAME
|
|
+ .ent NAME
|
|
+NAME:
|
|
+ .frame $sp, 0, $26, 0
|
|
+
|
|
+#ifdef PROFILE
|
|
+ ldgp $gp, 0($27)
|
|
+ lda $28, _mcount
|
|
+ jsr $28, ($28), _mcount
|
|
+#endif
|
|
+
|
|
+ LD $f10, 0($16)
|
|
+ LD $f11, SIZE($16)
|
|
+#ifndef PROFILE
|
|
+ .prologue 0
|
|
+#else
|
|
+ .prologue 1
|
|
+#endif
|
|
+
|
|
+ fabs $f10, $f12
|
|
+ fabs $f11, $f0
|
|
+ ADD $f12, $f0, $f0
|
|
+ ret
|
|
+ .end NAME
|
|
+ .ident VERSION
|
|
diff --git a/kernel/sw_64/cnrm2.S b/kernel/sw_64/cnrm2.S
|
|
new file mode 100644
|
|
index 0000000..25eab03
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/cnrm2.S
|
|
@@ -0,0 +1,440 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCH_SIZE 80
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+#define XX $19
|
|
+
|
|
+#define I $0
|
|
+
|
|
+#define a0 $f0
|
|
+#define a1 $f1
|
|
+#define a2 $f10
|
|
+#define a3 $f11
|
|
+#define t0 $f12
|
|
+#define t1 $f13
|
|
+#define t2 $f14
|
|
+#define t3 $f15
|
|
+
|
|
+#define x0 $f16
|
|
+#define x1 $f17
|
|
+#define x2 $f18
|
|
+#define x3 $f19
|
|
+#define x4 $f20
|
|
+#define x5 $f21
|
|
+#define x6 $f22
|
|
+#define x7 $f23
|
|
+
|
|
+ PROLOGUE
|
|
+
|
|
+#if defined(EV4) || defined(EV5)
|
|
+ .frame $30,16,$26,0
|
|
+ .mask 0x4000000,-16
|
|
+ ldih $29, 0($27) !gpdisp!1
|
|
+ ldi $29, 0($29) !gpdisp!1
|
|
+
|
|
+ ldi $sp, -16($sp)
|
|
+ ldl $27, sqrt($29) !literal!2
|
|
+ stl $26, 0($sp)
|
|
+
|
|
+ PROFCODE
|
|
+ .prologue 1
|
|
+#else
|
|
+ PROFCODE
|
|
+#endif
|
|
+
|
|
+ fclr a0
|
|
+ sll INCX, ZBASE_SHIFT, INCX
|
|
+ fclr a1
|
|
+ ble N, $L999
|
|
+
|
|
+ fclr a2
|
|
+ cmpeq INCX, 2 * SIZE, $0
|
|
+ fclr a3
|
|
+ beq $0, $L20
|
|
+
|
|
+ fclr t0
|
|
+ sra N, 3, I
|
|
+ fclr t1
|
|
+ ble I, $L15
|
|
+
|
|
+ fclr t2
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ fclr t3
|
|
+ LD x1, 1 * SIZE(X)
|
|
+
|
|
+ LD x2, 2 * SIZE(X)
|
|
+ LD x3, 3 * SIZE(X)
|
|
+ LD x4, 4 * SIZE(X)
|
|
+ LD x5, 5 * SIZE(X)
|
|
+ LD x6, 6 * SIZE(X)
|
|
+ LD x7, 7 * SIZE(X)
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L11:
|
|
+ faddd a0, t0, $f25
|
|
+ fillcs (PREFETCH_SIZE) * SIZE(X)
|
|
+ fmuld x0, x0, t0
|
|
+ LD x0, 8 * SIZE(X)
|
|
+
|
|
+ faddd a1, t1, $f26
|
|
+ mov X, XX
|
|
+ fmuld x1, x1, t1
|
|
+ LD x1, 9 * SIZE(X)
|
|
+
|
|
+ faddd a2, t2, $f27
|
|
+ unop
|
|
+ fmuld x2, x2, t2
|
|
+ LD x2, 10 * SIZE(X)
|
|
+
|
|
+ faddd a3, t3, $f28
|
|
+ unop
|
|
+ fmuld x3, x3, t3
|
|
+ LD x3, 11 * SIZE(X)
|
|
+
|
|
+ faddd $f25, t0, a0
|
|
+ unop
|
|
+ fmuld x4, x4, t0
|
|
+ LD x4, 12 * SIZE(X)
|
|
+
|
|
+ faddd $f26, t1, a1
|
|
+ unop
|
|
+ fmuld x5, x5, t1
|
|
+ LD x5, 13 * SIZE(X)
|
|
+
|
|
+ faddd $f27, t2, a2
|
|
+ unop
|
|
+ fmuld x6, x6, t2
|
|
+ LD x6, 14 * SIZE(X)
|
|
+
|
|
+ faddd $f28, t3, a3
|
|
+ unop
|
|
+ fmuld x7, x7, t3
|
|
+ LD x7, 15 * SIZE(X)
|
|
+
|
|
+ faddd a0, t0, $f25
|
|
+ unop
|
|
+ fmuld x0, x0, t0
|
|
+ LD x0, 16 * SIZE(X)
|
|
+
|
|
+ faddd a1, t1, $f26
|
|
+ ldi X, 16 * SIZE(X)
|
|
+ fmuld x1, x1, t1
|
|
+ LD x1, 17 * SIZE(XX)
|
|
+
|
|
+ faddd a2, t2, $f27
|
|
+ unop
|
|
+ fmuld x2, x2, t2
|
|
+ LD x2, 18 * SIZE(XX)
|
|
+
|
|
+ faddd a3, t3, $f28
|
|
+ unop
|
|
+ fmuld x3, x3, t3
|
|
+ LD x3, 19 * SIZE(XX)
|
|
+
|
|
+ faddd $f25, t0, a0
|
|
+ unop
|
|
+ fmuld x4, x4, t0
|
|
+ LD x4, 20 * SIZE(XX)
|
|
+
|
|
+ faddd $f26, t1, a1
|
|
+ ldi I, -1(I)
|
|
+ fmuld x5, x5, t1
|
|
+ LD x5, 21 * SIZE(XX)
|
|
+
|
|
+ faddd $f27, t2, a2
|
|
+ unop
|
|
+ fmuld x6, x6, t2
|
|
+ LD x6, 22 * SIZE(XX)
|
|
+
|
|
+ faddd $f28, t3, a3
|
|
+ fmuld x7, x7, t3
|
|
+ LD x7, 23 * SIZE(XX)
|
|
+ bgt I, $L11
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ faddd a0, t0, $f25
|
|
+ mov X, XX
|
|
+ fmuld x0, x0, t0
|
|
+ LD x0, 8 * SIZE(X)
|
|
+
|
|
+ faddd a1, t1, $f26
|
|
+ unop
|
|
+ fmuld x1, x1, t1
|
|
+ LD x1, 9 * SIZE(X)
|
|
+
|
|
+ faddd a2, t2, $f27
|
|
+ unop
|
|
+ fmuld x2, x2, t2
|
|
+ LD x2, 10 * SIZE(X)
|
|
+
|
|
+ faddd a3, t3, $f28
|
|
+ unop
|
|
+ fmuld x3, x3, t3
|
|
+ LD x3, 11 * SIZE(X)
|
|
+
|
|
+ faddd $f25, t0, a0
|
|
+ unop
|
|
+ fmuld x4, x4, t0
|
|
+ LD x4, 12 * SIZE(XX)
|
|
+
|
|
+ faddd $f26, t1, a1
|
|
+ unop
|
|
+ fmuld x5, x5, t1
|
|
+ LD x5, 13 * SIZE(XX)
|
|
+
|
|
+ faddd $f27, t2, a2
|
|
+ unop
|
|
+ fmuld x6, x6, t2
|
|
+ LD x6, 14 * SIZE(XX)
|
|
+
|
|
+ faddd $f28, t3, a3
|
|
+ ldi X, 16 * SIZE(X)
|
|
+ fmuld x7, x7, t3
|
|
+ LD x7, 15 * SIZE(XX)
|
|
+
|
|
+ faddd a0, t0, $f25
|
|
+ fmuld x0, x0, t0
|
|
+ faddd a1, t1, $f26
|
|
+ fmuld x1, x1, t1
|
|
+
|
|
+ faddd a2, t2, $f27
|
|
+ fmuld x2, x2, t2
|
|
+ faddd a3, t3, $f28
|
|
+ fmuld x3, x3, t3
|
|
+
|
|
+ faddd $f25, t0, a0
|
|
+ fmuld x4, x4, t0
|
|
+ faddd $f26, t1, a1
|
|
+ fmuld x5, x5, t1
|
|
+
|
|
+ faddd $f27, t2, a2
|
|
+ fmuld x6, x6, t2
|
|
+	faddd	$f28, t3, a3
|
|
+ fmuld x7, x7, t3
|
|
+
|
|
+ faddd a2, t2, $f27
|
|
+ fmov $f27, a2
|
|
+ faddd a3, t3, $f28
|
|
+ fmov $f28, a3
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and N, 7, I
|
|
+ ble I, $L998
|
|
+ .align 4
|
|
+
|
|
+$L16:
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ LD x1, 1 * SIZE(X)
|
|
+
|
|
+ ldi X, 2 * SIZE(X)
|
|
+
|
|
+ faddd a0, t0, $f25
|
|
+ fmov $f25, a0
|
|
+ fmuld x0, x0, t0
|
|
+ faddd a1, t1, $f26
|
|
+ fmov $f26, a1
|
|
+ fmuld x1, x1, t1
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L16
|
|
+ bsr $31, $L998
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ fclr t0
|
|
+ sra N, 2, I
|
|
+ fclr t1
|
|
+ ble I, $L25
|
|
+
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ fclr t2
|
|
+ LD x1, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD x2, 0 * SIZE(X)
|
|
+ fclr t3
|
|
+ LD x3, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD x4, 0 * SIZE(X)
|
|
+ ldi I, -1(I)
|
|
+ LD x5, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD x6, 0 * SIZE(X)
|
|
+ ble I, $L22
|
|
+ .align 4
|
|
+
|
|
+$L21:
|
|
+ faddd a0, t0, $f25
|
|
+ LD x7, 1 * SIZE(X)
|
|
+ fmuld x0, x0, t0
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a1, t1, $f26
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ fmuld x1, x1, t1
|
|
+ unop
|
|
+
|
|
+ faddd a2, t2, $f27
|
|
+ LD x1, 1 * SIZE(X)
|
|
+ fmuld x2, x2, t2
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a3, t3, $f28
|
|
+ LD x2, 0 * SIZE(X)
|
|
+ fmuld x3, x3, t3
|
|
+ unop
|
|
+
|
|
+ faddd $f25, t0, a0
|
|
+ LD x3, 1 * SIZE(X)
|
|
+ fmuld x4, x4, t0
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd $f26, t1, a1
|
|
+ LD x4, 0 * SIZE(X)
|
|
+ fmuld x5, x5, t1
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ faddd $f27, t2, a2
|
|
+ LD x5, 1 * SIZE(X)
|
|
+ fmuld x6, x6, t2
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd $f28, t3, a3
|
|
+ LD x6, 0 * SIZE(X)
|
|
+ fmuld x7, x7, t3
|
|
+ bgt I, $L21
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ faddd a0, t0, $f25
|
|
+ LD x7, 1 * SIZE(X)
|
|
+ fmuld x0, x0, t0
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a1, t1, $f26
|
|
+ fmuld x1, x1, t1
|
|
+ faddd a2, t2, $f27
|
|
+ fmuld x2, x2, t2
|
|
+
|
|
+ faddd a3, t3, $f28
|
|
+ fmuld x3, x3, t3
|
|
+ faddd $f25, t0, a0
|
|
+ fmuld x4, x4, t0
|
|
+
|
|
+ faddd $f26, t1, a1
|
|
+ fmuld x5, x5, t1
|
|
+ faddd $f27, t2, a2
|
|
+ fmuld x6, x6, t2
|
|
+
|
|
+ faddd $f28, t3, a3
|
|
+ fmuld x7, x7, t3
|
|
+ faddd a2, t2, $f27
|
|
+ fmov $f27, a2
|
|
+ faddd a3, t3, $f28
|
|
+ fmov $f28, a3
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ and N, 3, I
|
|
+ ble I, $L998
|
|
+ .align 4
|
|
+
|
|
+$L26:
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ ldi I, -1(I)
|
|
+ LD x1, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a0, t0, $f25
|
|
+ fmov $f25, a0
|
|
+ fmuld x0, x0, t0
|
|
+ faddd a1, t1, $f26
|
|
+ fmov $f26, a1
|
|
+ fmuld x1, x1, t1
|
|
+
|
|
+ bgt I, $L26
|
|
+ .align 4
|
|
+
|
|
+
|
|
+$L998:
|
|
+ faddd a0, t0, $f25
|
|
+ fmov $f25, a0
|
|
+ faddd a1, t1, $f26
|
|
+ fmov $f26, a1
|
|
+
|
|
+ faddd a0, a1, $f25
|
|
+ fmov $f25, a0
|
|
+ faddd a2, a3, $f26
|
|
+ fmov $f26, a2
|
|
+
|
|
+#if defined(EV4) || defined(EV5)
|
|
+ faddd a0, a2, $f16
|
|
+ jsr $26, ($27), sqrt !lituse_jsr!2
|
|
+
|
|
+ ldih $29, 0($26) !gpdisp!3
|
|
+ ldi $29, 0($29) !gpdisp!3
|
|
+#else
|
|
+ faddd a0, a2, $f25
|
|
+ fmov $f25, a0
|
|
+ fsqrtd a0, $f25
|
|
+ fmov $f25, a0
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+#if defined(EV4) || defined(EV5)
|
|
+ ldl $26, 0($sp)
|
|
+ ldi $sp, 16($sp)
|
|
+#endif
|
|
+ ret
|
|
+ EPILOGUE
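cnrm2.S keeps four partial sums of squares over the interleaved (re, im) pairs, folds them, and takes one square root at the end; the inputs are squared directly, with no intermediate scaling. A scalar C model (types and names illustrative; incx is counted in complex elements, matching the ZBASE_SHIFT scaling in the prologue):

#include <math.h>

double cnrm2_ref(long n, const double *x, long incx)
{
    double s = 0.0;
    for (long i = 0; i < n; i++) {
        double re = x[2 * i * incx];
        double im = x[2 * i * incx + 1];
        s += re * re + im * im;
    }
    return sqrt(s);
}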
diff --git a/kernel/sw_64/cnrm2.S.bak b/kernel/sw_64/cnrm2.S.bak
|
|
new file mode 100644
|
|
index 0000000..b2e80e0
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/cnrm2.S.bak
|
|
@@ -0,0 +1,426 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCH_SIZE 80
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+#define XX $19
|
|
+
|
|
+#define I $0
|
|
+
|
|
+#define a0 $f0
|
|
+#define a1 $f1
|
|
+#define a2 $f10
|
|
+#define a3 $f11
|
|
+#define t0 $f12
|
|
+#define t1 $f13
|
|
+#define t2 $f14
|
|
+#define t3 $f15
|
|
+
|
|
+#define x0 $f16
|
|
+#define x1 $f17
|
|
+#define x2 $f18
|
|
+#define x3 $f19
|
|
+#define x4 $f20
|
|
+#define x5 $f21
|
|
+#define x6 $f22
|
|
+#define x7 $f23
|
|
+
|
|
+ PROLOGUE
|
|
+
|
|
+#if defined(EV4) || defined(EV5)
|
|
+ .frame $30,16,$26,0
|
|
+ .mask 0x4000000,-16
|
|
+ ldih $29, 0($27) !gpdisp!1
|
|
+ ldi $29, 0($29) !gpdisp!1
|
|
+
|
|
+ ldi $sp, -16($sp)
|
|
+ ldl $27, sqrt($29) !literal!2
|
|
+ stq $26, 0($sp)
|
|
+
|
|
+ PROFCODE
|
|
+ .prologue 1
|
|
+#else
|
|
+ PROFCODE
|
|
+#endif
|
|
+
|
|
+ fclr a0
|
|
+ sll INCX, ZBASE_SHIFT, INCX
|
|
+ fclr a1
|
|
+ ble N, $L999
|
|
+
|
|
+ fclr a2
|
|
+ cmpeq INCX, 2 * SIZE, $0
|
|
+ fclr a3
|
|
+ beq $0, $L20
|
|
+
|
|
+ fclr t0
|
|
+ sra N, 3, I
|
|
+ fclr t1
|
|
+ ble I, $L15
|
|
+
|
|
+ fclr t2
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ fclr t3
|
|
+ LD x1, 1 * SIZE(X)
|
|
+
|
|
+ LD x2, 2 * SIZE(X)
|
|
+ LD x3, 3 * SIZE(X)
|
|
+ LD x4, 4 * SIZE(X)
|
|
+ LD x5, 5 * SIZE(X)
|
|
+ LD x6, 6 * SIZE(X)
|
|
+ LD x7, 7 * SIZE(X)
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L11:
|
|
+ faddd a0, t0, a0
|
|
+ fillcs (PREFETCH_SIZE) * SIZE(X)
|
|
+ fmuld x0, x0, t0
|
|
+ LD x0, 8 * SIZE(X)
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ mov X, XX
|
|
+ fmuld x1, x1, t1
|
|
+ LD x1, 9 * SIZE(X)
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ unop
|
|
+ fmuld x2, x2, t2
|
|
+ LD x2, 10 * SIZE(X)
|
|
+
|
|
+ faddd a3, t3, a3
|
|
+ unop
|
|
+ fmuld x3, x3, t3
|
|
+ LD x3, 11 * SIZE(X)
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ unop
|
|
+ fmuld x4, x4, t0
|
|
+ LD x4, 12 * SIZE(X)
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ unop
|
|
+ fmuld x5, x5, t1
|
|
+ LD x5, 13 * SIZE(X)
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ unop
|
|
+ fmuld x6, x6, t2
|
|
+ LD x6, 14 * SIZE(X)
|
|
+
|
|
+ faddd a3, t3, a3
|
|
+ unop
|
|
+ fmuld x7, x7, t3
|
|
+ LD x7, 15 * SIZE(X)
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ unop
|
|
+ fmuld x0, x0, t0
|
|
+ LD x0, 16 * SIZE(X)
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ ldi X, 16 * SIZE(X)
|
|
+ fmuld x1, x1, t1
|
|
+ LD x1, 17 * SIZE(XX)
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ unop
|
|
+ fmuld x2, x2, t2
|
|
+ LD x2, 18 * SIZE(XX)
|
|
+
|
|
+ faddd a3, t3, a3
|
|
+ unop
|
|
+ fmuld x3, x3, t3
|
|
+ LD x3, 19 * SIZE(XX)
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ unop
|
|
+ fmuld x4, x4, t0
|
|
+ LD x4, 20 * SIZE(XX)
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ ldi I, -1(I)
|
|
+ fmuld x5, x5, t1
|
|
+ LD x5, 21 * SIZE(XX)
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ unop
|
|
+ fmuld x6, x6, t2
|
|
+ LD x6, 22 * SIZE(XX)
|
|
+
|
|
+ faddd a3, t3, a3
|
|
+ fmuld x7, x7, t3
|
|
+ LD x7, 23 * SIZE(XX)
|
|
+ bgt I, $L11
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ faddd a0, t0, a0
|
|
+ mov X, XX
|
|
+ fmuld x0, x0, t0
|
|
+ LD x0, 8 * SIZE(X)
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ unop
|
|
+ fmuld x1, x1, t1
|
|
+ LD x1, 9 * SIZE(X)
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ unop
|
|
+ fmuld x2, x2, t2
|
|
+ LD x2, 10 * SIZE(X)
|
|
+
|
|
+ faddd a3, t3, a3
|
|
+ unop
|
|
+ fmuld x3, x3, t3
|
|
+ LD x3, 11 * SIZE(X)
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ unop
|
|
+ fmuld x4, x4, t0
|
|
+ LD x4, 12 * SIZE(XX)
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ unop
|
|
+ fmuld x5, x5, t1
|
|
+ LD x5, 13 * SIZE(XX)
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ unop
|
|
+ fmuld x6, x6, t2
|
|
+ LD x6, 14 * SIZE(XX)
|
|
+
|
|
+ faddd a3, t3, a3
|
|
+ ldi X, 16 * SIZE(X)
|
|
+ fmuld x7, x7, t3
|
|
+ LD x7, 15 * SIZE(XX)
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ fmuld x0, x0, t0
|
|
+ faddd a1, t1, a1
|
|
+ fmuld x1, x1, t1
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ fmuld x2, x2, t2
|
|
+ faddd a3, t3, a3
|
|
+ fmuld x3, x3, t3
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ fmuld x4, x4, t0
|
|
+ faddd a1, t1, a1
|
|
+ fmuld x5, x5, t1
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ fmuld x6, x6, t2
|
|
+ faddd a3, t3, a3
|
|
+ fmuld x7, x7, t3
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ faddd a3, t3, a3
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and N, 7, I
|
|
+ ble I, $L998
|
|
+ .align 4
|
|
+
|
|
+$L16:
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ LD x1, 1 * SIZE(X)
|
|
+
|
|
+ ldi X, 2 * SIZE(X)
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ fmuld x0, x0, t0
|
|
+ faddd a1, t1, a1
|
|
+ fmuld x1, x1, t1
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L16
|
|
+ bsr $31, $L998
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ fclr t0
|
|
+ sra N, 2, I
|
|
+ fclr t1
|
|
+ ble I, $L25
|
|
+
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ fclr t2
|
|
+ LD x1, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD x2, 0 * SIZE(X)
|
|
+ fclr t3
|
|
+ LD x3, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD x4, 0 * SIZE(X)
|
|
+ ldi I, -1(I)
|
|
+ LD x5, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD x6, 0 * SIZE(X)
|
|
+ ble I, $L22
|
|
+ .align 4
|
|
+
|
|
+$L21:
|
|
+ faddd a0, t0, a0
|
|
+ LD x7, 1 * SIZE(X)
|
|
+ fmuld x0, x0, t0
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ fmuld x1, x1, t1
|
|
+ unop
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ LD x1, 1 * SIZE(X)
|
|
+ fmuld x2, x2, t2
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a3, t3, a3
|
|
+ LD x2, 0 * SIZE(X)
|
|
+ fmuld x3, x3, t3
|
|
+ unop
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ LD x3, 1 * SIZE(X)
|
|
+ fmuld x4, x4, t0
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ LD x4, 0 * SIZE(X)
|
|
+ fmuld x5, x5, t1
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ LD x5, 1 * SIZE(X)
|
|
+ fmuld x6, x6, t2
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a3, t3, a3
|
|
+ LD x6, 0 * SIZE(X)
|
|
+ fmuld x7, x7, t3
|
|
+ bgt I, $L21
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ faddd a0, t0, a0
|
|
+ LD x7, 1 * SIZE(X)
|
|
+ fmuld x0, x0, t0
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ fmuld x1, x1, t1
|
|
+ faddd a2, t2, a2
|
|
+ fmuld x2, x2, t2
|
|
+
|
|
+ faddd a3, t3, a3
|
|
+ fmuld x3, x3, t3
|
|
+ faddd a0, t0, a0
|
|
+ fmuld x4, x4, t0
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ fmuld x5, x5, t1
|
|
+ faddd a2, t2, a2
|
|
+ fmuld x6, x6, t2
|
|
+
|
|
+ faddd a3, t3, a3
|
|
+ fmuld x7, x7, t3
|
|
+ faddd a2, t2, a2
|
|
+ faddd a3, t3, a3
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ and N, 3, I
|
|
+ ble I, $L998
|
|
+ .align 4
|
|
+
|
|
+$L26:
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ ldi I, -1(I)
|
|
+ LD x1, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ fmuld x0, x0, t0
|
|
+ faddd a1, t1, a1
|
|
+ fmuld x1, x1, t1
|
|
+
|
|
+ bgt I, $L26
|
|
+ .align 4
|
|
+
|
|
+
|
|
+$L998:
|
|
+ faddd a0, t0, a0
|
|
+ faddd a1, t1, a1
|
|
+
|
|
+ faddd a0, a1, a0
|
|
+ faddd a2, a3, a2
|
|
+
|
|
+#if defined(EV4) || defined(EV5)
|
|
+ faddd a0, a2, $f16
|
|
+ jsr $26, ($27), sqrt !lituse_jsr!2
|
|
+
|
|
+ ldih $29, 0($26) !gpdisp!3
|
|
+ ldi $29, 0($29) !gpdisp!3
|
|
+#else
|
|
+ faddd a0, a2, a0
|
|
+ fsqrtd a0, a0
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+#if defined(EV4) || defined(EV5)
|
|
+ ldl $26, 0($sp)
|
|
+ ldi $sp, 16($sp)
|
|
+#endif
|
|
+ ret
|
|
+ EPILOGUE
|
|
diff --git a/kernel/sw_64/copy.S b/kernel/sw_64/copy.S
|
|
new file mode 100644
|
|
index 0000000..c960ac1
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/copy.S
|
|
@@ -0,0 +1,379 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+#define Y $19
|
|
+#define INCY $20
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+ .frame $sp, 0, $26, 0
|
|
+
|
|
+#ifndef PROFILE
|
|
+ .prologue 0
|
|
+#else
|
|
+ .prologue 1
|
|
+#endif
|
|
+
|
|
+ cmpeq INCX, 1, $0
|
|
+ ble N, $End
|
|
+#ifndef COMPLEX
|
|
+ sra N, 4, $4
|
|
+#else
|
|
+ sra N, 3, $4
|
|
+#endif
|
|
+ cmpeq INCY, 1, $1
|
|
+
|
|
+ and $0, $1, $0
|
|
+ beq $0, $Sub
|
|
+#ifndef COMPLEX
|
|
+ and N, 15, $5
|
|
+#else
|
|
+ and N, 7, $5
|
|
+#endif
|
|
+ ble $4, $Remain
|
|
+
|
|
+ LD $f10, 0*SIZE(X)
|
|
+ LD $f11, 1*SIZE(X)
|
|
+ LD $f12, 2*SIZE(X)
|
|
+ LD $f13, 3*SIZE(X)
|
|
+ LD $f14, 4*SIZE(X)
|
|
+ LD $f15, 5*SIZE(X)
|
|
+ LD $f16, 6*SIZE(X)
|
|
+ LD $f17, 7*SIZE(X)
|
|
+
|
|
+ LD $f18, 8*SIZE(X)
|
|
+ LD $f19, 9*SIZE(X)
|
|
+ LD $f20, 10*SIZE(X)
|
|
+ LD $f21, 11*SIZE(X)
|
|
+ LD $f22, 12*SIZE(X)
|
|
+ LD $f23, 13*SIZE(X)
|
|
+ LD $f24, 14*SIZE(X)
|
|
+ LD $f25, 15*SIZE(X)
|
|
+
|
|
+ subl $4, 1, $4
|
|
+ ldi X, 16*SIZE(X)
|
|
+ ble $4, $MainLoopEnd
|
|
+ .align 4
|
|
+
|
|
+$MainLoop:
|
|
+ ST $f10, 0*SIZE(Y)
|
|
+ ST $f11, 1*SIZE(Y)
|
|
+ ST $f12, 2*SIZE(Y)
|
|
+ ST $f13, 3*SIZE(Y)
|
|
+
|
|
+ LD $f10, 0*SIZE(X)
|
|
+ LD $f11, 1*SIZE(X)
|
|
+ LD $f12, 2*SIZE(X)
|
|
+ LD $f13, 3*SIZE(X)
|
|
+
|
|
+ ST $f14, 4*SIZE(Y)
|
|
+ ST $f15, 5*SIZE(Y)
|
|
+ ST $f16, 6*SIZE(Y)
|
|
+ ST $f17, 7*SIZE(Y)
|
|
+
|
|
+ LD $f14, 4*SIZE(X)
|
|
+ LD $f15, 5*SIZE(X)
|
|
+ LD $f16, 6*SIZE(X)
|
|
+ LD $f17, 7*SIZE(X)
|
|
+
|
|
+ ST $f18, 8*SIZE(Y)
|
|
+ ST $f19, 9*SIZE(Y)
|
|
+ ST $f20, 10*SIZE(Y)
|
|
+ ST $f21, 11*SIZE(Y)
|
|
+
|
|
+ LD $f18, 8*SIZE(X)
|
|
+ LD $f19, 9*SIZE(X)
|
|
+ LD $f20, 10*SIZE(X)
|
|
+ LD $f21, 11*SIZE(X)
|
|
+
|
|
+ ST $f22, 12*SIZE(Y)
|
|
+ ST $f23, 13*SIZE(Y)
|
|
+ ST $f24, 14*SIZE(Y)
|
|
+ ST $f25, 15*SIZE(Y)
|
|
+
|
|
+ LD $f22, 12*SIZE(X)
|
|
+ LD $f23, 13*SIZE(X)
|
|
+ LD $f24, 14*SIZE(X)
|
|
+ LD $f25, 15*SIZE(X)
|
|
+
|
|
+ subl $4, 1, $4
|
|
+ ldi Y, 16*SIZE(Y)
|
|
+ ldi X, 16*SIZE(X)
|
|
+ bgt $4, $MainLoop
|
|
+ .align 4
|
|
+
|
|
+$MainLoopEnd:
|
|
+ ST $f10, 0*SIZE(Y)
|
|
+ ST $f11, 1*SIZE(Y)
|
|
+ ST $f12, 2*SIZE(Y)
|
|
+ ST $f13, 3*SIZE(Y)
|
|
+ ST $f14, 4*SIZE(Y)
|
|
+ ST $f15, 5*SIZE(Y)
|
|
+ ST $f16, 6*SIZE(Y)
|
|
+ ST $f17, 7*SIZE(Y)
|
|
+
|
|
+ ST $f18, 8*SIZE(Y)
|
|
+ ST $f19, 9*SIZE(Y)
|
|
+ ST $f20, 10*SIZE(Y)
|
|
+ ST $f21, 11*SIZE(Y)
|
|
+ ST $f22, 12*SIZE(Y)
|
|
+ ST $f23, 13*SIZE(Y)
|
|
+ ST $f24, 14*SIZE(Y)
|
|
+ ST $f25, 15*SIZE(Y)
|
|
+
|
|
+ ldi Y, 16*SIZE(Y)
|
|
+ .align 4
|
|
+
|
|
+$Remain:
|
|
+ ble $5, $End
|
|
+ .align 4
|
|
+
|
|
+$RemainLoop:
|
|
+#ifndef COMPLEX
|
|
+ LD $f10, 0*SIZE(X)
|
|
+ ldi X, 1*SIZE(X)
|
|
+ ST $f10, 0*SIZE(Y)
|
|
+ ldi Y, 1*SIZE(Y)
|
|
+#else
|
|
+ LD $f10, 0*SIZE(X)
|
|
+ LD $f11, 1*SIZE(X)
|
|
+ ldi X, 2*SIZE(X)
|
|
+ ST $f10, 0*SIZE(Y)
|
|
+ ST $f11, 1*SIZE(Y)
|
|
+ ldi Y, 2*SIZE(Y)
|
|
+#endif
|
|
+ subl $5, 1, $5
|
|
+ bgt $5, $RemainLoop
|
|
+ .align 4
|
|
+$End:
|
|
+ ret
|
|
+ .align 4
|
|
+
|
|
+$Sub:
|
|
+#ifdef COMPLEX
|
|
+ addl INCX, INCX, INCX
|
|
+ addl INCY, INCY, INCY
|
|
+ and N, 7, $5
|
|
+#else
|
|
+ and N, 15, $5
|
|
+#endif
|
|
+ ble $4, $SubRemain
|
|
+ .align 4
|
|
+
|
|
+$SubMainLoop:
|
|
+#ifndef COMPLEX
|
|
+ LD $f10, 0(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f11, 0(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD $f12, 0(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f13, 0(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD $f14, 0(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f15, 0(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD $f16, 0(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f17, 0(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD $f18, 0(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f19, 0(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD $f20, 0(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f21, 0(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD $f22, 0(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f23, 0(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD $f24, 0(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f25, 0(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ST $f10, 0(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+ ST $f11, 0(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ST $f12, 0(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+ ST $f13, 0(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ST $f14, 0(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+ ST $f15, 0(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ST $f16, 0(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+ ST $f17, 0(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ST $f18, 0(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+ ST $f19, 0(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ST $f20, 0(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+ ST $f21, 0(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ST $f22, 0(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+ ST $f23, 0(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ST $f24, 0(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+ ST $f25, 0(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+#else
|
|
+ LD $f10, 0(X)
|
|
+ LD $f11, SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD $f12, 0(X)
|
|
+ LD $f13, SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD $f14, 0(X)
|
|
+ LD $f15, SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD $f16, 0(X)
|
|
+ LD $f17, SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD $f18, 0(X)
|
|
+ LD $f19, SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD $f20, 0(X)
|
|
+ LD $f21, SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD $f22, 0(X)
|
|
+ LD $f23, SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD $f24, 0(X)
|
|
+ LD $f25, SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ST $f10, 0(Y)
|
|
+ ST $f11, SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ST $f12, 0(Y)
|
|
+ ST $f13, SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ST $f14, 0(Y)
|
|
+ ST $f15, SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ST $f16, 0(Y)
|
|
+ ST $f17, SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ST $f18, 0(Y)
|
|
+ ST $f19, SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ST $f20, 0(Y)
|
|
+ ST $f21, SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ST $f22, 0(Y)
|
|
+ ST $f23, SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ST $f24, 0(Y)
|
|
+ ST $f25, SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+#endif
|
|
+ subl $4, 1, $4
|
|
+ bgt $4, $SubMainLoop
|
|
+ .align 4
|
|
+
|
|
+$SubRemain:
|
|
+ ble $5, $SubEnd
|
|
+ .align 4
|
|
+
|
|
+$SubRemainLoop:
|
|
+#ifndef COMPLEX
|
|
+ LD $f10, 0(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ ST $f10, 0(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+#else
|
|
+ LD $f10, 0(X)
|
|
+ LD $f11, SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ ST $f10, 0(Y)
|
|
+ ST $f11, SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+#endif
|
|
+ subl $5, 1, $5
|
|
+ bgt $5, $SubRemainLoop
|
|
+ .align 4
|
|
+
|
|
+$SubEnd:
|
|
+ ret
|
|
+ EPILOGUE
diff --git a/kernel/sw_64/copy_simd.S b/kernel/sw_64/copy_simd.S
new file mode 100644
index 0000000..84e96a9
--- /dev/null
+++ b/kernel/sw_64/copy_simd.S
@@ -0,0 +1,563 @@
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define PREFETCHSIZE 80
+
+#define N $16
+#define X $17
+#define INCX $18
+#define Y $19
+#define INCY $20
+
+	PROLOGUE
+	PROFCODE
+	.frame $sp, 0, $26, 0
+
+#ifndef PROFILE
+	.prologue 0
+#else
+	.prologue 1
+#endif
+
|
|
+ cmpeq INCX, 1, $0
|
|
+ ble N, $End
|
|
+#ifndef COMPLEX
|
|
+ sra N, 4, $4
|
|
+#else
|
|
+ sra N, 3, $4
|
|
+#endif
|
|
+ cmpeq INCY, 1, $1
|
|
+
|
|
+ and $0, $1, $0
|
|
+ beq $0, $Sub
|
|
+#ifndef COMPLEX
|
|
+ and N, 15, $5
|
|
+#else
|
|
+ and N, 7, $5
|
|
+#endif
|
|
+ ble $4, $Remain
|
|
+
|
|
+/**
|
|
+  check whether the X and Y addresses are aligned to the SIMD vector width (VEC_LEN*SIZE)
|
|
+**/
|
|
+
|
|
+ and Y, (VEC_LEN*SIZE-1), $6
|
|
+ and X, (VEC_LEN*SIZE-1), $7
|
|
+ bgt $6, $UnAlign_Y_ACCESS
|
|
+ bgt $7, $UnAlign_X_ACCESS
|
|
+
|
|
+ .align 4
|
|
+
|
|
+$Align:
|
|
+ VLD $f10, 0*VEC_LEN*SIZE(X)
|
|
+ VLD $f11, 1*VEC_LEN*SIZE(X)
|
|
+ VLD $f12, 2*VEC_LEN*SIZE(X)
|
|
+ VLD $f13, 3*VEC_LEN*SIZE(X)
|
|
+
|
|
+ subl $4, 1, $4
|
|
+ ldi X, 16*SIZE(X)
|
|
+ ble $4, $MainLoopEnd
|
|
+ .align 4
|
|
+
|
|
+$MainLoop:
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ fillcs PREFETCHSIZE * SIZE(Y)
|
|
+
|
|
+ VST $f10, 0*VEC_LEN*SIZE(Y)
|
|
+ VST $f11, 1*VEC_LEN*SIZE(Y)
|
|
+ VST $f12, 2*VEC_LEN*SIZE(Y)
|
|
+ VST $f13, 3*VEC_LEN*SIZE(Y)
|
|
+
|
|
+ VLD $f10, 0*VEC_LEN*SIZE(X)
|
|
+ VLD $f11, 1*VEC_LEN*SIZE(X)
|
|
+ VLD $f12, 2*VEC_LEN*SIZE(X)
|
|
+ VLD $f13, 3*VEC_LEN*SIZE(X)
|
|
+
|
|
+ subl $4, 1, $4
|
|
+ ldi Y, 16*SIZE(Y)
|
|
+ ldi X, 16*SIZE(X)
|
|
+ bgt $4, $MainLoop
|
|
+ .align 4
|
|
+
|
|
+$MainLoopEnd:
|
|
+
|
|
+ VST $f10, 0*VEC_LEN*SIZE(Y)
|
|
+ VST $f11, 1*VEC_LEN*SIZE(Y)
|
|
+ VST $f12, 2*VEC_LEN*SIZE(Y)
|
|
+ VST $f13, 3*VEC_LEN*SIZE(Y)
|
|
+
|
|
+ ldi Y, 16*SIZE(Y)
|
|
+ .align 4
|
|
+
|
|
+$Remain:
|
|
+ ble $5, $End
|
|
+ .align 4
|
|
+
|
|
+$RemainLoop:
|
|
+#ifndef COMPLEX
|
|
+ LD $f10, 0*SIZE(X)
|
|
+ ldi X, 1*SIZE(X)
|
|
+ ST $f10, 0*SIZE(Y)
|
|
+ ldi Y, 1*SIZE(Y)
|
|
+#else
|
|
+ LD $f10, 0*SIZE(X)
|
|
+ LD $f11, 1*SIZE(X)
|
|
+ ldi X, 2*SIZE(X)
|
|
+ ST $f10, 0*SIZE(Y)
|
|
+ ST $f11, 1*SIZE(Y)
|
|
+ ldi Y, 2*SIZE(Y)
|
|
+#endif
|
|
+ subl $5, 1, $5
|
|
+ bgt $5, $RemainLoop
|
|
+ .align 4
|
|
+$End:
|
|
+ ret
|
|
+ .align 4
|
|
+
|
|
+$UnAlign_X_ACCESS:
|
|
+ and Y, (VEC_LEN*SIZE-1), $7
|
|
+ nop
|
|
+ nop
|
|
+ bgt $7, $UnAlign_XY_ACCESS
|
|
+ .align 4
|
|
+
|
|
+ VLD_UL $f10, 0*VEC_LEN*SIZE(X)
|
|
+ VLD_UH $f14, 1*VEC_LEN*SIZE(X)
|
|
+
|
|
+ VLD_UL $f11, 1*VEC_LEN*SIZE(X)
|
|
+ VLD_UH $f15, 2*VEC_LEN*SIZE(X)
|
|
+
|
|
+ VLD_UL $f12, 2*VEC_LEN*SIZE(X)
|
|
+ VLD_UH $f16, 3*VEC_LEN*SIZE(X)
|
|
+
|
|
+
|
|
+ VLD_UL $f13, 3*VEC_LEN*SIZE(X)
|
|
+ VLD_UH $f17, 4*VEC_LEN*SIZE(X)
|
|
+
|
|
+ subl $4, 1, $4
|
|
+ vbisw $f10, $f14, $f10
|
|
+ ldi X, 16*SIZE(X)
|
|
+ vbisw $f11, $f15, $f11
|
|
+
|
|
+ vbisw $f12, $f16, $f12
|
|
+ vbisw $f13, $f17, $f13
|
|
+ nop
|
|
+ ble $4, $UnAlign_X_MainLoopEnd
|
|
+ .align 4
|
|
+
|
|
+$UnAlign_X_MainLoop:
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ fillcs PREFETCHSIZE * SIZE(Y)
|
|
+
|
|
+ VST $f10, 0*VEC_LEN*SIZE(Y)
|
|
+ VST $f11, 1*VEC_LEN*SIZE(Y)
|
|
+ VST $f12, 2*VEC_LEN*SIZE(Y)
|
|
+ VST $f13, 3*VEC_LEN*SIZE(Y)
|
|
+
|
|
+ VLD_UL $f10, 0*VEC_LEN*SIZE(X)
|
|
+ VLD_UH $f14, 1*VEC_LEN*SIZE(X)
|
|
+ VLD_UL $f11, 1*VEC_LEN*SIZE(X)
|
|
+ VLD_UH $f15, 2*VEC_LEN*SIZE(X)
|
|
+
|
|
+ VLD_UL $f12, 2*VEC_LEN*SIZE(X)
|
|
+ VLD_UH $f16, 3*VEC_LEN*SIZE(X)
|
|
+ VLD_UL $f13, 3*VEC_LEN*SIZE(X)
|
|
+ VLD_UH $f17, 4*VEC_LEN*SIZE(X)
|
|
+
|
|
+ subl $4, 1, $4
|
|
+ vbisw $f10, $f14, $f10
|
|
+ ldi Y, 16*SIZE(Y)
|
|
+ vbisw $f11, $f15, $f11
|
|
+
|
|
+ vbisw $f12, $f16, $f12
|
|
+ ldi X, 16*SIZE(X)
|
|
+ vbisw $f13, $f17, $f13
|
|
+ bgt $4, $UnAlign_X_MainLoop
|
|
+ .align 4
|
|
+
|
|
+$UnAlign_X_MainLoopEnd:
|
|
+
|
|
+ VST $f10, 0*VEC_LEN*SIZE(Y)
|
|
+ VST $f11, 1*VEC_LEN*SIZE(Y)
|
|
+ VST $f12, 2*VEC_LEN*SIZE(Y)
|
|
+ VST $f13, 3*VEC_LEN*SIZE(Y)
|
|
+
|
|
+ ldi Y, 16*SIZE(Y)
|
|
+ ble $5, $End
|
|
+ jmp $RemainLoop
|
|
+
|
|
+ .align 4
|
|
+
|
|
+$UnAlign_Y_ACCESS:
|
|
+ and X, (VEC_LEN*SIZE-1), $7
|
|
+ nop
|
|
+ nop
|
|
+ bgt $7, $UnAlign_XY_ACCESS
|
|
+ .align 4
|
|
+
|
|
+ VLD $f10, 0*VEC_LEN*SIZE(X)
|
|
+ VLD $f11, 1*VEC_LEN*SIZE(X)
|
|
+ VLD $f12, 2*VEC_LEN*SIZE(X)
|
|
+ VLD $f13, 3*VEC_LEN*SIZE(X)
|
|
+
|
|
+ subl $4, 1, $4
|
|
+ ldi X, 16*SIZE(X)
|
|
+ ble $4, $UnAlign_Y_MainLoopEnd
|
|
+ .align 4
|
|
+
|
|
+$UnAlign_Y_MainLoop:
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ fillcs PREFETCHSIZE * SIZE(Y)
|
|
+
|
|
+ VST_UL $f10, 0*VEC_LEN*SIZE(Y)
|
|
+ VST_UH $f10, 1*VEC_LEN*SIZE(Y)
|
|
+
|
|
+ VST_UL $f11, 1*VEC_LEN*SIZE(Y)
|
|
+ VST_UH $f11, 2*VEC_LEN*SIZE(Y)
|
|
+
|
|
+ VST_UL $f12, 2*VEC_LEN*SIZE(Y)
|
|
+ VST_UH $f12, 3*VEC_LEN*SIZE(Y)
|
|
+
|
|
+ VST_UL $f13, 3*VEC_LEN*SIZE(Y)
|
|
+ VST_UH $f13, 4*VEC_LEN*SIZE(Y)
|
|
+
|
|
+ VLD $f10, 0*VEC_LEN*SIZE(X)
|
|
+ VLD $f11, 1*VEC_LEN*SIZE(X)
|
|
+ VLD $f12, 2*VEC_LEN*SIZE(X)
|
|
+ VLD $f13, 3*VEC_LEN*SIZE(X)
|
|
+
|
|
+ subl $4, 1, $4
|
|
+ ldi Y, 16*SIZE(Y)
|
|
+ ldi X, 16*SIZE(X)
|
|
+ bgt $4, $UnAlign_Y_MainLoop
|
|
+ .align 4
|
|
+
|
|
+$UnAlign_Y_MainLoopEnd:
|
|
+
|
|
+ VST_UL $f10, 0*VEC_LEN*SIZE(Y)
|
|
+ VST_UH $f10, 1*VEC_LEN*SIZE(Y)
|
|
+
|
|
+ VST_UL $f11, 1*VEC_LEN*SIZE(Y)
|
|
+ VST_UH $f11, 2*VEC_LEN*SIZE(Y)
|
|
+
|
|
+ VST_UL $f12, 2*VEC_LEN*SIZE(Y)
|
|
+ VST_UH $f12, 3*VEC_LEN*SIZE(Y)
|
|
+
|
|
+ VST_UL $f13, 3*VEC_LEN*SIZE(Y)
|
|
+ VST_UH $f13, 4*VEC_LEN*SIZE(Y)
|
|
+
|
|
+ ldi Y, 16*SIZE(Y)
|
|
+ ble $5, $End
|
|
+ jmp $RemainLoop
|
|
+
|
|
+ .align 4
|
|
+
|
|
+$UnAlign_XY_ACCESS:
|
|
+
|
|
+ VLD_UL $f10, 0*VEC_LEN*SIZE(X)
|
|
+ VLD_UH $f14, 1*VEC_LEN*SIZE(X)
|
|
+
|
|
+ VLD_UL $f11, 1*VEC_LEN*SIZE(X)
|
|
+ VLD_UH $f15, 2*VEC_LEN*SIZE(X)
|
|
+
|
|
+ VLD_UL $f12, 2*VEC_LEN*SIZE(X)
|
|
+ VLD_UH $f16, 3*VEC_LEN*SIZE(X)
|
|
+
|
|
+
|
|
+ VLD_UL $f13, 3*VEC_LEN*SIZE(X)
|
|
+ VLD_UH $f17, 4*VEC_LEN*SIZE(X)
|
|
+
|
|
+ subl $4, 1, $4
|
|
+ vbisw $f10, $f14, $f10
|
|
+ ldi X, 16*SIZE(X)
|
|
+ vbisw $f11, $f15, $f11
|
|
+
|
|
+ vbisw $f12, $f16, $f12
|
|
+ vbisw $f13, $f17, $f13
|
|
+ nop
|
|
+ ble $4, $UnAlign_XY_MainLoopEnd
|
|
+ .align 4
|
|
+
|
|
+$UnAlign_XY_MainLoop:
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ fillcs PREFETCHSIZE * SIZE(Y)
|
|
+
|
|
+ VST_UL $f10, 0*VEC_LEN*SIZE(Y)
|
|
+ VST_UH $f10, 1*VEC_LEN*SIZE(Y)
|
|
+
|
|
+ VST_UL $f11, 1*VEC_LEN*SIZE(Y)
|
|
+ VST_UH $f11, 2*VEC_LEN*SIZE(Y)
|
|
+
|
|
+ VST_UL $f12, 2*VEC_LEN*SIZE(Y)
|
|
+ VST_UH $f12, 3*VEC_LEN*SIZE(Y)
|
|
+
|
|
+ VST_UL $f13, 3*VEC_LEN*SIZE(Y)
|
|
+ VST_UH $f13, 4*VEC_LEN*SIZE(Y)
|
|
+
|
|
+
|
|
+ VLD_UL $f10, 0*VEC_LEN*SIZE(X)
|
|
+ VLD_UH $f14, 1*VEC_LEN*SIZE(X)
|
|
+ VLD_UL $f11, 1*VEC_LEN*SIZE(X)
|
|
+ VLD_UH $f15, 2*VEC_LEN*SIZE(X)
|
|
+
|
|
+ VLD_UL $f12, 2*VEC_LEN*SIZE(X)
|
|
+ VLD_UH $f16, 3*VEC_LEN*SIZE(X)
|
|
+ VLD_UL $f13, 3*VEC_LEN*SIZE(X)
|
|
+ VLD_UH $f17, 4*VEC_LEN*SIZE(X)
|
|
+
|
|
+ subl $4, 1, $4
|
|
+ vbisw $f10, $f14, $f10
|
|
+ ldi Y, 16*SIZE(Y)
|
|
+ vbisw $f11, $f15, $f11
|
|
+
|
|
+ vbisw $f12, $f16, $f12
|
|
+ ldi X, 16*SIZE(X)
|
|
+ vbisw $f13, $f17, $f13
|
|
+ bgt $4, $UnAlign_XY_MainLoop
|
|
+ .align 4
|
|
+
|
|
+$UnAlign_XY_MainLoopEnd:
|
|
+
|
|
+ VST_UL $f10, 0*VEC_LEN*SIZE(Y)
|
|
+ VST_UH $f10, 1*VEC_LEN*SIZE(Y)
|
|
+
|
|
+ VST_UL $f11, 1*VEC_LEN*SIZE(Y)
|
|
+ VST_UH $f11, 2*VEC_LEN*SIZE(Y)
|
|
+
|
|
+ VST_UL $f12, 2*VEC_LEN*SIZE(Y)
|
|
+ VST_UH $f12, 3*VEC_LEN*SIZE(Y)
|
|
+
|
|
+ VST_UL $f13, 3*VEC_LEN*SIZE(Y)
|
|
+ VST_UH $f13, 4*VEC_LEN*SIZE(Y)
|
|
+
|
|
+ ldi Y, 16*SIZE(Y)
|
|
+ ble $5, $End
|
|
+ jmp $RemainLoop
|
|
+
|
|
+ .align 4
|
|
+
|
|
+$Sub:
|
|
+#ifdef COMPLEX
|
|
+ addl INCX, INCX, INCX
|
|
+ addl INCY, INCY, INCY
|
|
+ and N, 7, $5
|
|
+#else
|
|
+ and N, 15, $5
|
|
+#endif
|
|
+ ble $4, $SubRemain
|
|
+ .align 4
|
|
+
|
|
+$SubMainLoop:
|
|
+#ifndef COMPLEX
|
|
+ LD $f10, 0(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f11, 0(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD $f12, 0(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f13, 0(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD $f14, 0(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f15, 0(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD $f16, 0(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f17, 0(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD $f18, 0(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f19, 0(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD $f20, 0(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f21, 0(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD $f22, 0(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f23, 0(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD $f24, 0(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f25, 0(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ST $f10, 0(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+ ST $f11, 0(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ST $f12, 0(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+ ST $f13, 0(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ST $f14, 0(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+ ST $f15, 0(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ST $f16, 0(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+ ST $f17, 0(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ST $f18, 0(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+ ST $f19, 0(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ST $f20, 0(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+ ST $f21, 0(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ST $f22, 0(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+ ST $f23, 0(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ST $f24, 0(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+ ST $f25, 0(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+#else
|
|
+ LD $f10, 0(X)
|
|
+ LD $f11, SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD $f12, 0(X)
|
|
+ LD $f13, SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD $f14, 0(X)
|
|
+ LD $f15, SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD $f16, 0(X)
|
|
+ LD $f17, SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD $f18, 0(X)
|
|
+ LD $f19, SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD $f20, 0(X)
|
|
+ LD $f21, SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD $f22, 0(X)
|
|
+ LD $f23, SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD $f24, 0(X)
|
|
+ LD $f25, SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ST $f10, 0(Y)
|
|
+ ST $f11, SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ST $f12, 0(Y)
|
|
+ ST $f13, SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ST $f14, 0(Y)
|
|
+ ST $f15, SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ST $f16, 0(Y)
|
|
+ ST $f17, SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ST $f18, 0(Y)
|
|
+ ST $f19, SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ST $f20, 0(Y)
|
|
+ ST $f21, SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ST $f22, 0(Y)
|
|
+ ST $f23, SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ST $f24, 0(Y)
|
|
+ ST $f25, SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+#endif
|
|
+ subl $4, 1, $4
|
|
+ bgt $4, $SubMainLoop
|
|
+ .align 4
|
|
+
|
|
+$SubRemain:
|
|
+ ble $5, $SubEnd
|
|
+ .align 4
|
|
+
|
|
+$SubRemainLoop:
|
|
+#ifndef COMPLEX
|
|
+ LD $f10, 0(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ ST $f10, 0(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+#else
|
|
+ LD $f10, 0(X)
|
|
+ LD $f11, SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ ST $f10, 0(Y)
|
|
+ ST $f11, SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+#endif
|
|
+ subl $5, 1, $5
|
|
+ bgt $5, $SubRemainLoop
|
|
+ .align 4
|
|
+
|
|
+$SubEnd:
|
|
+ ret
|
|
+ EPILOGUE
diff --git a/kernel/sw_64/cscal.S b/kernel/sw_64/cscal.S
new file mode 100644
index 0000000..bba3137
--- /dev/null
+++ b/kernel/sw_64/cscal.S
@@ -0,0 +1,217 @@
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+ .set noat
|
|
+ .set noreorder
|
|
+
|
|
+#define ASSEMBLER
|
|
+
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+ .globl NAME
|
|
+ .ent NAME
|
|
+
|
|
+NAME:
|
|
+#ifdef PROFILE
|
|
+ ldgp $gp, 0($27)
|
|
+ lda $28, _mcount
|
|
+ jsr $28, ($28), _mcount
|
|
+#endif
|
|
+
|
|
+#ifndef C_INTERFACE
|
|
+ ldl $16, 0($16) # n
|
|
+ mov $18, $20 # Store Address
|
|
+ ldl $19, 0($19) # incx
|
|
+ nop
|
|
+
|
|
+ LD $f1, 0($17) # alpha
|
|
+#else
|
|
+ mov $18, $20 # Store Address
|
|
+ fmov $f17, $f1 # alpha
|
|
+#endif
|
|
+
|
|
+#ifndef PROFILE
|
|
+ .prologue 0
|
|
+#else
|
|
+ .prologue 1
|
|
+#endif
|
|
+
|
|
+ sra $16, 1, $21 # 4-unrolling
|
|
+ ble $16, $End
|
|
+
|
|
+ lda $23, -1($19)
|
|
+ ble $19, $End
|
|
+
|
|
+ bgt $23, $INC_NOT_1
|
|
+ .align 4
|
|
+
|
|
+ ble $21, $Sub
|
|
+ lda $21, -1($21)
|
|
+ LD $f10, 0*SIZE($18)
|
|
+ LD $f11, 1*SIZE($18)
|
|
+
|
|
+ LD $f12, 2*SIZE($18)
|
|
+ LD $f13, 3*SIZE($18)
|
|
+ lda $18, 4*SIZE($18)
|
|
+ ble $21, $MainRemain
|
|
+ .align 4
|
|
+
|
|
+$MainLoop:
|
|
+ MUL $f10, $f1, $f20
|
|
+ LD $f10, 0*SIZE($18)
|
|
+ MUL $f11, $f1, $f21
|
|
+ LD $f11, 1*SIZE($18)
|
|
+
|
|
+ MUL $f12, $f1, $f22
|
|
+ LD $f12, 2*SIZE($18)
|
|
+ MUL $f13, $f1, $f23
|
|
+ LD $f13, 3*SIZE($18)
|
|
+
|
|
+ lda $18, 4*SIZE($18)
|
|
+ lda $21, -1($21)
|
|
+
|
|
+ ST $f20, 0*SIZE($20)
|
|
+ ST $f21, 1*SIZE($20)
|
|
+ ST $f22, 2*SIZE($20)
|
|
+ ST $f23, 3*SIZE($20)
|
|
+ lda $20, 4*SIZE($20)
|
|
+
|
|
+ bgt $21, $MainLoop
|
|
+ .align 4
|
|
+
|
|
+$MainRemain:
|
|
+ MUL $f10, $f1, $f20
|
|
+ MUL $f11, $f1, $f21
|
|
+ MUL $f12, $f1, $f22
|
|
+ MUL $f13, $f1, $f23
|
|
+
|
|
+ ST $f20, 0*SIZE($20)
|
|
+ ST $f21, 1*SIZE($20)
|
|
+ ST $f22, 2*SIZE($20)
|
|
+ ST $f23, 3*SIZE($20)
|
|
+ lda $20, 4*SIZE($20)
|
|
+ .align 4
|
|
+
|
|
+$Sub:
|
|
+ blbc $16, $End
|
|
+ LD $f10, 0*SIZE($18)
|
|
+ LD $f11, 1*SIZE($18)
|
|
+ MUL $f10, $f1, $f20
|
|
+ MUL $f11, $f1, $f21
|
|
+ ST $f20, 0*SIZE($20)
|
|
+ ST $f21, 1*SIZE($20)
|
|
+ .align 4
|
|
+
|
|
+$End:
|
|
+ ret
|
|
+ .align 4
|
|
+
|
|
+$INC_NOT_1:
|
|
+ addl $19, $19, $19
|
|
+ ble $21, $INC_Sub
|
|
+ lda $21, -1($21)
|
|
+
|
|
+ LD $f10, 0*SIZE($18)
|
|
+ LD $f11, 1*SIZE($18)
|
|
+ SXADDQ $19, $18, $18
|
|
+
|
|
+ LD $f12, 0*SIZE($18)
|
|
+ LD $f13, 1*SIZE($18)
|
|
+ SXADDQ $19, $18, $18
|
|
+ ble $21, $INC_MainRemain
|
|
+ .align 4
|
|
+
|
|
+$INC_MainLoop:
|
|
+ MUL $f10, $f1, $f20
|
|
+ LD $f10, 0*SIZE($18)
|
|
+ MUL $f11, $f1, $f21
|
|
+ LD $f11, 1*SIZE($18)
|
|
+
|
|
+ SXADDQ $19, $18, $18
|
|
+
|
|
+ MUL $f12, $f1, $f22
|
|
+ LD $f12, 0*SIZE($18)
|
|
+ MUL $f13, $f1, $f23
|
|
+ LD $f13, 1*SIZE($18)
|
|
+
|
|
+ SXADDQ $19, $18, $18
|
|
+
|
|
+ ST $f20, 0*SIZE($20)
|
|
+ lda $21, -1($21)
|
|
+ ST $f21, 1*SIZE($20)
|
|
+ SXADDQ $19, $20, $20
|
|
+
|
|
+ ST $f22, 0*SIZE($20)
|
|
+ ST $f23, 1*SIZE($20)
|
|
+ SXADDQ $19, $20, $20
|
|
+ unop
|
|
+ bgt $21, $INC_MainLoop
|
|
+ .align 4
|
|
+
|
|
+$INC_MainRemain:
|
|
+ MUL $f10, $f1, $f20
|
|
+ MUL $f11, $f1, $f21
|
|
+ MUL $f12, $f1, $f22
|
|
+ MUL $f13, $f1, $f23
|
|
+
|
|
+ ST $f20, 0*SIZE($20)
|
|
+ ST $f21, 1*SIZE($20)
|
|
+ SXADDQ $19, $20, $20
|
|
+
|
|
+ ST $f22, 0*SIZE($20)
|
|
+ ST $f23, 1*SIZE($20)
|
|
+ SXADDQ $19, $20, $20
|
|
+ .align 4
|
|
+
|
|
+$INC_Sub:
|
|
+ blbc $16, $INC_End
|
|
+
|
|
+ LD $f10, 0*SIZE($18)
|
|
+ LD $f11, 1*SIZE($18)
|
|
+ MUL $f10, $f1, $f20
|
|
+ MUL $f11, $f1, $f21
|
|
+
|
|
+ ST $f20, 0*SIZE($20)
|
|
+ ST $f21, 1*SIZE($20)
|
|
+ .align 4
|
|
+
|
|
+$INC_End:
|
|
+ ret
|
|
+ .end NAME
|
|
+ .ident VERSION
diff --git a/kernel/sw_64/dnrm2.S b/kernel/sw_64/dnrm2.S
new file mode 100644
index 0000000..89cf787
--- /dev/null
+++ b/kernel/sw_64/dnrm2.S
@@ -0,0 +1,490 @@
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCH_SIZE 80
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+#define XX $19
|
|
+
|
|
+#define I $0
|
|
+
|
|
+#define a0 $f0
|
|
+#define a1 $f1
|
|
+#define a2 $f10
|
|
+#define a3 $f11
|
|
+#define t0 $f12
|
|
+#define t1 $f13
|
|
+#define t2 $f14
|
|
+#define t3 $f15
|
|
+
|
|
+#define x0 $f16
|
|
+#define x1 $f17
|
|
+#define x2 $f18
|
|
+#define x3 $f19
|
|
+#define x4 $f20
|
|
+#define x5 $f21
|
|
+#define x6 $f22
|
|
+#define x7 $f23
|
|
+
|
|
+ PROLOGUE
|
|
+
|
|
+#if defined(EV4) || defined(EV5)
|
|
+ .frame $30,16,$26,0
|
|
+ .mask 0x4000000,-16
|
|
+ ldih $29, 0($27) !gpdisp!1
|
|
+ ldi $29, 0($29) !gpdisp!1
|
|
+
|
|
+ ldi $sp, -16($sp)
|
|
+ ldl $27, sqrt($29) !literal!2
|
|
+ stl $26, 0($sp)
|
|
+
|
|
+ PROFCODE
|
|
+ .prologue 1
|
|
+#else
|
|
+ PROFCODE
|
|
+#endif
|
|
+
|
|
+ fclr a0
|
|
+ SXADDQ INCX, 0, INCX
|
|
+ fclr a1
|
|
+ ble N, $L999
|
|
+
|
|
+ fclr a2
|
|
+ cmpeq INCX, SIZE, $0
|
|
+ fclr a3
|
|
+ beq $0, $L20
|
|
+
|
|
+ fclr t0
|
|
+ sra N, 4, I
|
|
+ fclr t1
|
|
+ ble I, $L15
|
|
+
|
|
+ fclr t2
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ fclr t3
|
|
+ LD x1, 1 * SIZE(X)
|
|
+
|
|
+ LD x2, 2 * SIZE(X)
|
|
+ LD x3, 3 * SIZE(X)
|
|
+ LD x4, 4 * SIZE(X)
|
|
+ LD x5, 5 * SIZE(X)
|
|
+ LD x6, 6 * SIZE(X)
|
|
+ LD x7, 7 * SIZE(X)
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L11:
|
|
+ faddd a0, t0,$f24
|
|
+ fmov $f24,a0
|
|
+ fillcs (PREFETCH_SIZE) * SIZE(X)
|
|
+ fmuld x0, x0, t0
|
|
+ LD x0, 8 * SIZE(X)
|
|
+
|
|
+ faddd a1, t1,$f24
|
|
+ fmov $f24,a1
|
|
+ mov X, XX
|
|
+ fmuld x1, x1, t1
|
|
+ LD x1, 9 * SIZE(X)
|
|
+
|
|
+ faddd a2, t2, $f24
|
|
+ fmov $f24,a2
|
|
+ #unop
|
|
+ fmuld x2, x2, t2
|
|
+ LD x2, 10 * SIZE(X)
|
|
+
|
|
+ faddd a3, t3, $f24
|
|
+ fmov $f24,a3
|
|
+ #unop
|
|
+ fmuld x3, x3, t3
|
|
+ LD x3, 11 * SIZE(X)
|
|
+
|
|
+ faddd a0, t0, $f24
|
|
+ fmov $f24,a0
|
|
+ #unop
|
|
+ fmuld x4, x4, t0
|
|
+ LD x4, 12 * SIZE(X)
|
|
+
|
|
+ faddd a1, t1, $f24
|
|
+ fmov $f24,a1
|
|
+ #unop
|
|
+ fmuld x5, x5, t1
|
|
+ LD x5, 13 * SIZE(X)
|
|
+
|
|
+ faddd a2, t2, $f24
|
|
+ fmov $f24,a2
|
|
+ #unop
|
|
+ fmuld x6, x6, t2
|
|
+ LD x6, 14 * SIZE(X)
|
|
+
|
|
+ faddd a3, t3, $f24
|
|
+ fmov $f24,a3
|
|
+ #unop
|
|
+ fmuld x7, x7, t3
|
|
+ LD x7, 15 * SIZE(X)
|
|
+
|
|
+ faddd a0, t0, $f24
|
|
+ fmov $f24,a0
|
|
+ #unop
|
|
+ fmuld x0, x0, t0
|
|
+ LD x0, 16 * SIZE(X)
|
|
+
|
|
+ faddd a1, t1, $f24
|
|
+ fmov $f24,a1
|
|
+ ldi X, 16 * SIZE(X)
|
|
+ fmuld x1, x1, t1
|
|
+ LD x1, 17 * SIZE(XX)
|
|
+
|
|
+ faddd a2, t2, $f24
|
|
+ fmov $f24,a2
|
|
+ #unop
|
|
+ fmuld x2, x2, t2
|
|
+ LD x2, 18 * SIZE(XX)
|
|
+
|
|
+ faddd a3, t3, $f24
|
|
+ fmov $f24,a3
|
|
+ #unop
|
|
+ fmuld x3, x3, t3
|
|
+ LD x3, 19 * SIZE(XX)
|
|
+
|
|
+ faddd a0, t0, $f24
|
|
+ fmov $f24,a0
|
|
+ unop
|
|
+ fmuld x4, x4, t0
|
|
+ LD x4, 20 * SIZE(XX)
|
|
+
|
|
+ faddd a1, t1, $f24
|
|
+ fmov $f24,a1
|
|
+ ldi I, -1(I)
|
|
+ fmuld x5, x5, t1
|
|
+ LD x5, 21 * SIZE(XX)
|
|
+
|
|
+ faddd a2, t2, $f24
|
|
+ fmov $f24,a2
|
|
+ #unop
|
|
+ fmuld x6, x6, t2
|
|
+ LD x6, 22 * SIZE(XX)
|
|
+
|
|
+ faddd a3, t3, $f24
|
|
+ fmov $f24,a3
|
|
+ fmuld x7, x7, t3
|
|
+ LD x7, 23 * SIZE(XX)
|
|
+ bgt I, $L11
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ faddd a0, t0,$f24
|
|
+ fmov $f24,a0
|
|
+ mov X, XX
|
|
+ fmuld x0, x0, t0
|
|
+ LD x0, 8 * SIZE(X)
|
|
+
|
|
+ faddd a1, t1, $f24
|
|
+ fmov $f24,a1
|
|
+ #unop
|
|
+ fmuld x1, x1, t1
|
|
+ LD x1, 9 * SIZE(X)
|
|
+
|
|
+ faddd a2, t2, $f24
|
|
+ fmov $f24,a2
|
|
+ #unop
|
|
+ fmuld x2, x2, t2
|
|
+ LD x2, 10 * SIZE(X)
|
|
+
|
|
+ faddd a3, t3, $f24
|
|
+ fmov $f24,a3
|
|
+ #unop
|
|
+ fmuld x3, x3, t3
|
|
+ LD x3, 11 * SIZE(X)
|
|
+
|
|
+ faddd a0, t0, $f24
|
|
+ fmov $f24,a0
|
|
+ #unop
|
|
+ fmuld x4, x4, t0
|
|
+ LD x4, 12 * SIZE(XX)
|
|
+
|
|
+ faddd a1, t1,$f24
|
|
+ fmov $f24,a1
|
|
+ unop
|
|
+ fmuld x5, x5, t1
|
|
+ LD x5, 13 * SIZE(XX)
|
|
+
|
|
+ faddd a2, t2,$f24
|
|
+ fmov $f24,a2
|
|
+ unop
|
|
+ fmuld x6, x6, t2
|
|
+ LD x6, 14 * SIZE(XX)
|
|
+
|
|
+ faddd a3, t3, $f24
|
|
+ fmov $f24,a3
|
|
+ ldi X, 16 * SIZE(X)
|
|
+ fmuld x7, x7, t3
|
|
+ LD x7, 15 * SIZE(XX)
|
|
+
|
|
+ faddd a0, t0, $f24
|
|
+ fmov $f24,a0
|
|
+ fmuld x0, x0, t0
|
|
+ faddd a1, t1, $f24
|
|
+ fmov $f24,a1
|
|
+ fmuld x1, x1, t1
|
|
+
|
|
+ faddd a2, t2, $f24
|
|
+ fmov $f24,a2
|
|
+ fmuld x2, x2, t2
|
|
+ faddd a3, t3, $f24
|
|
+ fmov $f24,a3
|
|
+ fmuld x3, x3, t3
|
|
+
|
|
+ faddd a0, t0, $f24
|
|
+ fmov $f24,a0
|
|
+ fmuld x4, x4, t0
|
|
+ faddd a1, t1, $f24
|
|
+ fmov $f24,a1
|
|
+ fmuld x5, x5, t1
|
|
+
|
|
+ faddd a2, t2, $f24
|
|
+ fmov $f24,a2
|
|
+ fmuld x6, x6, t2
|
|
+ faddd a3, t3, $f24
|
|
+ fmov $f24,a3
|
|
+ fmuld x7, x7, t3
|
|
+
|
|
+ faddd a1, t1, $f24
|
|
+ fmov $f24,a1
|
|
+ faddd a2, t2, $f24
|
|
+ fmov $f24,a2
|
|
+ faddd a3, t3, $f24
|
|
+ fmov $f24,a3
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and N, 15, I
|
|
+ ble I, $L998
|
|
+ .align 4
|
|
+
|
|
+$L16:
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ ldi X, 1 * SIZE(X)
|
|
+
|
|
+ faddd a0, t0, $f24
|
|
+ fmov $f24,a0
|
|
+ fmuld x0, x0, t0
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L16
|
|
+ bsr $31, $L998
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ fclr t0
|
|
+ sra N, 3, I
|
|
+ fclr t1
|
|
+ ble I, $L25
|
|
+
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD x1, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD x2, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD x3, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD x4, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD x5, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD x6, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L22
|
|
+ .align 4
|
|
+
|
|
+$L21:
|
|
+ faddd a0, t0,$f24
|
|
+ fmov $f24,a0
|
|
+ LD x7, 0 * SIZE(X)
|
|
+ fmuld x0, x0, t0
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a1, t1, $f24
|
|
+ fmov $f24,a1
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ fmuld x1, x1, t1
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a2, t2, $f24
|
|
+ fmov $f24,a2
|
|
+ LD x1, 0 * SIZE(X)
|
|
+ fmuld x2, x2, t2
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a3, t3, $f24
|
|
+ fmov $f24,a3
|
|
+ LD x2, 0 * SIZE(X)
|
|
+ fmuld x3, x3, t3
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a0, t0, $f24
|
|
+ fmov $f24,a0
|
|
+ LD x3, 0 * SIZE(X)
|
|
+ fmuld x4, x4, t0
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a1, t1, $f24
|
|
+ fmov $f24,a1
|
|
+ LD x4, 0 * SIZE(X)
|
|
+ fmuld x5, x5, t1
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a2, t2, $f24
|
|
+ fmov $f24,a2
|
|
+ LD x5, 0 * SIZE(X)
|
|
+ fmuld x6, x6, t2
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a3, t3, $f24
|
|
+ fmov $f24,a3
|
|
+ LD x6, 0 * SIZE(X)
|
|
+ fmuld x7, x7, t3
|
|
+ addl X, INCX, X
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L21
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ faddd a0, t0, $f24
|
|
+ fmov $f24,a0
|
|
+ LD x7, 0 * SIZE(X)
|
|
+ fmuld x0, x0, t0
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a1, t1, $f24
|
|
+ fmov $f24,a1
|
|
+ unop
|
|
+ fmuld x1, x1, t1
|
|
+ unop
|
|
+
|
|
+ faddd a2, t2, $f24
|
|
+ fmov $f24,a2
|
|
+ fmuld x2, x2, t2
|
|
+ faddd a3, t3, $f24
|
|
+ fmov $f24,a3
|
|
+ fmuld x3, x3, t3
|
|
+
|
|
+ faddd a0, t0, $f24
|
|
+ fmov $f24,a0
|
|
+ fmuld x4, x4, t0
|
|
+ faddd a1, t1, $f24
|
|
+ fmov $f24,a1
|
|
+ fmuld x5, x5, t1
|
|
+
|
|
+ faddd a2, t2, $f24
|
|
+ fmov $f24,a2
|
|
+ fmuld x6, x6, t2
|
|
+ faddd a3, t3, $f24
|
|
+ fmov $f24,a3
|
|
+ fmuld x7, x7, t3
|
|
+
|
|
+ faddd a1, t1, $f24
|
|
+ fmov $f24,a1
|
|
+ faddd a2, t2, $f24
|
|
+ fmov $f24,a2
|
|
+ faddd a3, t3, $f24
|
|
+ fmov $f24,a3
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ and N, 7, I
|
|
+ ble I, $L998
|
|
+ .align 4
|
|
+
|
|
+$L26:
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a0, t0,$f24
|
|
+ fmov $f24,a0
|
|
+ fmuld x0, x0, t0
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L26
|
|
+ .align 4
|
|
+
|
|
+
|
|
+$L998:
|
|
+ faddd a0, t0, $f24
|
|
+ fmov $f24,a0
|
|
+
|
|
+ faddd a0, a1, $f24
|
|
+	fmov $f24,a0	# a0 = a0 + a1; the partial sum in a1 must feed the final square root
|
|
+ faddd a2, a3, $f24
|
|
+ fmov $f24,a2
|
|
+
|
|
+#if defined(EV4) || defined(EV5)
|
|
+ faddd a0, a2, $f16
|
|
+ jsr $26, ($27), sqrt !lituse_jsr!2
|
|
+
|
|
+ ldih $29, 0($26) !gpdisp!3
|
|
+ ldi $29, 0($29) !gpdisp!3
|
|
+#else
|
|
+ faddd a0, a2, $f24
|
|
+ fsqrtd $f24, a0
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+#if defined(EV4) || defined(EV5)
|
|
+ ldl $26, 0($sp)
|
|
+ ldi $sp, 16($sp)
|
|
+#endif
|
|
+ ret
|
|
+ EPILOGUE
diff --git a/kernel/sw_64/dnrm2.S.bak b/kernel/sw_64/dnrm2.S.bak
new file mode 100644
index 0000000..753c90b
--- /dev/null
+++ b/kernel/sw_64/dnrm2.S.bak
@@ -0,0 +1,431 @@
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCH_SIZE 80
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+#define XX $19
|
|
+
|
|
+#define I $0
|
|
+
|
|
+#define a0 $f0
|
|
+#define a1 $f1
|
|
+#define a2 $f10
|
|
+#define a3 $f11
|
|
+#define t0 $f12
|
|
+#define t1 $f13
|
|
+#define t2 $f14
|
|
+#define t3 $f15
|
|
+
|
|
+#define x0 $f16
|
|
+#define x1 $f17
|
|
+#define x2 $f18
|
|
+#define x3 $f19
|
|
+#define x4 $f20
|
|
+#define x5 $f21
|
|
+#define x6 $f22
|
|
+#define x7 $f23
|
|
+
|
|
+ PROLOGUE
|
|
+
|
|
+#if defined(EV4) || defined(EV5)
|
|
+ .frame $30,16,$26,0
|
|
+ .mask 0x4000000,-16
|
|
+ ldih $29, 0($27) !gpdisp!1
|
|
+ ldi $29, 0($29) !gpdisp!1
|
|
+
|
|
+ ldi $sp, -16($sp)
|
|
+ ldl $27, sqrt($29) !literal!2
|
|
+ stq $26, 0($sp)
|
|
+
|
|
+ PROFCODE
|
|
+ .prologue 1
|
|
+#else
|
|
+ PROFCODE
|
|
+#endif
|
|
+
|
|
+ fclr a0
|
|
+ SXADDQ INCX, 0, INCX
|
|
+ fclr a1
|
|
+ ble N, $L999
|
|
+
|
|
+ fclr a2
|
|
+ cmpeq INCX, SIZE, $0
|
|
+ fclr a3
|
|
+ beq $0, $L20
|
|
+
|
|
+ fclr t0
|
|
+ sra N, 4, I
|
|
+ fclr t1
|
|
+ ble I, $L15
|
|
+
|
|
+ fclr t2
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ fclr t3
|
|
+ LD x1, 1 * SIZE(X)
|
|
+
|
|
+ LD x2, 2 * SIZE(X)
|
|
+ LD x3, 3 * SIZE(X)
|
|
+ LD x4, 4 * SIZE(X)
|
|
+ LD x5, 5 * SIZE(X)
|
|
+ LD x6, 6 * SIZE(X)
|
|
+ LD x7, 7 * SIZE(X)
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L11:
|
|
+ faddd a0, t0, a0
|
|
+ fillcs (PREFETCH_SIZE) * SIZE(X)
|
|
+ fmuld x0, x0, t0
|
|
+ LD x0, 8 * SIZE(X)
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ mov X, XX
|
|
+ fmuld x1, x1, t1
|
|
+ LD x1, 9 * SIZE(X)
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ unop
|
|
+ fmuld x2, x2, t2
|
|
+ LD x2, 10 * SIZE(X)
|
|
+
|
|
+ faddd a3, t3, a3
|
|
+ unop
|
|
+ fmuld x3, x3, t3
|
|
+ LD x3, 11 * SIZE(X)
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ unop
|
|
+ fmuld x4, x4, t0
|
|
+ LD x4, 12 * SIZE(X)
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ unop
|
|
+ fmuld x5, x5, t1
|
|
+ LD x5, 13 * SIZE(X)
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ unop
|
|
+ fmuld x6, x6, t2
|
|
+ LD x6, 14 * SIZE(X)
|
|
+
|
|
+ faddd a3, t3, a3
|
|
+ unop
|
|
+ fmuld x7, x7, t3
|
|
+ LD x7, 15 * SIZE(X)
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ unop
|
|
+ fmuld x0, x0, t0
|
|
+ LD x0, 16 * SIZE(X)
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ ldi X, 16 * SIZE(X)
|
|
+ fmuld x1, x1, t1
|
|
+ LD x1, 17 * SIZE(XX)
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ unop
|
|
+ fmuld x2, x2, t2
|
|
+ LD x2, 18 * SIZE(XX)
|
|
+
|
|
+ faddd a3, t3, a3
|
|
+ unop
|
|
+ fmuld x3, x3, t3
|
|
+ LD x3, 19 * SIZE(XX)
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ unop
|
|
+ fmuld x4, x4, t0
|
|
+ LD x4, 20 * SIZE(XX)
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ ldi I, -1(I)
|
|
+ fmuld x5, x5, t1
|
|
+ LD x5, 21 * SIZE(XX)
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ unop
|
|
+ fmuld x6, x6, t2
|
|
+ LD x6, 22 * SIZE(XX)
|
|
+
|
|
+ faddd a3, t3, a3
|
|
+ fmuld x7, x7, t3
|
|
+ LD x7, 23 * SIZE(XX)
|
|
+ bgt I, $L11
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ faddd a0, t0, a0
|
|
+ mov X, XX
|
|
+ fmuld x0, x0, t0
|
|
+ LD x0, 8 * SIZE(X)
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ unop
|
|
+ fmuld x1, x1, t1
|
|
+ LD x1, 9 * SIZE(X)
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ unop
|
|
+ fmuld x2, x2, t2
|
|
+ LD x2, 10 * SIZE(X)
|
|
+
|
|
+ faddd a3, t3, a3
|
|
+ unop
|
|
+ fmuld x3, x3, t3
|
|
+ LD x3, 11 * SIZE(X)
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ unop
|
|
+ fmuld x4, x4, t0
|
|
+ LD x4, 12 * SIZE(XX)
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ unop
|
|
+ fmuld x5, x5, t1
|
|
+ LD x5, 13 * SIZE(XX)
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ unop
|
|
+ fmuld x6, x6, t2
|
|
+ LD x6, 14 * SIZE(XX)
|
|
+
|
|
+ faddd a3, t3, a3
|
|
+ ldi X, 16 * SIZE(X)
|
|
+ fmuld x7, x7, t3
|
|
+ LD x7, 15 * SIZE(XX)
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ fmuld x0, x0, t0
|
|
+ faddd a1, t1, a1
|
|
+ fmuld x1, x1, t1
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ fmuld x2, x2, t2
|
|
+ faddd a3, t3, a3
|
|
+ fmuld x3, x3, t3
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ fmuld x4, x4, t0
|
|
+ faddd a1, t1, a1
|
|
+ fmuld x5, x5, t1
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ fmuld x6, x6, t2
|
|
+ faddd a3, t3, a3
|
|
+ fmuld x7, x7, t3
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ faddd a2, t2, a2
|
|
+ faddd a3, t3, a3
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and N, 15, I
|
|
+ ble I, $L998
|
|
+ .align 4
|
|
+
|
|
+$L16:
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ ldi X, 1 * SIZE(X)
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ fmuld x0, x0, t0
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L16
|
|
+ bsr $31, $L998
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ fclr t0
|
|
+ sra N, 3, I
|
|
+ fclr t1
|
|
+ ble I, $L25
|
|
+
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD x1, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD x2, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD x3, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD x4, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD x5, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD x6, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L22
|
|
+ .align 4
|
|
+
|
|
+$L21:
|
|
+ faddd a0, t0, a0
|
|
+ LD x7, 0 * SIZE(X)
|
|
+ fmuld x0, x0, t0
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ fmuld x1, x1, t1
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ LD x1, 0 * SIZE(X)
|
|
+ fmuld x2, x2, t2
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a3, t3, a3
|
|
+ LD x2, 0 * SIZE(X)
|
|
+ fmuld x3, x3, t3
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ LD x3, 0 * SIZE(X)
|
|
+ fmuld x4, x4, t0
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ LD x4, 0 * SIZE(X)
|
|
+ fmuld x5, x5, t1
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ LD x5, 0 * SIZE(X)
|
|
+ fmuld x6, x6, t2
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a3, t3, a3
|
|
+ LD x6, 0 * SIZE(X)
|
|
+ fmuld x7, x7, t3
|
|
+ addl X, INCX, X
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L21
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ faddd a0, t0, a0
|
|
+ LD x7, 0 * SIZE(X)
|
|
+ fmuld x0, x0, t0
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ unop
|
|
+ fmuld x1, x1, t1
|
|
+ unop
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ fmuld x2, x2, t2
|
|
+ faddd a3, t3, a3
|
|
+ fmuld x3, x3, t3
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ fmuld x4, x4, t0
|
|
+ faddd a1, t1, a1
|
|
+ fmuld x5, x5, t1
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ fmuld x6, x6, t2
|
|
+ faddd a3, t3, a3
|
|
+ fmuld x7, x7, t3
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ faddd a2, t2, a2
|
|
+ faddd a3, t3, a3
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ and N, 7, I
|
|
+ ble I, $L998
|
|
+ .align 4
|
|
+
|
|
+$L26:
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ fmuld x0, x0, t0
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L26
|
|
+ .align 4
|
|
+
|
|
+
|
|
+$L998:
|
|
+ faddd a0, t0, a0
|
|
+
|
|
+ faddd a0, a1, a0
|
|
+ faddd a2, a3, a2
|
|
+
|
|
+#if defined(EV4) || defined(EV5)
|
|
+ faddd a0, a2, $f16
|
|
+ jsr $26, ($27), sqrt !lituse_jsr!2
|
|
+
|
|
+ ldih $29, 0($26) !gpdisp!3
|
|
+ ldi $29, 0($29) !gpdisp!3
|
|
+#else
|
|
+ faddd a0, a2, a0
|
|
+ fsqrtd a0, a0
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+#if defined(EV4) || defined(EV5)
|
|
+ ldl $26, 0($sp)
|
|
+ ldi $sp, 16($sp)
|
|
+#endif
|
|
+ ret
|
|
+ EPILOGUE
diff --git a/kernel/sw_64/dot.S b/kernel/sw_64/dot.S
new file mode 100644
index 0000000..513eada
--- /dev/null
+++ b/kernel/sw_64/dot.S
@@ -0,0 +1,607 @@
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCHSIZE 88
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+#define Y $19
|
|
+#define INCY $20
|
|
+
|
|
+#define I $5
|
|
+
|
|
+#define s0 $f0
|
|
+#define s1 $f30
|
|
+#define s2 $f1
|
|
+#define s3 $f2
|
|
+
|
|
+#define a0 $f10
|
|
+#define a1 $f11
|
|
+#define a2 $f12
|
|
+#define a3 $f13
|
|
+#define a4 $f14
|
|
+#define a5 $f15
|
|
+#define a6 $f16
|
|
+#define a7 $f17
|
|
+
|
|
+#define b0 $f18
|
|
+#define b1 $f19
|
|
+#define b2 $f20
|
|
+#define b3 $f21
|
|
+#define b4 $f22
|
|
+#define b5 $f23
|
|
+#define b6 $f24
|
|
+#define b7 $f25
|
|
+
|
|
+#define t0 $f26
|
|
+#define t1 $f27
|
|
+#define t2 $f28
|
|
+#define t3 $f29
|
|
+
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+ .frame $sp, 16, $26, 0
|
|
+
|
|
+ ldi $sp, -16($sp)
|
|
+ fclr s0
|
|
+ fstd $f2, 0($sp)
|
|
+#ifndef ZYX20220111
|
|
+ fstd $f3, 8($sp)
|
|
+#endif
|
|
+ fclr s1
|
|
+
|
|
+ fclr s2
|
|
+ nop
|
|
+ fclr s3
|
|
+ ble N, $L999
|
|
+
|
|
+ fclr t0
|
|
+ cmpeq INCX, 1, $21
|
|
+ fclr t1
|
|
+ cmpeq INCY, 1, $22
|
|
+ fclr t2
|
|
+ and $21, $22, $22
|
|
+ fclr t3
|
|
+ beq $22, $L20
|
|
+
|
|
+#ifndef DOUBLE
|
|
+ srl N, 4, I
|
|
+ ble I, $L15
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ LD a1, 1 * SIZE(X)
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ LD b1, 1 * SIZE(Y)
|
|
+
|
|
+ LD a2, 2 * SIZE(X)
|
|
+ LD a3, 3 * SIZE(X)
|
|
+ LD b2, 2 * SIZE(Y)
|
|
+ LD b3, 3 * SIZE(Y)
|
|
+
|
|
+ LD a4, 4 * SIZE(X)
|
|
+ LD a5, 5 * SIZE(X)
|
|
+ LD b4, 4 * SIZE(Y)
|
|
+ LD b5, 5 * SIZE(Y)
|
|
+
|
|
+ LD a6, 6 * SIZE(X)
|
|
+ LD a7, 7 * SIZE(X)
|
|
+ addl X, 16 * SIZE, X
|
|
+ subl I, 1, I
|
|
+
|
|
+ addl Y, 16 * SIZE, Y
|
|
+ ble I, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ fillcs PREFETCHSIZE * 2 * SIZE(X)
|
|
+ subl I, 1, I
|
|
+ fillcs PREFETCHSIZE * 2 * SIZE(Y)
|
|
+ addl X, 16 * SIZE, X
|
|
+
|
|
+ ADD s0, t0, $f3
|
|
+ fmov $f3, s0
|
|
+ LD b6, -10 * SIZE(Y)
|
|
+ MUL a0, b0, t0
|
|
+ LD b7, -9 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, $f3
|
|
+ fmov $f3, s1
|
|
+ LD a0, -24 * SIZE(X)
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, -23 * SIZE(X)
|
|
+
|
|
+ ADD s2, t2, $f3
|
|
+ fmov $f3, s2
|
|
+ LD b0, -8 * SIZE(Y)
|
|
+ MUL a2, b2, $f3
|
|
+ fmov $f3, t2
|
|
+ LD b1, -7 * SIZE(Y)
|
|
+
|
|
+ ADD s3, t3, $f3
|
|
+ fmov $f3, s3
|
|
+ LD a2, -22 * SIZE(X)
|
|
+ MUL a3, b3, t3
|
|
+ LD a3, -21 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, $f3
|
|
+ fmov $f3, s0
|
|
+ LD b2, -6 * SIZE(Y)
|
|
+ MUL a4, b4, t0
|
|
+ LD b3, -5 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, $f3
|
|
+ fmov $f3, s1
|
|
+ LD a4, -20 * SIZE(X)
|
|
+ MUL a5, b5, t1
|
|
+ LD a5, -19 * SIZE(X)
|
|
+
|
|
+ ADD s2, t2, $f3
|
|
+ fmov $f3, s2
|
|
+ LD b4, -4 * SIZE(Y)
|
|
+ MUL a6, b6, t2
|
|
+ LD b5, -3 * SIZE(Y)
|
|
+
|
|
+ ADD s3, t3, $f3
|
|
+ fmov $f3, s3
|
|
+ LD a6, -18 * SIZE(X)
|
|
+ MUL a7, b7, t3
|
|
+ LD a7, -17 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, $f3
|
|
+ fmov $f3, s0
|
|
+ LD b6, -2 * SIZE(Y)
|
|
+ MUL a0, b0, t0
|
|
+ LD b7, -1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, $f3
|
|
+ fmov $f3, s1
|
|
+ LD a0, -16 * SIZE(X)
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, -15 * SIZE(X)
|
|
+
|
|
+ ADD s2, t2, $f3
|
|
+ fmov $f3, s2
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ MUL a2, b2, t2
|
|
+ LD b1, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s3, t3, $f3
|
|
+ fmov $f3, s3
|
|
+ LD a2, -14 * SIZE(X)
|
|
+ MUL a3, b3, t3
|
|
+ LD a3, -13 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, $f3
|
|
+ fmov $f3, s0
|
|
+ LD b2, 2 * SIZE(Y)
|
|
+ MUL a4, b4, t0
|
|
+ LD b3, 3 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, $f3
|
|
+ fmov $f3, s1
|
|
+ LD a4, -12 * SIZE(X)
|
|
+ MUL a5, b5, t1
|
|
+ LD a5, -11 * SIZE(X)
|
|
+
|
|
+ ADD s2, t2, $f3
|
|
+ fmov $f3, s2
|
|
+ LD b4, 4 * SIZE(Y)
|
|
+ MUL a6, b6, t2
|
|
+ LD b5, 5 * SIZE(Y)
|
|
+
|
|
+ ADD s3, t3, $f3
|
|
+ fmov $f3, s3
|
|
+ LD a6, -10 * SIZE(X)
|
|
+ MUL a7, b7, t3
|
|
+ LD a7, -9 * SIZE(X)
|
|
+
|
|
+ addl Y, 16 * SIZE, Y
|
|
+ bgt I, $L12
|
|
+ nop
|
|
+ fnop
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ ADD s0, t0, $f3
|
|
+ fmov $f3, s0
|
|
+ LD b6,-10 * SIZE(Y)
|
|
+ MUL a0, b0, t0
|
|
+ LD b7, -9 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, $f3
|
|
+ fmov $f3, s1
|
|
+ LD a0, -8 * SIZE(X)
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, -7 * SIZE(X)
|
|
+
|
|
+ ADD s2, t2, $f3
|
|
+ fmov $f3, s2
|
|
+ LD b0, -8 * SIZE(Y)
|
|
+ MUL a2, b2, t2
|
|
+ LD b1, -7 * SIZE(Y)
|
|
+
|
|
+ ADD s3, t3, $f3
|
|
+ fmov $f3, s3
|
|
+ LD a2, -6 * SIZE(X)
|
|
+ MUL a3, b3, t3
|
|
+ LD a3, -5 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, $f3
|
|
+ fmov $f3, s0
|
|
+ LD b2, -6 * SIZE(Y)
|
|
+ MUL a4, b4, t0
|
|
+ LD b3, -5 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, $f3
|
|
+ fmov $f3, s1
|
|
+ LD a4, -4 * SIZE(X)
|
|
+ MUL a5, b5, t1
|
|
+ LD a5, -3 * SIZE(X)
|
|
+
|
|
+ ADD s2, t2, $f3
|
|
+ fmov $f3, s2
|
|
+ LD b4, -4 * SIZE(Y)
|
|
+ MUL a6, b6, t2
|
|
+ LD b5, -3 * SIZE(Y)
|
|
+
|
|
+ ADD s3, t3, $f3
|
|
+ fmov $f3, s3
|
|
+ LD a6, -2 * SIZE(X)
|
|
+ MUL a7, b7, t3
|
|
+ LD a7, -1 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, $f3
|
|
+ fmov $f3, s0
|
|
+ LD b6, -2 * SIZE(Y)
|
|
+ MUL a0, b0, t0
|
|
+ LD b7, -1 * SIZE(Y)
|
|
+ ADD s1, t1, $f3
|
|
+ fmov $f3, s1
|
|
+ MUL a1, b1, t1
|
|
+
|
|
+ ADD s2, t2, $f3
|
|
+ fmov $f3, s2
|
|
+ MUL a2, b2, t2
|
|
+ ADD s3, t3, $f3
|
|
+ fmov $f3, s3
|
|
+ MUL a3, b3, t3
|
|
+
|
|
+ ADD s0, t0, $f3
|
|
+ fmov $f3, s0
|
|
+ MUL a4, b4, t0
|
|
+ ADD s1, t1, $f3
|
|
+ fmov $f3, s1
|
|
+ MUL a5, b5, t1
|
|
+ ADD s2, t2, $f3
|
|
+ fmov $f3, s2
|
|
+ MUL a6, b6, t2
|
|
+ ADD s3, t3, $f3
|
|
+ fmov $f3, s3
|
|
+ MUL a7, b7, t3
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ ADD s0, t0, $f3
|
|
+ fmov $f3, s0
|
|
+ and N, 15, I
|
|
+ ADD s1, t1, $f3
|
|
+ fmov $f3, s1
|
|
+ ble I, $L18
|
|
+ .align 4
|
|
+
|
|
+#else
|
|
+
|
|
+ srl N, 3, I
|
|
+ ble I, $L15
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ LD a1, 1 * SIZE(X)
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ LD b1, 1 * SIZE(Y)
|
|
+
|
|
+ LD a2, 2 * SIZE(X)
|
|
+ LD a3, 3 * SIZE(X)
|
|
+ LD b2, 2 * SIZE(Y)
|
|
+ LD b3, 3 * SIZE(Y)
|
|
+
|
|
+ LD a4, 4 * SIZE(X)
|
|
+ LD a5, 5 * SIZE(X)
|
|
+ LD b4, 4 * SIZE(Y)
|
|
+ LD b5, 5 * SIZE(Y)
|
|
+
|
|
+ LD a6, 6 * SIZE(X)
|
|
+ LD a7, 7 * SIZE(X)
|
|
+ addl X, 8 * SIZE, X
|
|
+ subl I, 1, I
|
|
+
|
|
+ addl Y, 8 * SIZE, Y
|
|
+ ble I, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ subl I, 1, I
|
|
+ fillcs PREFETCHSIZE * SIZE(Y)
|
|
+ addl X, 8 * SIZE, X
|
|
+
|
|
+ ADD s0, t0, $f3
|
|
+ fmov $f3, s0
|
|
+ LD b6, -2 * SIZE(Y)
|
|
+ MUL a0, b0, t0
|
|
+ LD b7, -1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, $f3
|
|
+ fmov $f3, s1
|
|
+ LD a0, -8 * SIZE(X)
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, -7 * SIZE(X)
|
|
+
|
|
+ ADD s2, t2, $f3
|
|
+ fmov $f3, s2
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ MUL a2, b2, t2
|
|
+ LD b1, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s3, t3, $f3
|
|
+ fmov $f3, s3
|
|
+ LD a2, -6 * SIZE(X)
|
|
+ MUL a3, b3, t3
|
|
+ LD a3, -5 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, $f3
|
|
+ fmov $f3, s0
|
|
+ LD b2, 2 * SIZE(Y)
|
|
+ MUL a4, b4, t0
|
|
+ LD b3, 3 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, $f3
|
|
+ fmov $f3, s1
|
|
+ LD a4, -4 * SIZE(X)
|
|
+ MUL a5, b5, t1
|
|
+ LD a5, -3 * SIZE(X)
|
|
+
|
|
+ ADD s2, t2, $f3
|
|
+ fmov $f3, s2
|
|
+ LD b4, 4 * SIZE(Y)
|
|
+ MUL a6, b6, t2
|
|
+ LD b5, 5 * SIZE(Y)
|
|
+
|
|
+ ADD s3, t3, $f3
|
|
+ fmov $f3, s3
|
|
+ LD a6, -2 * SIZE(X)
|
|
+ MUL a7, b7, t3
|
|
+ LD a7, -1 * SIZE(X)
|
|
+
|
|
+ addl Y, 8 * SIZE, Y
|
|
+ bgt I, $L12
|
|
+ nop
|
|
+ fnop
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ ADD s0, t0, $f3
|
|
+ fmov $f3, s0
|
|
+ LD b6, -2 * SIZE(Y)
|
|
+ MUL a0, b0, t0
|
|
+ LD b7, -1 * SIZE(Y)
|
|
+ ADD s1, t1, $f3
|
|
+ fmov $f3, s1
|
|
+ MUL a1, b1, t1
|
|
+
|
|
+ ADD s2, t2, $f3
|
|
+ fmov $f3, s2
|
|
+ MUL a2, b2, t2
|
|
+ ADD s3, t3, $f3
|
|
+ fmov $f3, s3
|
|
+ MUL a3, b3, t3
|
|
+
|
|
+ ADD s0, t0, $f3
|
|
+ fmov $f3, s0
|
|
+ MUL a4, b4, t0
|
|
+ ADD s1, t1, $f3
|
|
+ fmov $f3, s1
|
|
+ MUL a5, b5, t1
|
|
+ ADD s2, t2, $f3
|
|
+ fmov $f3, s2
|
|
+ MUL a6, b6, t2
|
|
+ ADD s3, t3, $f3
|
|
+ fmov $f3, s3
|
|
+ MUL a7, b7, t3
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ ADD s0, t0, $f3
|
|
+ fmov $f3, s0
|
|
+ and N, 7, I
|
|
+ ADD s1, t1, $f3
|
|
+ fmov $f3, s1
|
|
+ ble I, $L18
|
|
+ .align 4
|
|
+
|
|
+#endif
|
|
+
|
|
+$L16:
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ addl X, SIZE, X
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ addl Y, SIZE, Y
|
|
+
|
|
+ ADD s2, t2, $f3
|
|
+ fmov $f3, s2
|
|
+ MUL a0, b0, t2
|
|
+ subl I, 1, I
|
|
+ bgt I, $L16
|
|
+ .align 4
|
|
+
|
|
+$L18:
|
|
+ ADD s2, t2, $f3
|
|
+ fmov $f3, s2
|
|
+ ADD s3, t3, $f3
|
|
+ fmov $f3, s3
|
|
+ br $L999
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ srl N, 2, I
|
|
+ ble I, $L25
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+ LD a1, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD b1, 0 * SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD b2, 0 * SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+ LD a3, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD b3, 0 * SIZE(Y)
|
|
+ subl I, 1, I
|
|
+
|
|
+ SXADDQ INCY, Y, Y
|
|
+ ble I, $L23
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ ADD s0, t0, $f3
|
|
+ fmov $f3, s0
|
|
+ MUL a0, b0, t0
|
|
+ ADD s1, t1, $f3
|
|
+ fmov $f3, s1
|
|
+ MUL a1, b1, t1
|
|
+ ADD s2, t2, $f3
|
|
+ fmov $f3, s2
|
|
+ MUL a2, b2, t2
|
|
+ ADD s3, t3, $f3
|
|
+ fmov $f3, s3
|
|
+ MUL a3, b3, t3
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+ LD a1, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD b1, 0 * SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD b2, 0 * SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+ LD a3, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD b3, 0 * SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ subl I, 1, I
|
|
+ bgt I, $L22
|
|
+ nop
|
|
+ fnop
|
|
+ .align 4
|
|
+
|
|
+$L23:
|
|
+ ADD s0, t0, $f3
|
|
+ fmov $f3, s0
|
|
+ MUL a0, b0, t0
|
|
+ ADD s1, t1, $f3
|
|
+ fmov $f3, s1
|
|
+ MUL a1, b1, t1
|
|
+ ADD s2, t2, $f3
|
|
+ fmov $f3, s2
|
|
+ MUL a2, b2, t2
|
|
+ ADD s3, t3, $f3
|
|
+ fmov $f3, s3
|
|
+ MUL a3, b3, t3
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ ADD s0, t0, $f3
|
|
+ fmov $f3, s0
|
|
+ and N, 3, I
|
|
+ ADD s1, t1, $f3
|
|
+ fmov $f3, s1
|
|
+ ble I, $L28
|
|
+ .align 4
|
|
+
|
|
+$L26:
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, $f3
|
|
+ fmov $f3, s2
|
|
+ MUL a0, b0, t2
|
|
+ subl I, 1, I
|
|
+ bgt I, $L26
|
|
+ .align 4
|
|
+
|
|
+$L28:
|
|
+ ADD s2, t2, $f3
|
|
+ fmov $f3, s2
|
|
+ ADD s3, t3, $f3
|
|
+ fmov $f3, s3
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ ADD s2, s3, $f3
|
|
+ fmov $f3, s2
|
|
+ fldd $f2, 0($sp)
|
|
+ ADD s0, s1, $f3
|
|
+ fmov $f3, s0
|
|
+ ADD s0, s2, $f3
|
|
+ fmov $f3, s0
|
|
+#ifndef ZYX20220111
|
|
+ fldd $f3, 8($sp)
|
|
+ ldi $sp, 16($sp)
|
|
+#endif
|
|
+ ret
|
|
+
|
|
+ EPILOGUE
diff --git a/kernel/sw_64/dot.S.bak b/kernel/sw_64/dot.S.bak
new file mode 100644
index 0000000..cd96e21
--- /dev/null
+++ b/kernel/sw_64/dot.S.bak
@@ -0,0 +1,602 @@
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCHSIZE 88
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+#define Y $19
|
|
+#define INCY $20
|
|
+
|
|
+#define I $5
|
|
+
|
|
+#define s0 $f0
|
|
+#define s1 $f30
|
|
+#define s2 $f1
|
|
+#define s3 $f2
|
|
+
|
|
+#define a0 $f10
|
|
+#define a1 $f11
|
|
+#define a2 $f12
|
|
+#define a3 $f13
|
|
+#define a4 $f14
|
|
+#define a5 $f15
|
|
+#define a6 $f16
|
|
+#define a7 $f17
|
|
+
|
|
+#define b0 $f18
|
|
+#define b1 $f19
|
|
+#define b2 $f20
|
|
+#define b3 $f21
|
|
+#define b4 $f22
|
|
+#define b5 $f23
|
|
+#define b6 $f24
|
|
+#define b7 $f25
|
|
+
|
|
+#define t0 $f26
|
|
+#define t1 $f27
|
|
+#define t2 $f28
|
|
+#define t3 $f29
|
|
+
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+ .frame $sp, 16, $26, 0
|
|
+
|
|
+ ldi $sp, -16($sp)
|
|
+ fclr s0
|
|
+ fstd $f2, 0($sp)
|
|
+ fclr s1
|
|
+
|
|
+ fclr s2
|
|
+ nop
|
|
+ fclr s3
|
|
+ ble N, $L999
|
|
+
|
|
+ fclr t0
|
|
+ cmpeq INCX, 1, $21
|
|
+ fclr t1
|
|
+ cmpeq INCY, 1, $22
|
|
+ fclr t2
|
|
+ and $21, $22, $22
|
|
+ fclr t3
|
|
+ beq $22, $L20
|
|
+
|
|
+#ifndef DOUBLE
|
|
+ srl N, 4, I
|
|
+ ble I, $L15
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ LD a1, 1 * SIZE(X)
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ LD b1, 1 * SIZE(Y)
|
|
+
|
|
+ LD a2, 2 * SIZE(X)
|
|
+ LD a3, 3 * SIZE(X)
|
|
+ LD b2, 2 * SIZE(Y)
|
|
+ LD b3, 3 * SIZE(Y)
|
|
+
|
|
+ LD a4, 4 * SIZE(X)
|
|
+ LD a5, 5 * SIZE(X)
|
|
+ LD b4, 4 * SIZE(Y)
|
|
+ LD b5, 5 * SIZE(Y)
|
|
+
|
|
+ LD a6, 6 * SIZE(X)
|
|
+ LD a7, 7 * SIZE(X)
|
|
+ addl X, 16 * SIZE, X
|
|
+ subl I, 1, I
|
|
+
|
|
+ addl Y, 16 * SIZE, Y
|
|
+ ble I, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ fillcs PREFETCHSIZE * 2 * SIZE(X)
|
|
+ subl I, 1, I
|
|
+ fillcs PREFETCHSIZE * 2 * SIZE(Y)
|
|
+ addl X, 16 * SIZE, X
|
|
+
|
|
+ ADD s0, t0, $f3
|
|
+ fmov $f3, s0
|
|
+ LD b6, -10 * SIZE(Y)
|
|
+ MUL a0, b0, t0
|
|
+ LD b7, -9 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, $f3
|
|
+ fmov $f3, s1
|
|
+ LD a0, -24 * SIZE(X)
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, -23 * SIZE(X)
|
|
+
|
|
+ ADD s2, t2, $f3
|
|
+ fmov $f3, s2
|
|
+ LD b0, -8 * SIZE(Y)
|
|
+ MUL a2, b2, $f3
|
|
+ fmov $f3, t2
|
|
+ LD b1, -7 * SIZE(Y)
|
|
+
|
|
+ ADD s3, t3, $f3
|
|
+ fmov $f3, s3
|
|
+ LD a2, -22 * SIZE(X)
|
|
+ MUL a3, b3, t3
|
|
+ LD a3, -21 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, $f3
|
|
+ fmov $f3, s0
|
|
+ LD b2, -6 * SIZE(Y)
|
|
+ MUL a4, b4, t0
|
|
+ LD b3, -5 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, $f3
|
|
+ fmov $f3, s1
|
|
+ LD a4, -20 * SIZE(X)
|
|
+ MUL a5, b5, t1
|
|
+ LD a5, -19 * SIZE(X)
|
|
+
|
|
+ ADD s2, t2, $f3
|
|
+ fmov $f3, s2
|
|
+ LD b4, -4 * SIZE(Y)
|
|
+ MUL a6, b6, t2
|
|
+ LD b5, -3 * SIZE(Y)
|
|
+
|
|
+ ADD s3, t3, $f3
|
|
+ fmov $f3, s3
|
|
+ LD a6, -18 * SIZE(X)
|
|
+ MUL a7, b7, t3
|
|
+ LD a7, -17 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, $f3
|
|
+ fmov $f3, s0
|
|
+ LD b6, -2 * SIZE(Y)
|
|
+ MUL a0, b0, t0
|
|
+ LD b7, -1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, $f3
|
|
+ fmov $f3, s1
|
|
+ LD a0, -16 * SIZE(X)
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, -15 * SIZE(X)
|
|
+
|
|
+ ADD s2, t2, $f3
|
|
+ fmov $f3, s2
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ MUL a2, b2, t2
|
|
+ LD b1, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s3, t3, $f3
|
|
+ fmov $f3, s3
|
|
+ LD a2, -14 * SIZE(X)
|
|
+ MUL a3, b3, t3
|
|
+ LD a3, -13 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, $f3
|
|
+ fmov $f3, s0
|
|
+ LD b2, 2 * SIZE(Y)
|
|
+ MUL a4, b4, t0
|
|
+ LD b3, 3 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, $f3
|
|
+ fmov $f3, s1
|
|
+ LD a4, -12 * SIZE(X)
|
|
+ MUL a5, b5, t1
|
|
+ LD a5, -11 * SIZE(X)
|
|
+
|
|
+ ADD s2, t2, $f3
|
|
+ fmov $f3, s2
|
|
+ LD b4, 4 * SIZE(Y)
|
|
+ MUL a6, b6, t2
|
|
+ LD b5, 5 * SIZE(Y)
|
|
+
|
|
+ ADD s3, t3, $f3
|
|
+ fmov $f3, s3
|
|
+ LD a6, -10 * SIZE(X)
|
|
+ MUL a7, b7, t3
|
|
+ LD a7, -9 * SIZE(X)
|
|
+
|
|
+ addl Y, 16 * SIZE, Y
|
|
+ bgt I, $L12
|
|
+ nop
|
|
+ fnop
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ ADD s0, t0, $f3
|
|
+ fmov $f3, s0
|
|
+ LD b6,-10 * SIZE(Y)
|
|
+ MUL a0, b0, t0
|
|
+ LD b7, -9 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, $f3
|
|
+ fmov $f3, s1
|
|
+ LD a0, -8 * SIZE(X)
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, -7 * SIZE(X)
|
|
+
|
|
+ ADD s2, t2, $f3
|
|
+ fmov $f3, s2
|
|
+ LD b0, -8 * SIZE(Y)
|
|
+ MUL a2, b2, t2
|
|
+ LD b1, -7 * SIZE(Y)
|
|
+
|
|
+ ADD s3, t3, $f3
|
|
+ fmov $f3, s3
|
|
+ LD a2, -6 * SIZE(X)
|
|
+ MUL a3, b3, t3
|
|
+ LD a3, -5 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, $f3
|
|
+ fmov $f3, s0
|
|
+ LD b2, -6 * SIZE(Y)
|
|
+ MUL a4, b4, t0
|
|
+ LD b3, -5 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, $f3
|
|
+ fmov $f3, s1
|
|
+ LD a4, -4 * SIZE(X)
|
|
+ MUL a5, b5, t1
|
|
+ LD a5, -3 * SIZE(X)
|
|
+
|
|
+ ADD s2, t2, $f3
|
|
+ fmov $f3, s2
|
|
+ LD b4, -4 * SIZE(Y)
|
|
+ MUL a6, b6, t2
|
|
+ LD b5, -3 * SIZE(Y)
|
|
+
|
|
+ ADD s3, t3, $f3
|
|
+ fmov $f3, s3
|
|
+ LD a6, -2 * SIZE(X)
|
|
+ MUL a7, b7, t3
|
|
+ LD a7, -1 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, $f3
|
|
+ fmov $f3, s0
|
|
+ LD b6, -2 * SIZE(Y)
|
|
+ MUL a0, b0, t0
|
|
+ LD b7, -1 * SIZE(Y)
|
|
+ ADD s1, t1, $f3
|
|
+ fmov $f3, s1
|
|
+ MUL a1, b1, t1
|
|
+
|
|
+ ADD s2, t2, $f3
|
|
+ fmov $f3, s2
|
|
+ MUL a2, b2, t2
|
|
+ ADD s3, t3, $f3
|
|
+ fmov $f3, s3
|
|
+ MUL a3, b3, t3
|
|
+
|
|
+ ADD s0, t0, $f3
|
|
+ fmov $f3, s0
|
|
+ MUL a4, b4, t0
|
|
+ ADD s1, t1, $f3
|
|
+ fmov $f3, s1
|
|
+ MUL a5, b5, t1
|
|
+ ADD s2, t2, $f3
|
|
+ fmov $f3, s2
|
|
+ MUL a6, b6, t2
|
|
+ ADD s3, t3, $f3
|
|
+ fmov $f3, s3
|
|
+ MUL a7, b7, t3
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ ADD s0, t0, $f3
|
|
+ fmov $f3, s0
|
|
+ and N, 15, I
|
|
+ ADD s1, t1, $f3
|
|
+ fmov $f3, s1
|
|
+ ble I, $L18
|
|
+ .align 4
|
|
+
|
|
+#else
|
|
+
|
|
+ srl N, 3, I
|
|
+ ble I, $L15
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ LD a1, 1 * SIZE(X)
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ LD b1, 1 * SIZE(Y)
|
|
+
|
|
+ LD a2, 2 * SIZE(X)
|
|
+ LD a3, 3 * SIZE(X)
|
|
+ LD b2, 2 * SIZE(Y)
|
|
+ LD b3, 3 * SIZE(Y)
|
|
+
|
|
+ LD a4, 4 * SIZE(X)
|
|
+ LD a5, 5 * SIZE(X)
|
|
+ LD b4, 4 * SIZE(Y)
|
|
+ LD b5, 5 * SIZE(Y)
|
|
+
|
|
+ LD a6, 6 * SIZE(X)
|
|
+ LD a7, 7 * SIZE(X)
|
|
+ addl X, 8 * SIZE, X
|
|
+ subl I, 1, I
|
|
+
|
|
+ addl Y, 8 * SIZE, Y
|
|
+ ble I, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ subl I, 1, I
|
|
+ fillcs PREFETCHSIZE * SIZE(Y)
|
|
+ addl X, 8 * SIZE, X
|
|
+
|
|
+ ADD s0, t0, $f3
|
|
+ fmov $f3, s0
|
|
+ LD b6, -2 * SIZE(Y)
|
|
+ MUL a0, b0, t0
|
|
+ LD b7, -1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, $f3
|
|
+ fmov $f3, s1
|
|
+ LD a0, -8 * SIZE(X)
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, -7 * SIZE(X)
|
|
+
|
|
+ ADD s2, t2, $f3
|
|
+ fmov $f3, s2
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ MUL a2, b2, t2
|
|
+ LD b1, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s3, t3, $f3
|
|
+ fmov $f3, s3
|
|
+ LD a2, -6 * SIZE(X)
|
|
+ MUL a3, b3, t3
|
|
+ LD a3, -5 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, $f3
|
|
+ fmov $f3, s0
|
|
+ LD b2, 2 * SIZE(Y)
|
|
+ MUL a4, b4, t0
|
|
+ LD b3, 3 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, $f3
|
|
+ fmov $f3, s1
|
|
+ LD a4, -4 * SIZE(X)
|
|
+ MUL a5, b5, t1
|
|
+ LD a5, -3 * SIZE(X)
|
|
+
|
|
+ ADD s2, t2, $f3
|
|
+ fmov $f3, s2
|
|
+ LD b4, 4 * SIZE(Y)
|
|
+ MUL a6, b6, t2
|
|
+ LD b5, 5 * SIZE(Y)
|
|
+
|
|
+ ADD s3, t3, $f3
|
|
+ fmov $f3, s3
|
|
+ LD a6, -2 * SIZE(X)
|
|
+ MUL a7, b7, t3
|
|
+ LD a7, -1 * SIZE(X)
|
|
+
|
|
+ addl Y, 8 * SIZE, Y
|
|
+ bgt I, $L12
|
|
+ nop
|
|
+ fnop
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ ADD s0, t0, $f3
|
|
+ fmov $f3, s0
|
|
+ LD b6, -2 * SIZE(Y)
|
|
+ MUL a0, b0, t0
|
|
+ LD b7, -1 * SIZE(Y)
|
|
+ ADD s1, t1, $f3
|
|
+ fmov $f3, s1
|
|
+ MUL a1, b1, t1
|
|
+
|
|
+ ADD s2, t2, $f3
|
|
+ fmov $f3, s2
|
|
+ MUL a2, b2, t2
|
|
+ ADD s3, t3, $f3
|
|
+ fmov $f3, s3
|
|
+ MUL a3, b3, t3
|
|
+
|
|
+ ADD s0, t0, $f3
|
|
+ fmov $f3, s0
|
|
+ MUL a4, b4, t0
|
|
+ ADD s1, t1, $f3
|
|
+ fmov $f3, s1
|
|
+ MUL a5, b5, t1
|
|
+ ADD s2, t2, $f3
|
|
+ fmov $f3, s2
|
|
+ MUL a6, b6, t2
|
|
+ ADD s3, t3, $f3
|
|
+ fmov $f3, s3
|
|
+ MUL a7, b7, t3
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ ADD s0, t0, $f3
|
|
+ fmov $f3, s0
|
|
+ and N, 7, I
|
|
+ ADD s1, t1, $f3
|
|
+ fmov $f3, s1
|
|
+ ble I, $L18
|
|
+ .align 4
|
|
+
|
|
+#endif
|
|
+
|
|
+$L16:
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ addl X, SIZE, X
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ addl Y, SIZE, Y
|
|
+
|
|
+ ADD s2, t2, $f3
|
|
+ fmov $f3, s2
|
|
+ MUL a0, b0, t2
|
|
+ subl I, 1, I
|
|
+ bgt I, $L16
|
|
+ .align 4
|
|
+
|
|
+$L18:
|
|
+ ADD s2, t2, $f3
|
|
+ fmov $f3, s2
|
|
+ ADD s3, t3, $f3
|
|
+ fmov $f3, s3
|
|
+ br $L999
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ srl N, 2, I
|
|
+ ble I, $L25
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+ LD a1, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD b1, 0 * SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD b2, 0 * SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+ LD a3, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD b3, 0 * SIZE(Y)
|
|
+ subl I, 1, I
|
|
+
|
|
+ SXADDQ INCY, Y, Y
|
|
+ ble I, $L23
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ ADD s0, t0, $f3
|
|
+ fmov $f3, s0
|
|
+ MUL a0, b0, t0
|
|
+ ADD s1, t1, $f3
|
|
+ fmov $f3, s1
|
|
+ MUL a1, b1, t1
|
|
+ ADD s2, t2, $f3
|
|
+ fmov $f3, s2
|
|
+ MUL a2, b2, t2
|
|
+ ADD s3, t3, $f3
|
|
+ fmov $f3, s3
|
|
+ MUL a3, b3, t3
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+ LD a1, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD b1, 0 * SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD b2, 0 * SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+ LD a3, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD b3, 0 * SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ subl I, 1, I
|
|
+ bgt I, $L22
|
|
+ nop
|
|
+ fnop
|
|
+ .align 4
|
|
+
|
|
+$L23:
|
|
+ ADD s0, t0, $f3
|
|
+ fmov $f3, s0
|
|
+ MUL a0, b0, t0
|
|
+ ADD s1, t1, $f3
|
|
+ fmov $f3, s1
|
|
+ MUL a1, b1, t1
|
|
+ ADD s2, t2, $f3
|
|
+ fmov $f3, s2
|
|
+ MUL a2, b2, t2
|
|
+ ADD s3, t3, $f3
|
|
+ fmov $f3, s3
|
|
+ MUL a3, b3, t3
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ ADD s0, t0, $f3
|
|
+ fmov $f3, s0
|
|
+ and N, 3, I
|
|
+ ADD s1, t1, $f3
|
|
+ fmov $f3, s1
|
|
+ ble I, $L28
|
|
+ .align 4
|
|
+
|
|
+$L26:
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, $f3
|
|
+ fmov $f3, s2
|
|
+ MUL a0, b0, t2
|
|
+ subl I, 1, I
|
|
+ bgt I, $L26
|
|
+ .align 4
|
|
+
|
|
+$L28:
|
|
+ ADD s2, t2, $f3
|
|
+ fmov $f3, s2
|
|
+ ADD s3, t3, $f3
|
|
+ fmov $f3, s3
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ ADD s2, s3, $f3
|
|
+ fmov $f3, s2
|
|
+ fldd $f2, 0($sp)
|
|
+ ADD s0, s1, $f3
|
|
+ fmov $f3, s0
|
|
+ ldi $sp, 16($sp)
|
|
+
|
|
+ ADD s0, s2, $f3
|
|
+ fmov $f3, s0
|
|
+ ret
|
|
+
|
|
+ EPILOGUE
|
|
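Note on the kernel above: this is the scalar dot-product kernel for sw_64. It interleaves four partial sums (s0..s3) with four pending products (t0..t3) so that independent multiply and add chains overlap, unrolls the unit-stride loop by 16 elements (8 for double) with fillcs prefetches, handles non-unit strides in the SXADDQ-stepped loop at $L20, and folds the partial sums together at $L999. Below is a minimal C sketch of the semantics only; the function name and argument list are illustrative, not the kernel's actual entry point.

    typedef double FLOAT;   /* the kernel is assembled for both float and double via SIZE/LD */

    FLOAT dot_ref(long n, const FLOAT *x, long incx, const FLOAT *y, long incy)
    {
        FLOAT s0 = 0, s1 = 0, s2 = 0, s3 = 0;    /* mirrors s0..s3 in the asm  */
        long i;
        if (incx == 1 && incy == 1) {
            for (i = 0; i + 4 <= n; i += 4) {    /* the asm unrolls by 8 or 16 */
                s0 += x[i]     * y[i];
                s1 += x[i + 1] * y[i + 1];
                s2 += x[i + 2] * y[i + 2];
                s3 += x[i + 3] * y[i + 3];
            }
            for (; i < n; i++)                   /* remainder loop ($L16)      */
                s0 += x[i] * y[i];
        } else {
            for (i = 0; i < n; i++)              /* strided path ($L20..$L28)  */
                s0 += x[i * incx] * y[i * incy];
        }
        return (s0 + s1) + (s2 + s3);            /* final fold ($L999)         */
    }

Splitting the sum across four accumulators keeps several independent multiply/add chains in flight, at the cost of slightly different rounding than a single-accumulator loop would give.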
diff --git a/kernel/sw_64/dot_simd.S b/kernel/sw_64/dot_simd.S
|
|
new file mode 100644
|
|
index 0000000..3e2288d
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/dot_simd.S
|
|
@@ -0,0 +1,634 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCHSIZE 80
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+#define Y $19
|
|
+#define INCY $20
|
|
+
|
|
+#define I $5
|
|
+
|
|
+#define s0 $f0
|
|
+#define s1 $f30
|
|
+#define s2 $f1
|
|
+#define s3 $f2
|
|
+
|
|
+#define a0 $f10
|
|
+#define a1 $f11
|
|
+#define a2 $f12
|
|
+#define a3 $f13
|
|
+#define a4 $f14
|
|
+#define a5 $f15
|
|
+#define a6 $f16
|
|
+#define a7 $f17
|
|
+
|
|
+#define b0 $f18
|
|
+#define b1 $f19
|
|
+#define b2 $f20
|
|
+#define b3 $f21
|
|
+#define b4 $f22
|
|
+#define b5 $f23
|
|
+#define b6 $f24
|
|
+#define b7 $f25
|
|
+
|
|
+#define t0 $f26
|
|
+#define t1 $f27
|
|
+#define t2 $f28
|
|
+#define t3 $f29
|
|
+
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+ .frame $sp, 16, $26, 0
|
|
+
|
|
+ ldi $sp, -16($sp)
|
|
+ fclr s0
|
|
+ fstd $f2, 0($sp)
|
|
+ fclr s1
|
|
+
|
|
+ fclr s2
|
|
+ nop
|
|
+ fclr s3
|
|
+ ble N, $L999
|
|
+
|
|
+ fclr t0
|
|
+ cmpeq INCX, 1, $21
|
|
+ fclr t1
|
|
+ cmpeq INCY, 1, $22
|
|
+ fclr t2
|
|
+ and $21, $22, $22
|
|
+ fclr t3
|
|
+ beq $22, $L20
|
|
+
|
|
+
|
|
+/*
|
|
+	check whether X and Y are both aligned to the SIMD vector width
|
|
+*/
|
|
+ and Y, (VEC_LEN*SIZE-1), $4
|
|
+ and X, (VEC_LEN*SIZE-1), $3
|
|
+ or $3, $4, $4
|
|
+ bne $4, $UnAlign_ACCESS
|
|
+
|
|
+/* Aligned-access path */
|
|
+ sra N, 4, I
|
|
+ ble I, $Remain
|
|
+
|
|
+ VLD a0, 0*VEC_LEN*SIZE(X)
|
|
+ vcpys $f31, $f31, s0 #clear s0 vector
|
|
+ VLD a1, 1*VEC_LEN*SIZE(X)
|
|
+ vcpys $f31, $f31, s1
|
|
+
|
|
+ VLD a2, 2*VEC_LEN*SIZE(X)
|
|
+ vcpys $f31, $f31, s2
|
|
+ VLD a3, 3*VEC_LEN*SIZE(X)
|
|
+ vcpys $f31, $f31, s3
|
|
+
|
|
+ VLD b0, 0*VEC_LEN*SIZE(Y)
|
|
+ VLD b1, 1*VEC_LEN*SIZE(Y)
|
|
+ VLD b2, 2*VEC_LEN*SIZE(Y)
|
|
+ VLD b3, 3*VEC_LEN*SIZE(Y)
|
|
+
|
|
+ addl X, 16 * SIZE, X
|
|
+ addl Y, 16 * SIZE, Y
|
|
+ subl I, 1, I
|
|
+ ble I, $MainLoopEnd
|
|
+$MainLoop:
|
|
+ VMAD a0, b0, s0, s0
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ VMAD a1, b1, s1, s1
|
|
+ fillcs PREFETCHSIZE * SIZE(Y)
|
|
+
|
|
+ subl I, 1, I
|
|
+ VMAD a2, b2, s2, s2
|
|
+ addl X, 16 * SIZE, X
|
|
+ VMAD a3, b3, s3, s3
|
|
+
|
|
+ VLD a0, -4*VEC_LEN*SIZE(X)
|
|
+ VLD a1, -3*VEC_LEN*SIZE(X)
|
|
+ VLD a2, -2*VEC_LEN*SIZE(X)
|
|
+ VLD a3, -1*VEC_LEN*SIZE(X)
|
|
+
|
|
+ VLD b0, 0*VEC_LEN*SIZE(Y)
|
|
+ VLD b1, 1*VEC_LEN*SIZE(Y)
|
|
+ VLD b2, 2*VEC_LEN*SIZE(Y)
|
|
+ VLD b3, 3*VEC_LEN*SIZE(Y)
|
|
+
|
|
+
|
|
+ addl Y, 16 * SIZE, Y
|
|
+ bgt I, $MainLoop
|
|
+ .align 4
|
|
+
|
|
+$MainLoopEnd:
|
|
+ VMAD a0, b0, s0, s0
|
|
+ VMAD a1, b1, s1, s1
|
|
+ VMAD a2, b2, s2, s2
|
|
+ VMAD a3, b3, s3, s3
|
|
+
|
|
+ VADD s0, s1, t0
|
|
+ VADD s2, s3, t1
|
|
+ nop
|
|
+ VADD t0, t1, s0
|
|
+
|
|
+ vextf s0, 1, s1
|
|
+ vextf s0, 2, s2
|
|
+ vextf s0, 3, s3
|
|
+ nop
|
|
+
|
|
+ ADD s0, s1, t2
|
|
+ ADD s2, s3, t3
|
|
+ nop
|
|
+ ADD t2, t3, s0
|
|
+
|
|
+ .align 4
|
|
+$Remain:
|
|
+ and N, 15, I
|
|
+ ble I, $End
|
|
+ .align 4
|
|
+$Remain_Loop:
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ addl X, SIZE, X
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ addl Y, SIZE, Y
|
|
+
|
|
+ MAD a0, b0, s0, s0
|
|
+ subl I, 1, I
|
|
+ bgt I, $Remain_Loop
|
|
+ .align 4
|
|
+$End:
|
|
+
|
|
+ fldd $f2, 0($sp)
|
|
+ ldi $sp, 16($sp)
|
|
+ ret
|
|
+ .align 4
|
|
+
|
|
+/* Unaligned-access path */
|
|
+$UnAlign_ACCESS:
|
|
+
|
|
+#ifndef DOUBLE
|
|
+ srl N, 4, I
|
|
+ ble I, $L15
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ LD a1, 1 * SIZE(X)
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ LD b1, 1 * SIZE(Y)
|
|
+
|
|
+ LD a2, 2 * SIZE(X)
|
|
+ LD a3, 3 * SIZE(X)
|
|
+ LD b2, 2 * SIZE(Y)
|
|
+ LD b3, 3 * SIZE(Y)
|
|
+
|
|
+ LD a4, 4 * SIZE(X)
|
|
+ LD a5, 5 * SIZE(X)
|
|
+ LD b4, 4 * SIZE(Y)
|
|
+ LD b5, 5 * SIZE(Y)
|
|
+
|
|
+ LD a6, 6 * SIZE(X)
|
|
+ LD a7, 7 * SIZE(X)
|
|
+ addl X, 16 * SIZE, X
|
|
+ subl I, 1, I
|
|
+
|
|
+ addl Y, 16 * SIZE, Y
|
|
+ ble I, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ fillcs PREFETCHSIZE * 2 * SIZE(X)
|
|
+ subl I, 1, I
|
|
+ fillcs PREFETCHSIZE * 2 * SIZE(Y)
|
|
+ addl X, 16 * SIZE, X
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD b6, -10 * SIZE(Y)
|
|
+ MUL a0, b0, t0
|
|
+ LD b7, -9 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ LD a0, -24 * SIZE(X)
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, -23 * SIZE(X)
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ LD b0, -8 * SIZE(Y)
|
|
+ MUL a2, b2, t2
|
|
+ LD b1, -7 * SIZE(Y)
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a2, -22 * SIZE(X)
|
|
+ MUL a3, b3, t3
|
|
+ LD a3, -21 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD b2, -6 * SIZE(Y)
|
|
+ MUL a4, b4, t0
|
|
+ LD b3, -5 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ LD a4, -20 * SIZE(X)
|
|
+ MUL a5, b5, t1
|
|
+ LD a5, -19 * SIZE(X)
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ LD b4, -4 * SIZE(Y)
|
|
+ MUL a6, b6, t2
|
|
+ LD b5, -3 * SIZE(Y)
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a6, -18 * SIZE(X)
|
|
+ MUL a7, b7, t3
|
|
+ LD a7, -17 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD b6, -2 * SIZE(Y)
|
|
+ MUL a0, b0, t0
|
|
+ LD b7, -1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ LD a0, -16 * SIZE(X)
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, -15 * SIZE(X)
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ MUL a2, b2, t2
|
|
+ LD b1, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a2, -14 * SIZE(X)
|
|
+ MUL a3, b3, t3
|
|
+ LD a3, -13 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD b2, 2 * SIZE(Y)
|
|
+ MUL a4, b4, t0
|
|
+ LD b3, 3 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ LD a4, -12 * SIZE(X)
|
|
+ MUL a5, b5, t1
|
|
+ LD a5, -11 * SIZE(X)
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ LD b4, 4 * SIZE(Y)
|
|
+ MUL a6, b6, t2
|
|
+ LD b5, 5 * SIZE(Y)
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a6, -10 * SIZE(X)
|
|
+ MUL a7, b7, t3
|
|
+ LD a7, -9 * SIZE(X)
|
|
+
|
|
+ addl Y, 16 * SIZE, Y
|
|
+ bgt I, $L12
|
|
+ nop
|
|
+ fnop
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ ADD s0, t0, s0
|
|
+ LD b6,-10 * SIZE(Y)
|
|
+ MUL a0, b0, t0
|
|
+ LD b7, -9 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ LD a0, -8 * SIZE(X)
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, -7 * SIZE(X)
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ LD b0, -8 * SIZE(Y)
|
|
+ MUL a2, b2, t2
|
|
+ LD b1, -7 * SIZE(Y)
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a2, -6 * SIZE(X)
|
|
+ MUL a3, b3, t3
|
|
+ LD a3, -5 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD b2, -6 * SIZE(Y)
|
|
+ MUL a4, b4, t0
|
|
+ LD b3, -5 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ LD a4, -4 * SIZE(X)
|
|
+ MUL a5, b5, t1
|
|
+ LD a5, -3 * SIZE(X)
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ LD b4, -4 * SIZE(Y)
|
|
+ MUL a6, b6, t2
|
|
+ LD b5, -3 * SIZE(Y)
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a6, -2 * SIZE(X)
|
|
+ MUL a7, b7, t3
|
|
+ LD a7, -1 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD b6, -2 * SIZE(Y)
|
|
+ MUL a0, b0, t0
|
|
+ LD b7, -1 * SIZE(Y)
|
|
+ ADD s1, t1, s1
|
|
+ MUL a1, b1, t1
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ MUL a2, b2, t2
|
|
+ ADD s3, t3, s3
|
|
+ MUL a3, b3, t3
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ MUL a4, b4, t0
|
|
+ ADD s1, t1, s1
|
|
+ MUL a5, b5, t1
|
|
+ ADD s2, t2, s2
|
|
+ MUL a6, b6, t2
|
|
+ ADD s3, t3, s3
|
|
+ MUL a7, b7, t3
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ ADD s0, t0, s0
|
|
+ and N, 15, I
|
|
+ ADD s1, t1, s1
|
|
+ ble I, $L18
|
|
+ .align 4
|
|
+
|
|
+#else
|
|
+
|
|
+ srl N, 3, I
|
|
+ ble I, $L15
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ LD a1, 1 * SIZE(X)
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ LD b1, 1 * SIZE(Y)
|
|
+
|
|
+ LD a2, 2 * SIZE(X)
|
|
+ LD a3, 3 * SIZE(X)
|
|
+ LD b2, 2 * SIZE(Y)
|
|
+ LD b3, 3 * SIZE(Y)
|
|
+
|
|
+ LD a4, 4 * SIZE(X)
|
|
+ LD a5, 5 * SIZE(X)
|
|
+ LD b4, 4 * SIZE(Y)
|
|
+ LD b5, 5 * SIZE(Y)
|
|
+
|
|
+ LD a6, 6 * SIZE(X)
|
|
+ LD a7, 7 * SIZE(X)
|
|
+ addl X, 8 * SIZE, X
|
|
+ subl I, 1, I
|
|
+
|
|
+ addl Y, 8 * SIZE, Y
|
|
+ ble I, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ subl I, 1, I
|
|
+ fillcs PREFETCHSIZE * SIZE(Y)
|
|
+ addl X, 8 * SIZE, X
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD b6, -2 * SIZE(Y)
|
|
+ MUL a0, b0, t0
|
|
+ LD b7, -1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ LD a0, -8 * SIZE(X)
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, -7 * SIZE(X)
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ MUL a2, b2, t2
|
|
+ LD b1, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a2, -6 * SIZE(X)
|
|
+ MUL a3, b3, t3
|
|
+ LD a3, -5 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD b2, 2 * SIZE(Y)
|
|
+ MUL a4, b4, t0
|
|
+ LD b3, 3 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ LD a4, -4 * SIZE(X)
|
|
+ MUL a5, b5, t1
|
|
+ LD a5, -3 * SIZE(X)
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ LD b4, 4 * SIZE(Y)
|
|
+ MUL a6, b6, t2
|
|
+ LD b5, 5 * SIZE(Y)
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a6, -2 * SIZE(X)
|
|
+ MUL a7, b7, t3
|
|
+ LD a7, -1 * SIZE(X)
|
|
+
|
|
+ addl Y, 8 * SIZE, Y
|
|
+ bgt I, $L12
|
|
+ nop
|
|
+ fnop
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ ADD s0, t0, s0
|
|
+ LD b6, -2 * SIZE(Y)
|
|
+ MUL a0, b0, t0
|
|
+ LD b7, -1 * SIZE(Y)
|
|
+ ADD s1, t1, s1
|
|
+ MUL a1, b1, t1
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ MUL a2, b2, t2
|
|
+ ADD s3, t3, s3
|
|
+ MUL a3, b3, t3
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ MUL a4, b4, t0
|
|
+ ADD s1, t1, s1
|
|
+ MUL a5, b5, t1
|
|
+ ADD s2, t2, s2
|
|
+ MUL a6, b6, t2
|
|
+ ADD s3, t3, s3
|
|
+ MUL a7, b7, t3
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ ADD s0, t0, s0
|
|
+ and N, 7, I
|
|
+ ADD s1, t1, s1
|
|
+ ble I, $L18
|
|
+ .align 4
|
|
+
|
|
+#endif
|
|
+
|
|
+$L16:
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ addl X, SIZE, X
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ addl Y, SIZE, Y
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ MUL a0, b0, t2
|
|
+ subl I, 1, I
|
|
+ bgt I, $L16
|
|
+ .align 4
|
|
+
|
|
+$L18:
|
|
+ ADD s2, t2, s2
|
|
+ ADD s3, t3, s3
|
|
+ br $L999
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ srl N, 2, I
|
|
+ ble I, $L25
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+ LD a1, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD b1, 0 * SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD b2, 0 * SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+ LD a3, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD b3, 0 * SIZE(Y)
|
|
+ subl I, 1, I
|
|
+
|
|
+ SXADDQ INCY, Y, Y
|
|
+ ble I, $L23
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ ADD s0, t0, s0
|
|
+ MUL a0, b0, t0
|
|
+ ADD s1, t1, s1
|
|
+ MUL a1, b1, t1
|
|
+ ADD s2, t2, s2
|
|
+ MUL a2, b2, t2
|
|
+ ADD s3, t3, s3
|
|
+ MUL a3, b3, t3
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+ LD a1, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD b1, 0 * SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD b2, 0 * SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+ LD a3, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD b3, 0 * SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ subl I, 1, I
|
|
+ bgt I, $L22
|
|
+ nop
|
|
+ fnop
|
|
+ .align 4
|
|
+
|
|
+$L23:
|
|
+ ADD s0, t0, s0
|
|
+ MUL a0, b0, t0
|
|
+ ADD s1, t1, s1
|
|
+ MUL a1, b1, t1
|
|
+ ADD s2, t2, s2
|
|
+ MUL a2, b2, t2
|
|
+ ADD s3, t3, s3
|
|
+ MUL a3, b3, t3
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ ADD s0, t0, s0
|
|
+ and N, 3, I
|
|
+ ADD s1, t1, s1
|
|
+ ble I, $L28
|
|
+ .align 4
|
|
+
|
|
+$L26:
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ MUL a0, b0, t2
|
|
+ subl I, 1, I
|
|
+ bgt I, $L26
|
|
+ .align 4
|
|
+
|
|
+$L28:
|
|
+ ADD s2, t2, s2
|
|
+ ADD s3, t3, s3
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ ADD s2, s3, s2
|
|
+ fldd $f2, 0($sp)
|
|
+ ADD s0, s1, s0
|
|
+ ldi $sp, 16($sp)
|
|
+
|
|
+ ADD s0, s2, s0
|
|
+ ret
|
|
+
|
|
+ EPILOGUE
|
|
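Note on dot_simd.S above: it layers a vectorized fast path over the scalar kernel. When X and Y are both unit-stride and aligned to the vector width (the and against VEC_LEN*SIZE-1), $MainLoop accumulates with VMAD into four vector registers, which are then reduced with VADD and vextf; anything misaligned falls through to $UnAlign_ACCESS and runs the scalar code. A C sketch of the aligned path, assuming VEC_LEN is 4 lanes (the real value comes from common.h), with an illustrative function name:

    #define VEC_LEN 4                 /* assumed lane count; the real value is in common.h */
    typedef double FLOAT;

    FLOAT dot_simd_ref(long n, const FLOAT *x, const FLOAT *y)
    {
        FLOAT acc[4][VEC_LEN] = {{0}};           /* stands in for vector regs s0..s3  */
        long i = 0;

        for (; i + 4 * VEC_LEN <= n; i += 4 * VEC_LEN)       /* $MainLoop             */
            for (int v = 0; v < 4; v++)
                for (int l = 0; l < VEC_LEN; l++)             /* VMAD, lane by lane    */
                    acc[v][l] += x[i + v * VEC_LEN + l] * y[i + v * VEC_LEN + l];

        FLOAT s = 0;                             /* VADD + vextf horizontal reduction  */
        for (int v = 0; v < 4; v++)
            for (int l = 0; l < VEC_LEN; l++)
                s += acc[v][l];

        for (; i < n; i++)                       /* $Remain_Loop scalar tail (MAD)     */
            s += x[i] * y[i];
        return s;
    }

Keeping the cross-lane reduction out of the loop body is the point of accumulating into whole vectors first; only the tail of fewer than 16 elements is handled scalar, in $Remain_Loop.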
diff --git a/kernel/sw_64/gemm_beta.S b/kernel/sw_64/gemm_beta.S
|
|
new file mode 100644
|
|
index 0000000..d9ea890
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/gemm_beta.S
|
|
@@ -0,0 +1,179 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+ .set noat
|
|
+ .set noreorder
|
|
+.text
|
|
+ .align 5
|
|
+ .globl CNAME
|
|
+ .ent CNAME
|
|
+CNAME:
|
|
+ .frame $sp, 0, $26, 0
|
|
+
|
|
+#ifdef PROFILE
|
|
+ ldgp $gp, 0($27)
|
|
+ ldi $28, _mcount
|
|
+ jsr $28, ($28), _mcount
|
|
+#endif
|
|
+
|
|
+ ldl $18, 16($sp)
|
|
+ ble $16, $End
|
|
+ ldl $19, 24($sp)
|
|
+ ble $17, $End
|
|
+#ifndef PROFILE
|
|
+ .prologue 0
|
|
+#else
|
|
+ .prologue 1
|
|
+#endif
|
|
+
|
|
+ fbeq $f19, $BETA_EQ_ZERO # if (beta == ZERO)
|
|
+ .align 4
|
|
+
|
|
+$BETA_NE_ZERO:
|
|
+ sra $16, 3, $2 # i = (m >> 3)
|
|
+ mov $18, $1 # c_offset = c
|
|
+ ldi $17, -1($17) # j --
|
|
+ ble $2,$L52
|
|
+ .align 4
|
|
+
|
|
+$L51:
|
|
+ fillcs 64($1)
|
|
+ ldi $2, -1($2)
|
|
+
|
|
+ LD $f14, 0*SIZE($1)
|
|
+ LD $f15, 1*SIZE($1)
|
|
+ LD $f16, 2*SIZE($1)
|
|
+ LD $f17, 3*SIZE($1)
|
|
+ LD $f18, 4*SIZE($1)
|
|
+ LD $f11, 5*SIZE($1)
|
|
+ LD $f21, 6*SIZE($1)
|
|
+ LD $f22, 7*SIZE($1)
|
|
+
|
|
+ MUL $f19, $f14, $f23
|
|
+ MUL $f19, $f15, $f24
|
|
+ MUL $f19, $f16, $f25
|
|
+ MUL $f19, $f17, $f26
|
|
+ MUL $f19, $f18, $f27
|
|
+ MUL $f19, $f11, $f28
|
|
+ MUL $f19, $f21, $f29
|
|
+ MUL $f19, $f22, $f30
|
|
+
|
|
+ ST $f23, 0*SIZE($1)
|
|
+ ST $f24, 1*SIZE($1)
|
|
+ ST $f25, 2*SIZE($1)
|
|
+ ST $f26, 3*SIZE($1)
|
|
+ ST $f27, 4*SIZE($1)
|
|
+ ST $f28, 5*SIZE($1)
|
|
+ ST $f29, 6*SIZE($1)
|
|
+ ST $f30, 7*SIZE($1)
|
|
+
|
|
+ ldi $1,8*SIZE($1)
|
|
+ bgt $2,$L51
|
|
+ .align 4
|
|
+
|
|
+$L52:
|
|
+ and $16, 7, $2
|
|
+ ble $2,$L54
|
|
+ .align 4
|
|
+
|
|
+$L53:
|
|
+ LD $f12, 0($1)
|
|
+ ldi $2, -1($2)
|
|
+ MUL $f19, $f12, $f23
|
|
+ ST $f23, 0($1)
|
|
+ ldi $1, SIZE($1)
|
|
+ bgt $2,$L53
|
|
+ .align 4
|
|
+
|
|
+$L54:
|
|
+ SXADDQ $19, $18, $18 # c += ldc
|
|
+ bgt $17,$BETA_NE_ZERO
|
|
+ clr $0
|
|
+ ret
|
|
+ .align 4
|
|
+
|
|
+$BETA_EQ_ZERO:
|
|
+ sra $16, 3, $2 # i = (m >> 3)
|
|
+ ldi $4, 8*SIZE($18)
|
|
+ mov $18, $1 # c_offset = c
|
|
+ ldi $17, -1($17) # j --
|
|
+ ble $2,$L42
|
|
+ .align 4
|
|
+
|
|
+$L41:
|
|
+ ST $f31, 0*SIZE($1)
|
|
+ ST $f31, 1*SIZE($1)
|
|
+ ST $f31, 2*SIZE($1)
|
|
+ ST $f31, 3*SIZE($1)
|
|
+ ST $f31, 4*SIZE($1)
|
|
+ ST $f31, 5*SIZE($1)
|
|
+ ST $f31, 6*SIZE($1)
|
|
+ ST $f31, 7*SIZE($1)
|
|
+ ldi $2, -1($2)
|
|
+
|
|
+ ldi $4, 8*SIZE($4)
|
|
+ ldi $1, 8*SIZE($1)
|
|
+ bgt $2,$L41
|
|
+ .align 4
|
|
+
|
|
+$L42:
|
|
+ and $16, 7, $2
|
|
+ ble $2,$L44
|
|
+ .align 4
|
|
+
|
|
+$L43:
|
|
+ ldi $2, -1($2)
|
|
+ ST $f31, 0($1)
|
|
+ ldi $1, SIZE($1)
|
|
+ bgt $2, $L43
|
|
+ .align 4
|
|
+
|
|
+$L44:
|
|
+ SXADDQ $19, $18, $18 # c += ldc
|
|
+ bgt $17,$BETA_EQ_ZERO
|
|
+ clr $0
|
|
+ .align 4
|
|
+
|
|
+$End:
|
|
+ ret
|
|
+ .ident VERSION
|
|
+ .end CNAME
|
|
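Note on gemm_beta.S above: this is the C := beta*C pre-scaling step of GEMM. It walks the m x n block one column at a time (SXADDQ on the stack-passed ldc), unrolls the row loop by 8 behind a fillcs prefetch, and branches once on beta so that the beta == 0 path only stores $f31 (zero) and never reads C. Roughly equivalent C follows; the argument names are chosen for readability, and the real entry point takes c and ldc further down its argument list (they arrive on the stack in the assembly).

    typedef double FLOAT;

    /* Scale an m x n block of C (leading dimension ldc) by beta. */
    void gemm_beta_ref(long m, long n, FLOAT beta, FLOAT *c, long ldc)
    {
        if (m <= 0 || n <= 0) return;                /* ble $16/$17, $End             */
        if (beta == 0.0) {                           /* $BETA_EQ_ZERO: store zeros,   */
            for (long j = 0; j < n; j++, c += ldc)   /* never read C                  */
                for (long i = 0; i < m; i++)
                    c[i] = 0.0;
        } else {                                     /* $BETA_NE_ZERO: load/scale/store, */
            for (long j = 0; j < n; j++, c += ldc)   /* unrolled by 8 in the asm      */
                for (long i = 0; i < m; i++)
                    c[i] *= beta;
        }
    }

The store-only beta == 0 branch matters for correctness as well as speed: uninitialized or NaN values already sitting in C must not leak into the result when beta is zero.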
diff --git a/kernel/sw_64/gemm_kernel_4x4.S b/kernel/sw_64/gemm_kernel_4x4.S
|
|
new file mode 100644
|
|
index 0000000..dd17554
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/gemm_kernel_4x4.S
|
|
@@ -0,0 +1,3244 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#if !defined(EV4) && !defined(EV5) && !defined(SW6)
|
|
+#error "Architecture is not specified."
|
|
+#endif
|
|
+
|
|
+#ifdef SW6
|
|
+#define PREFETCHSIZE 56
|
|
+#define UNOP unop
|
|
+#endif
|
|
+
|
|
+#ifdef EV5
|
|
+#define PREFETCHSIZE 56
|
|
+#define UNOP
|
|
+#endif
|
|
+
|
|
+#ifdef EV4
|
|
+#define UNOP
|
|
+#endif
|
|
+
|
|
+#define STACKSIZE 96
|
|
+
|
|
+#define M $16
|
|
+#define N $17
|
|
+#define K $18
|
|
+#define A $20
|
|
+#define B $21
|
|
+#define C $22
|
|
+#define LDC $23
|
|
+
|
|
+#define C1 $19
|
|
+#define C2 $24
|
|
+#define C3 $25
|
|
+#define C4 $27
|
|
+
|
|
+#define AO $at
|
|
+#define BO $5
|
|
+#define I $6
|
|
+#define J $7
|
|
+#define L $8
|
|
+
|
|
+#define a1 $f16
|
|
+#define a2 $f17
|
|
+#define a3 $f18
|
|
+#define a4 $f19
|
|
+
|
|
+#define b1 $f20
|
|
+#define b2 $f21
|
|
+#define b3 $f22
|
|
+#define b4 $f23
|
|
+
|
|
+#define t1 $f24
|
|
+#define t2 $f25
|
|
+#define t3 $f26
|
|
+#define t4 $f27
|
|
+
|
|
+#define a5 $f28
|
|
+#define a6 $f30
|
|
+#define b5 $f29
|
|
+
|
|
+#define alpha $f30
|
|
+
|
|
+#define c01 $f0
|
|
+#define c02 $f1
|
|
+#define c03 $f2
|
|
+#define c04 $f3
|
|
+
|
|
+#define c05 $f4
|
|
+#define c06 $f5
|
|
+#define c07 $f6
|
|
+#define c08 $f7
|
|
+
|
|
+#define c09 $f8
|
|
+#define c10 $f9
|
|
+#define c11 $f10
|
|
+#define c12 $f11
|
|
+
|
|
+
|
|
+#define c13 $f12
|
|
+#define c14 $f13
|
|
+#define c15 $f14
|
|
+#define c16 $f15
|
|
+
|
|
+#define TMP1 $0
|
|
+#define TMP2 $1
|
|
+#define KK $2
|
|
+#define BB $3
|
|
+#define OFFSET $4
|
|
+
|
|
+#define tmp $9
|
|
+
|
|
+#define ALPHA 64($sp)
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+ .frame $sp, STACKSIZE, $26, 0
|
|
+
|
|
+ ldi $sp, -STACKSIZE($sp)
|
|
+
|
|
+ ldl C, 0 + STACKSIZE($sp)
|
|
+ ldl LDC, 8 + STACKSIZE($sp)
|
|
+#ifdef TRMMKERNEL
|
|
+ ldl OFFSET, 16 + STACKSIZE($sp)
|
|
+#endif
|
|
+
|
|
+ SXADDQ LDC, 0, LDC
|
|
+
|
|
+ fstd $f2, 0($sp)
|
|
+ fstd $f3, 8($sp)
|
|
+ fstd $f4, 16($sp)
|
|
+ fstd $f5, 24($sp)
|
|
+ fstd $f6, 32($sp)
|
|
+ fstd $f7, 40($sp)
|
|
+ fstd $f8, 48($sp)
|
|
+ fstd $f9, 56($sp)
|
|
+ stl $9, 80($sp)
|
|
+ fstd $f19, ALPHA
|
|
+
|
|
+ cmple M, 0, $0
|
|
+ cmple N, 0, $1
|
|
+ cmple K, 0, $2
|
|
+
|
|
+ or $0, $1, $0
|
|
+ or $0, $2, $0
|
|
+ bne $0, $L999
|
|
+
|
|
+#if defined(TRMMKERNEL) && !defined(LEFT)
|
|
+ subl $31, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+ sra N, 2, J
|
|
+ ble J, $L40
|
|
+ .align 4
|
|
+
|
|
+$L01:
|
|
+ mov C, C1
|
|
+ addl C, LDC, C2
|
|
+ mov A, AO
|
|
+ s4addl K, 0, BB
|
|
+
|
|
+#if defined(TRMMKERNEL) && defined(LEFT)
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+ addl C2, LDC, C3
|
|
+ s4addl LDC, C, C
|
|
+
|
|
+ SXADDQ BB, B, BB
|
|
+ fclr t1
|
|
+ addl C3, LDC, C4
|
|
+ fclr t2
|
|
+
|
|
+ sra M, 2, I
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+ ble I, $L20
|
|
+ .align 4
|
|
+
|
|
+$L11:
|
|
+#if defined(EV5) || defined(SW6A)
|
|
+ fillcs 0 * SIZE(BB)
|
|
+ fillcs 8 * SIZE(BB)
|
|
+ unop
|
|
+ ldi BB, 16 * SIZE(BB)
|
|
+#endif
|
|
+
|
|
+#if !defined(TRMMKERNEL) || \
|
|
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+
|
|
+#ifdef TRMMKERNEL
|
|
+#ifdef LEFT
|
|
+ addl KK, 4, TMP1
|
|
+#else
|
|
+ addl KK, 4, TMP1
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c11
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c12
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c16
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c15
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c02
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c06
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c05
|
|
+
|
|
+ fillcs 4 * SIZE(C1)
|
|
+ fclr c03
|
|
+#ifndef TRMMKERNEL
|
|
+ ldi L, -2(K)
|
|
+#else
|
|
+ ldi L, -2(TMP1)
|
|
+#endif
|
|
+ fclr c04
|
|
+
|
|
+ fillcs 7 * SIZE(C2)
|
|
+ fclr c08
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+ fclr c13
|
|
+
|
|
+ fillcs 4 * SIZE(C3)
|
|
+ fclr c09
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c10
|
|
+
|
|
+#else
|
|
+ sll KK, BASE_SHIFT + 2, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ addl B, TMP1, BO
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c11
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c12
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c16
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c15
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c02
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c06
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c05
|
|
+
|
|
+ fillcs 4 * SIZE(C1)
|
|
+ fclr c03
|
|
+ ldi L, -2(TMP1)
|
|
+ fclr c04
|
|
+
|
|
+ fillcs 7 * SIZE(C2)
|
|
+ fclr c08
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ fclr c13
|
|
+
|
|
+ fillcs 4 * SIZE(C3)
|
|
+ fclr c09
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c10
|
|
+#endif
|
|
+
|
|
+ fillcs 7 * SIZE(C4)
|
|
+ fclr c14
|
|
+ fclr c07
|
|
+ ble L, $L15
|
|
+ .align 5
|
|
+
|
|
+$L12:
|
|
+/* 1 */
|
|
+ ADD c11, t1, b5
|
|
+ fmov b5, c11
|
|
+#ifndef EV4
|
|
+ fillcs PREFETCHSIZE * SIZE(AO)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+ MUL b1, a1, t1
|
|
+#ifndef EV4
|
|
+ fillcs PREFETCHSIZE * SIZE(BO)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c12, t2, b5
|
|
+ fmov b5, c12
|
|
+ unop
|
|
+ MUL b1, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD c16, t3,b5
|
|
+ fmov b5, c16
|
|
+ unop
|
|
+ MUL b2, a2, t3
|
|
+ LD a5, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c15, t4,b5
|
|
+ fmov b5, c15
|
|
+ unop
|
|
+ MUL b2, a1, t4
|
|
+ LD b5, 0 * SIZE(BO)
|
|
+ FIMOVD b5, tmp
|
|
+/* 2 */
|
|
+ ADD c01, t1,b5
|
|
+ fmov b5, c01
|
|
+ UNOP
|
|
+ MUL b1, a3, t1
|
|
+ UNOP
|
|
+
|
|
+ ADD c02, t2,b5
|
|
+ fmov b5, c02
|
|
+ UNOP
|
|
+ MUL b1, a4, t2
|
|
+ UNOP
|
|
+
|
|
+ ADD c06, t3,b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL b2, a4, t3
|
|
+ unop
|
|
+
|
|
+ ADD c05, t4, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL b4, a1, t4
|
|
+ unop
|
|
+
|
|
+/* 3 */
|
|
+ ADD c03, t1,b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL b3, a1, t1
|
|
+ unop
|
|
+
|
|
+ ADD c04, t2,b5
|
|
+ fmov b5, c04
|
|
+ unop
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD c08, t3,b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c13, t4,b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL b2, a3, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+/* 4 */
|
|
+ ADD c09, t1,b5
|
|
+ fmov b5, c09
|
|
+ unop
|
|
+ MUL b3, a3, t1
|
|
+ LD a6, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c14, t3, b5
|
|
+ fmov b5, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ ADD c07, t4, b5
|
|
+ fmov b5, c07
|
|
+ unop
|
|
+ MUL b4, a3, t4
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+
|
|
+/* 5 */
|
|
+ ADD c11, t1, b5
|
|
+ fmov b5, c11
|
|
+ unop
|
|
+ IFMOVD tmp, b5
|
|
+ MUL b5, a5, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c12, t2, b5
|
|
+ fmov b5, c12
|
|
+ ldi L, -2(L)
|
|
+ IFMOVD tmp, b5
|
|
+ MUL b5, a2, t2
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c16, t3, b5
|
|
+ fmov b5, c16
|
|
+ unop
|
|
+ MUL b2, a2, t3
|
|
+ unop
|
|
+
|
|
+ ADD c15, t4, b5
|
|
+ fmov b5, c15
|
|
+ unop
|
|
+ MUL b2, a5, t4
|
|
+ unop
|
|
+
|
|
+/* 6 */
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ IFMOVD tmp, b5
|
|
+ MUL b5, a6, t1
|
|
+ unop
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ IFMOVD tmp, b5
|
|
+ MUL b5, a4, t2
|
|
+ unop
|
|
+
|
|
+ ADD c06, t3, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL b2, a4, t3
|
|
+ unop
|
|
+
|
|
+ ADD c05, t4, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL b4, a5, t4
|
|
+ unop
|
|
+
|
|
+/* 7 */
|
|
+ ADD c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+ MUL b3, a5, t1
|
|
+ unop
|
|
+
|
|
+ ADD c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD c08, t3, b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, -3 * SIZE(AO)
|
|
+
|
|
+ ADD c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL b2, a6, t4
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+/* 8 */
|
|
+ ADD c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ unop
|
|
+ MUL b3, a6, t1
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD c14, t3, b5
|
|
+ fmov b5, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c07, t4, b5
|
|
+ fmov b5, c07
|
|
+ MUL b4, a6, t4
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+ bgt L, $L12
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ ADD c11, t1, b5
|
|
+ fmov b5, c11
|
|
+ fldd alpha, ALPHA
|
|
+ MUL b1, a1, t1
|
|
+#ifndef TRMMKERNEL
|
|
+ blbs K, $L18
|
|
+#else
|
|
+ blbs TMP1, $L18
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c12, t2, b5
|
|
+ fmov b5, c12
|
|
+ MUL b1, a2, t2
|
|
+ ADD c16, t3, b5
|
|
+ fmov b5, c16
|
|
+ MUL b2, a2, t3
|
|
+
|
|
+ ADD c15, t4, b5
|
|
+ fmov b5, c15
|
|
+ MUL b2, a1, t4
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL b1, a3, t1
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL b1, a4, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c06, t3, b5
|
|
+ fmov b5, c06
|
|
+ MUL b2, a4, t3
|
|
+ ADD c05, t4, b5
|
|
+ fmov b5, c05
|
|
+ MUL b4, a1, t4
|
|
+
|
|
+ ADD c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL b3, a1, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ unop
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD c08, t3, b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL b2, a3, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ unop
|
|
+ MUL b3, a3, t1
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c14, t3, b5
|
|
+ fmov b5, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c07, t4, b5
|
|
+ fmov b5, c07
|
|
+ unop
|
|
+ MUL b4, a3, t4
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c11, t1, b5
|
|
+ fmov b5, c11
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL b1, a1, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L18:
|
|
+ ADD c12, t2, b5
|
|
+ fmov b5, c12
|
|
+ unop
|
|
+ MUL b1, a2, t2
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a5, 0 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c16, t3, b5
|
|
+ fmov b5, c16
|
|
+ unop
|
|
+ MUL b2, a2, t3
|
|
+ unop
|
|
+
|
|
+ ADD c15, t4, b5
|
|
+ fmov b5, c15
|
|
+ unop
|
|
+ MUL b2, a1, t4
|
|
+#ifndef TRMMKERNEL
|
|
+ LD b5, 1 * SIZE(C1)
|
|
+ FIMOVD b5, tmp
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL b1, a3, t1
|
|
+ unop
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL b1, a4, t2
|
|
+#ifndef TRMMKERNEL
|
|
+ LD b1, 0 * SIZE(C2)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c06, t3, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL b2, a4, t3
|
|
+ unop
|
|
+
|
|
+ ADD c05, t4, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL b4, a1, t4
|
|
+ unop
|
|
+
|
|
+ ADD c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL b3, a1, t1
|
|
+ unop
|
|
+
|
|
+ ADD c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ unop
|
|
+ MUL b3, a2, t2
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a1, 0 * SIZE(C3)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c08, t3, b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a2, 2 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL b2, a3, t4
|
|
+#ifndef TRMMKERNEL
|
|
+ LD b2, 3 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ ldi I, -1(I)
|
|
+ MUL b3, a3, t1
|
|
+ unop
|
|
+
|
|
+ ADD c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+#ifndef TRMMKERNEL
|
|
+ LD b3, 0 * SIZE(C4)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c14, t3, b5
|
|
+ fmov b5, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a4, 1 * SIZE(C2)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c07, t4, b5
|
|
+ fmov b5, c07
|
|
+ unop
|
|
+ MUL b4, a3, t4
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a3, 2 * SIZE(C2)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c11, t1, b5
|
|
+ fmov b5, c11
|
|
+ unop
|
|
+ MUL alpha, c01, b5
|
|
+ fmov b5, c01
|
|
+#ifndef TRMMKERNEL
|
|
+ LD b4, 3 * SIZE(C2)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c12, t2, b5
|
|
+ fmov b5, c12
|
|
+ unop
|
|
+ MUL alpha, c02, b5
|
|
+ fmov b5, c02
|
|
+#ifndef TRMMKERNEL
|
|
+ LD t1, 1 * SIZE(C3)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c16, t3, b5
|
|
+ fmov b5, c16
|
|
+ unop
|
|
+ MUL alpha, c03, b5
|
|
+ fmov b5, c03
|
|
+#ifndef TRMMKERNEL
|
|
+ LD t2, 2 * SIZE(C3)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c15, t4, b5
|
|
+ fmov b5, c15
|
|
+ unop
|
|
+ MUL alpha, c04, b5
|
|
+ fmov b5, c04
|
|
+#ifndef TRMMKERNEL
|
|
+ LD t3, 3 * SIZE(C3)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ MUL alpha, c05, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c01, a5, b5
|
|
+ fmov b5, c01
|
|
+ LD t4, 1 * SIZE(C4)
|
|
+#else
|
|
+ unop
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ MUL alpha, c06, b5
|
|
+ fmov b5, c06
|
|
+#ifndef TRMMKERNEL
|
|
+ unop
|
|
+ IFMOVD tmp, b5
|
|
+ fstd b1, 88($sp)
|
|
+# FIMOVD b1, tmp
|
|
+ ADD c02, b5, b1
|
|
+ fmov b1, c02
|
|
+ fldd b1, 88($sp)
|
|
+# IFMOVD tmp, b1
|
|
+ LD a5, 2 * SIZE(C4)
|
|
+#endif
|
|
+
|
|
+ MUL alpha, c07, b5
|
|
+ fmov b5, c07
|
|
+#ifndef TRMMKERNEL
|
|
+ unop
|
|
+ ADD c03, a2, b5
|
|
+ fmov b5, c03
|
|
+ LD b5, 3 * SIZE(C4)
|
|
+ FIMOVD b5, tmp
|
|
+#endif
|
|
+
|
|
+ MUL alpha, c08, b5
|
|
+ fmov b5, c08
|
|
+#ifndef TRMMKERNEL
|
|
+ unop
|
|
+ ADD c04, b2, b5
|
|
+ fmov b5, c04
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ MUL alpha, c09, b5
|
|
+ fmov b5, c09
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c05, b1, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ MUL alpha, c10, b5
|
|
+ fmov b5, c10
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c06, a4, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ MUL alpha, c11, b5
|
|
+ fmov b5, c11
|
|
+ ST c03, 2 * SIZE(C1)
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c07, a3, b5
|
|
+ fmov b5, c07
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ MUL alpha, c12, b5
|
|
+ fmov b5, c12
|
|
+ ST c04, 3 * SIZE(C1)
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c08, b4, b5
|
|
+ fmov b5, c08
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+
|
|
+ MUL alpha, c13, b5
|
|
+ fmov b5, c13
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c09, a1, b5
|
|
+ fmov b5, c09
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ MUL alpha, c14, b5
|
|
+ fmov b5, c14
|
|
+ ST c06, 1 * SIZE(C2)
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c10, t1, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ MUL alpha, c15, b5
|
|
+ fmov b5, c15
|
|
+ ST c07, 2 * SIZE(C2)
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c11, t2, b5
|
|
+ fmov b5, c11
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ MUL alpha, c16, b5
|
|
+ fmov b5, c16
|
|
+ ST c08, 3 * SIZE(C2)
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c12, t3, b5
|
|
+ fmov b5, c12
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+ ldi C2, 4 * SIZE(C2)
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c13, b3, b5
|
|
+ fmov b5, c13
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+ ST c09, 0 * SIZE(C3)
|
|
+ fclr t1
|
|
+ ldi C4, 4 * SIZE(C4)
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c14, t4, b5
|
|
+ fmov b5, c14
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+ ST c10, 1 * SIZE(C3)
|
|
+ fclr t2
|
|
+ unop
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c15, a5, b5
|
|
+ fmov b5, c15
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+ ST c11, 2 * SIZE(C3)
|
|
+ fclr t3
|
|
+ unop
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ IFMOVD tmp, b5
|
|
+# FIMOVD b1, tmp
|
|
+ fstd b1, 88($sp)
|
|
+ ADD c16, b5, b1
|
|
+ fmov b1, c16
|
|
+ fldd b1, 88($sp)
|
|
+# IFMOVD tmp, b1
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+ ST c12, 3 * SIZE(C3)
|
|
+ fclr t4
|
|
+ ldi C3, 4 * SIZE(C3)
|
|
+
|
|
+ ST c13, -4 * SIZE(C4)
|
|
+ ST c14, -3 * SIZE(C4)
|
|
+ ST c15, -2 * SIZE(C4)
|
|
+ ST c16, -1 * SIZE(C4)
|
|
+
|
|
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+ subl K, KK, TMP1
|
|
+#ifdef LEFT
|
|
+ subl TMP1, 4, TMP1
|
|
+#else
|
|
+ subl TMP1, 4, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ addl BO, TMP1, BO
|
|
+#endif
|
|
+
|
|
+#if defined(TRMMKERNEL) && defined(LEFT)
|
|
+ addl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+ bgt I, $L11
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ and M, 2, I
|
|
+ ble I, $L30
|
|
+
|
|
+#if !defined(TRMMKERNEL) || \
|
|
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+
|
|
+#ifdef TRMMKERNEL
|
|
+#ifdef LEFT
|
|
+ addl KK, 2, TMP1
|
|
+#else
|
|
+ addl KK, 4, TMP1
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c10
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c14
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+#ifndef TRMMKERNEL
|
|
+ ldi L, -2(K)
|
|
+#else
|
|
+ ldi L, -2(TMP1)
|
|
+#endif
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c05
|
|
+
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+ fclr c02
|
|
+ fclr c06
|
|
+ ble L, $L25
|
|
+
|
|
+#else
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 2, TMP2
|
|
+ addl B, TMP2, BO
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c10
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c14
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ ldi L, -2(TMP1)
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c05
|
|
+
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ fclr c02
|
|
+ fclr c06
|
|
+ ble L, $L25
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ ADD c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t3, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL a1, b2, t3
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+
|
|
+ ADD c14, t4, b5
|
|
+ fmov b5, c14
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD b2, -7 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ unop
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ LD b3, -6 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL a1, b4, t3
|
|
+ LD a1, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ MUL a2, b4, t4
|
|
+ LD b5, -5 * SIZE(BO)
|
|
+ FIMOVD b5, tmp
|
|
+
|
|
+ ADD c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ unop
|
|
+ MUL a3, b1, t1
|
|
+ LD a2, 3 * SIZE(AO)
|
|
+
|
|
+ ADD c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL a4, b1, t2
|
|
+ LD b1, -4 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t3, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c14, t4, b5
|
|
+ fmov b5, c14
|
|
+ MUL a4, b2, t4
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ldi L, -2(L)
|
|
+ MUL a3, b3, t1
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a4, b3, t2
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ IFMOVD tmp, b5
|
|
+ MUL a3, b5, t3
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ IFMOVD tmp, b5
|
|
+ MUL a4, b5, t4
|
|
+ LD a4, 1 * SIZE(AO)
|
|
+ bgt L, $L22
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ ADD c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ fldd alpha, ALPHA
|
|
+ MUL a1, b1, t1
|
|
+#ifndef TRMMKERNEL
|
|
+ blbs K, $L28
|
|
+#else
|
|
+ blbs TMP1, $L28
|
|
+#endif
|
|
+
|
|
+ ADD c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t3, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL a1, b2, t3
|
|
+ unop
|
|
+
|
|
+ ADD c14, t4, b5
|
|
+ fmov b5, c14
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL a1, b4, t3
|
|
+ LD a1, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL a2, b4, t4
|
|
+ LD a2, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L28:
|
|
+ ADD c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a3, 0 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c13, t3, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL a1, b2, t3
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a4, 1 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c14, t4, b5
|
|
+ fmov b5, c14
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a5, 0 * SIZE(C2)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+#ifndef TRMMKERNEL
|
|
+ LD b5, 1 * SIZE(C2)
|
|
+ FIMOVD b5, tmp
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+#ifndef TRMMKERNEL
|
|
+ LD b1, 0 * SIZE(C3)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL a1, b4, t3
|
|
+#ifndef TRMMKERNEL
|
|
+ LD b2, 1 * SIZE(C3)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL a2, b4, t4
|
|
+#ifndef TRMMKERNEL
|
|
+ LD b3, 0 * SIZE(C4)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ unop
|
|
+ MUL alpha, c01, b5
|
|
+ fmov b5, c01
|
|
+#ifndef TRMMKERNEL
|
|
+ LD b4, 1 * SIZE(C4)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL alpha, c02, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+
|
|
+ ADD c13, t3, b5
|
|
+ fmov b5, c13
|
|
+ MUL alpha, c05, b5
|
|
+ fmov b5, c05
|
|
+ ADD c14, t4, b5
|
|
+ fmov b5, c14
|
|
+ MUL alpha, c06, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL alpha, c09, b5
|
|
+ fmov b5, c09
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c01, a3, b5
|
|
+ fmov b5, c01
|
|
+#endif
|
|
+ MUL alpha, c10, b5
|
|
+ fmov b5, c10
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c02, a4, b5
|
|
+ fmov b5, c02
|
|
+#endif
|
|
+
|
|
+ MUL alpha, c13, b5
|
|
+ fmov b5, c13
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c05, a5, b5
|
|
+ fmov b5, c05
|
|
+#endif
|
|
+ MUL alpha, c14, b5
|
|
+ fmov b5, c14
|
|
+#ifndef TRMMKERNEL
|
|
+ IFMOVD tmp, b5
|
|
+ fstd b1, 88($sp)
|
|
+# FIMOVD b1, tmp
|
|
+ ADD c06, b5, b1
|
|
+ fmov b1, c06
|
|
+ fldd b1, 88($sp)
|
|
+# IFMOVD tmp, b1
|
|
+#endif
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c09, b1, b5
|
|
+ fmov b5, c09
|
|
+ unop
|
|
+#endif
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ fclr t1
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c10, b2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+#endif
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ fclr t2
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c13, b3, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+#endif
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+ fclr t3
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c14, b4, b5
|
|
+ fmov b5, c14
|
|
+ unop
|
|
+#endif
|
|
+ ST c06, 1 * SIZE(C2)
|
|
+ fclr t4
|
|
+
|
|
+ ST c09, 0 * SIZE(C3)
|
|
+ ldi C1, 2 * SIZE(C1)
|
|
+ ST c10, 1 * SIZE(C3)
|
|
+ ldi C2, 2 * SIZE(C2)
|
|
+
|
|
+ ST c13, 0 * SIZE(C4)
|
|
+ ldi C3, 2 * SIZE(C3)
|
|
+ ST c14, 1 * SIZE(C4)
|
|
+ ldi C4, 2 * SIZE(C4)
|
|
+
|
|
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+ subl K, KK, TMP1
|
|
+#ifdef LEFT
|
|
+ subl TMP1, 2, TMP1
|
|
+#else
|
|
+ subl TMP1, 4, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#if defined(TRMMKERNEL) && defined(LEFT)
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L30:
|
|
+ and M, 1, I
|
|
+ ble I, $L39
|
|
+
|
|
+#if !defined(TRMMKERNEL) || \
|
|
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+
|
|
+#ifdef TRMMKERNEL
|
|
+#ifdef LEFT
|
|
+ addl KK, 1, TMP1
|
|
+#else
|
|
+ addl KK, 4, TMP1
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+#ifndef TRMMKERNEL
|
|
+ ldi L, -2(K)
|
|
+#else
|
|
+ ldi L, -2(TMP1)
|
|
+#endif
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c09
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c13
|
|
+
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+ ble L, $L35
|
|
+#else
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 2, TMP2
|
|
+ addl B, TMP2, BO
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ ldi L, -2(TMP1)
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c09
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c13
|
|
+
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ ble L, $L35
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L32:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ldi L, -2(L)
|
|
+ MUL a1, b1, t1
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a1, b2, t2
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ LD b5, 3 * SIZE(BO)
|
|
+ FIMOVD b5, tmp
|
|
+ MUL a1, b3, t3
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ MUL a1, b4, t4
|
|
+ LD a1, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a2, b1, t1
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ MUL a2, b2, t2
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+ MUL a2, b3, t3
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ IFMOVD tmp, b5
|
|
+ MUL a2, b5, t4
|
|
+ LD a2, 0 * SIZE(AO)
|
|
+ bgt L, $L32
|
|
+ .align 4
|
|
+
|
|
+$L35:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ fldd alpha, ALPHA
|
|
+ MUL a1, b1, t1
|
|
+#ifndef TRMMKERNEL
|
|
+ blbs K, $L38
|
|
+#else
|
|
+ blbs TMP1, $L38
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ MUL a1, b2, t2
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, b3, t3
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ MUL a1, b4, t4
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L38:
|
|
+ ADD c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL a1, b2, t2
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a5, 0 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ unop
|
|
+ MUL a1, b3, t3
|
|
+#ifndef TRMMKERNEL
|
|
+ LD b5, 0 * SIZE(C2)
|
|
+ FIMOVD b5, tmp
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL a1, b4, t4
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a2, 0 * SIZE(C3)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL alpha, c01, b5
|
|
+ fmov b5, c01
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a3, 0 * SIZE(C4)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL alpha, c05, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+
|
|
+ ADD c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ MUL alpha, c09, b5
|
|
+ fmov b5, c09
|
|
+ ADD c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ MUL alpha, c13, b5
|
|
+ fmov b5, c13
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ IFMOVD tmp, b5
|
|
+ fstd b1, 88($sp)
|
|
+# FIMOVD b1, tmp
|
|
+ ADD c01, a5, b1
|
|
+ fmov b1, c01
|
|
+ ADD c05, b5, b1
|
|
+ fmov b1, c05
|
|
+ ADD c09, a2, b1
|
|
+ fmov b1, c09
|
|
+ ADD c13, a3, b1
|
|
+ fmov b1, c13
|
|
+ fldd b1, 88($sp)
|
|
+# IFMOVD tmp, b1
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+ ST c09, 0 * SIZE(C3)
|
|
+ ST c13, 0 * SIZE(C4)
|
|
+
|
|
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+ subl K, KK, TMP1
|
|
+#ifdef LEFT
|
|
+ subl TMP1, 1, TMP1
|
|
+#else
|
|
+ subl TMP1, 4, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#if defined(TRMMKERNEL) && defined(LEFT)
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L39:
|
|
+ mov BO, B
|
|
+ ldi J, -1(J)
|
|
+#if defined(TRMMKERNEL) && !defined(LEFT)
|
|
+ addl KK, 4, KK
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+ bgt J, $L01
|
|
+ .align 4
|
|
+
|
|
+$L40:
+ and N, 2, J
+ ble J, $L80
+
+ mov C, C1
+ addl C, LDC, C2
+ mov A, AO
+ fclr t1
+ addl C2, LDC, C
+ fclr t2
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mov OFFSET, KK
+#endif
+
+ sra M, 2, I
+ fclr t3
+ fclr t4
+ ble I, $L60
+ .align 4
|
|
+
|
|
+$L51:
|
|
+#if !defined(TRMMKERNEL) || \
|
|
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+
|
|
+#ifdef TRMMKERNEL
|
|
+#ifdef LEFT
|
|
+ addl KK, 4, TMP1
|
|
+#else
|
|
+ addl KK, 2, TMP1
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c03
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c07
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c04
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c08
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c05
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c06
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ldi L, -2(K)
|
|
+#else
|
|
+ ldi L, -2(TMP1)
|
|
+#endif
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ ble L, $L55
|
|
+#else
|
|
+ sll KK, BASE_SHIFT + 2, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 1, TMP2
|
|
+ addl B, TMP2, BO
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c03
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c07
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c04
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c08
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c05
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c06
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ ble L, $L55
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L52:
|
|
+ ADD c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ ldi L, -2(L)
|
|
+ MUL a2, b1, t2
|
|
+ unop
|
|
+
|
|
+ ADD c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ unop
|
|
+ MUL a3, b1, t3
|
|
+ unop
|
|
+
|
|
+ ADD c08, t4, b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL a4, b1, t4
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b2, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ MUL a2, b2, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ unop
|
|
+ MUL a4, b2, t4
|
|
+ LD a5, 3 * SIZE(AO)
|
|
+
|
|
+ ADD c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ LD b2, -1 * SIZE(BO)
|
|
+
|
|
+ ADD c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ unop
|
|
+
|
|
+ ADD c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ unop
|
|
+ MUL a3, b3, t3
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+
|
|
+ ADD c08, t4, b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL a5, b3, t4
|
|
+ LD b3, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b4, t1
|
|
+ LD a1, -4 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b4, t2
|
|
+ LD a2, -3 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+ MUL a3, b4, t3
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a5, b4, t4
|
|
+ LD b4, 1 * SIZE(BO)
|
|
+ bgt L, $L52
|
|
+ .align 4
|
|
+
|
|
+$L55:
|
|
+ ADD c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ fldd alpha, ALPHA
|
|
+ MUL a1, b1, t1
|
|
+#ifndef TRMMKERNEL
|
|
+ blbs K, $L58
|
|
+#else
|
|
+ blbs TMP1, $L58
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ MUL a2, b1, t2
|
|
+ ADD c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ MUL a3, b1, t3
|
|
+
|
|
+ ADD c08, t4, b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL a4, b1, t4
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b2, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b2, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a4, b2, t4
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L58:
|
|
+ ADD c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c09, 0 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ unop
|
|
+ MUL a3, b1, t3
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c10, 1 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c08, t4, b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL a4, b1, t4
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c11, 2 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b2, t1
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c12, 3 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b2, t2
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c13, 0 * SIZE(C2)
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c14, 1 * SIZE(C2)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ unop
|
|
+ MUL a4, b2, t4
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c15, 2 * SIZE(C2)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL alpha, c01, b5
|
|
+ fmov b5, c01
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c16, 3 * SIZE(C2)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ ldi I, -1(I)
|
|
+ MUL alpha, c02, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+
|
|
+ ADD c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ MUL alpha, c03, b5
|
|
+ fmov b5, c03
|
|
+ ADD c08, t4, b5
|
|
+ fmov b5, c08
|
|
+ MUL alpha, c04, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ MUL alpha, c05, b5
|
|
+ fmov b5, c05
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c01, c09, b5
|
|
+ fmov b5, c01
|
|
+#endif
|
|
+ MUL alpha, c06, b5
|
|
+ fmov b5, c06
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c02, c10, b5
|
|
+ fmov b5, c02
|
|
+#endif
|
|
+
|
|
+ MUL alpha, c07, b5
|
|
+ fmov b5, c07
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c03, c11, b5
|
|
+ fmov b5, c03
|
|
+#endif
|
|
+ MUL alpha, c08, b5
|
|
+ fmov b5, c08
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c04, c12, b5
|
|
+ fmov b5, c04
|
|
+#endif
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c05, c13, b5
|
|
+ fmov b5, c05
|
|
+#endif
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c06, c14, b5
|
|
+ fmov b5, c06
|
|
+#endif
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c07, c15, b5
|
|
+ fmov b5, c07
|
|
+#endif
|
|
+ ST c03, 2 * SIZE(C1)
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c08, c16, b5
|
|
+ fmov b5, c08
|
|
+#endif
|
|
+ ST c04, 3 * SIZE(C1)
|
|
+
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+ fclr t1
|
|
+ ST c06, 1 * SIZE(C2)
|
|
+ fclr t2
|
|
+ ST c07, 2 * SIZE(C2)
|
|
+ fclr t3
|
|
+ ST c08, 3 * SIZE(C2)
|
|
+ fclr t4
|
|
+
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+ ldi C2, 4 * SIZE(C2)
|
|
+
|
|
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+ subl K, KK, TMP1
|
|
+#ifdef LEFT
|
|
+ subl TMP1, 4, TMP1
|
|
+#else
|
|
+ subl TMP1, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#if defined(TRMMKERNEL) && defined(LEFT)
|
|
+ addl KK, 4, KK
|
|
+#endif
|
|
+ bgt I, $L51
|
|
+ .align 4
|
|
+
|
|
+$L60:
|
|
+ and M, 2, I
|
|
+ ble I, $L70
|
|
+
|
|
+#if !defined(TRMMKERNEL) || \
|
|
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+
|
|
+#ifdef TRMMKERNEL
|
|
+#ifdef LEFT
|
|
+ addl KK, 2, TMP1
|
|
+#else
|
|
+ addl KK, 2, TMP1
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+#ifndef TRMMKERNEL
|
|
+ ldi L, -2(K)
|
|
+#else
|
|
+ ldi L, -2(TMP1)
|
|
+#endif
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+ ble L, $L65
|
|
+#else
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 1, TMP2
|
|
+ addl B, TMP2, BO
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ ldi L, -2(TMP1)
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ ble L, $L65
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L62:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ ldi L, -2(L)
|
|
+ MUL a1, b2, t3
|
|
+ LD a1, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD a2, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ LD b2, 3 * SIZE(BO)
|
|
+ MUL a3, b3, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a4, b3, t2
|
|
+ LD b3, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL a3, b4, t3
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ MUL a4, b4, t4
|
|
+ LD b4, 1 * SIZE(BO)
|
|
+ unop
|
|
+
|
|
+ LD a4, 1 * SIZE(AO)
|
|
+ unop
|
|
+ unop
|
|
+ bgt L, $L62
|
|
+ .align 4
|
|
+
|
|
+$L65:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ fldd alpha, ALPHA
|
|
+ MUL a1, b1, t1
|
|
+#ifndef TRMMKERNEL
|
|
+ blbs K, $L68
|
|
+#else
|
|
+ blbs TMP1, $L68
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ MUL a1, b2, t3
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ LD b2, -1 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ .align 4
|
|
+
|
|
+$L68:
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c09, 0 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL a1, b2, t3
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c10, 1 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c11, 0 * SIZE(C2)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL alpha, c01, b5
|
|
+ fmov b5, c01
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c12, 1 * SIZE(C2)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ldi C1, 2 * SIZE(C1)
|
|
+ MUL alpha, c02, b5
|
|
+ fmov b5, c02
|
|
+ ldi C2, 2 * SIZE(C2)
|
|
+
|
|
+ ADD c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ MUL alpha, c05, b5
|
|
+ fmov b5, c05
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ MUL alpha, c06, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c01, c09, b5
|
|
+ fmov b5, c01
|
|
+ ADD c02, c10, b5
|
|
+ fmov b5, c02
|
|
+ ADD c05, c11, b5
|
|
+ fmov b5, c05
|
|
+ ADD c06, c12, b5
|
|
+ fmov b5, c06
|
|
+#endif
|
|
+
|
|
+ ST c01, -2 * SIZE(C1)
|
|
+ fclr t1
|
|
+ ST c02, -1 * SIZE(C1)
|
|
+ fclr t2
|
|
+ ST c05, -2 * SIZE(C2)
|
|
+ fclr t3
|
|
+ ST c06, -1 * SIZE(C2)
|
|
+ fclr t4
|
|
+
|
|
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+ subl K, KK, TMP1
|
|
+#ifdef LEFT
|
|
+ subl TMP1, 2, TMP1
|
|
+#else
|
|
+ subl TMP1, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#if defined(TRMMKERNEL) && defined(LEFT)
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L70:
|
|
+ and M, 1, I
|
|
+ ble I, $L79
|
|
+
|
|
+#if !defined(TRMMKERNEL) || \
|
|
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+
|
|
+#ifdef TRMMKERNEL
|
|
+#ifdef LEFT
|
|
+ addl KK, 1, TMP1
|
|
+#else
|
|
+ addl KK, 2, TMP1
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c06
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ldi L, -2(K)
|
|
+#else
|
|
+ ldi L, -2(TMP1)
|
|
+#endif
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+ ble L, $L75
|
|
+#else
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 1, TMP2
|
|
+ addl B, TMP2, BO
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c06
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ldi L, -2(K)
|
|
+#else
|
|
+ ldi L, -2(TMP1)
|
|
+#endif
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ ble L, $L75
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L72:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ldi L, -2(L)
|
|
+ MUL a1, b1, t1
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, b2, t2
|
|
+ LD a1, 1 * SIZE(AO)
|
|
+ LD b2, 3 * SIZE(BO)
|
|
+
|
|
+ ADD c02, t3, b5
|
|
+ fmov b5, c02
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a2, b3, t3
|
|
+ LD b3, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ MUL a2, b4, t4
|
|
+ LD a2, 0 * SIZE(AO)
|
|
+ LD b4, 5 * SIZE(BO)
|
|
+
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ unop
|
|
+ unop
|
|
+ bgt L, $L72
|
|
+ .align 4
|
|
+
|
|
+$L75:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ fldd alpha, ALPHA
|
|
+ MUL a1, b1, t1
|
|
+#ifndef TRMMKERNEL
|
|
+ blbs K, $L78
|
|
+#else
|
|
+ blbs TMP1, $L78
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, b2, t2
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L78:
|
|
+ ADD c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, b2, t2
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a5, 0 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c02, t3, b5
|
|
+ fmov b5, c02
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+#ifndef TRMMKERNEL
|
|
+ LD b5, 0 * SIZE(C2)
|
|
+ FIMOVD b5, tmp
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c01, c02, b5
|
|
+ fmov b5, c01
|
|
+ ADD c05, c06, b5
|
|
+ fmov b5, c05
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD c05, t2, b5
|
|
+ fmov b5, c05
|
|
+
|
|
+ MUL alpha, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL alpha, c05, b5
|
|
+ fmov b5, c05
|
|
+
|
|
+#ifndef TRMMKERNEL
+ IFMOVD tmp, b5
+ fstd b1, 88($sp)
+# FIMOVD b1, tmp
+ ADD c01, a5, b1
+ fmov b1, c01
+ ADD c05, b5, b1
+ fmov b1, c05
+ fldd b1, 88($sp)
+# IFMOVD tmp, b1
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+
|
|
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+ subl K, KK, TMP1
|
|
+#ifdef LEFT
|
|
+ subl TMP1, 1, TMP1
|
|
+#else
|
|
+ subl TMP1, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#if defined(TRMMKERNEL) && defined(LEFT)
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L79:
|
|
+ mov BO, B
|
|
+#if defined(TRMMKERNEL) && !defined(LEFT)
|
|
+ addl KK, 2, KK
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+ unop
|
|
+ unop
|
|
+ .align 4
|
|
+
|
|
+$L80:
|
|
+ and N, 1, J
|
|
+ ble J, $L999
|
|
+
|
|
+ mov C, C1
|
|
+ mov A, AO
|
|
+
|
|
+#if defined(TRMMKERNEL) && defined(LEFT)
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+ sra M, 2, I
|
|
+ ble I, $L100
|
|
+ .align 4
|
|
+
|
|
+$L91:
|
|
+#if !defined(TRMMKERNEL) || \
|
|
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+
|
|
+#ifdef TRMMKERNEL
|
|
+#ifdef LEFT
|
|
+ addl KK, 4, TMP1
|
|
+#else
|
|
+ addl KK, 1, TMP1
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c04
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ sra K, 2, L
|
|
+#else
|
|
+ sra TMP1, 2, L
|
|
+#endif
|
|
+ mov B, BO
|
|
+ unop
|
|
+ ble L, $L95
|
|
+#else
|
|
+ sll KK, BASE_SHIFT + 2, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 0, TMP2
|
|
+ addl B, TMP2, BO
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c04
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ sra K, 2, L
|
|
+#else
|
|
+ sra TMP1, 2, L
|
|
+#endif
|
|
+ unop
|
|
+ ble L, $L95
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L92:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ldi L, -1(L)
|
|
+ MUL a2, b1, t2
|
|
+ LD a2, 5 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL a3, b1, t3
|
|
+ LD a3, 6 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a4, b1, t4
|
|
+ LD a4, 7 * SIZE(AO)
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b2, t1
|
|
+ LD a1, 8 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b2, t2
|
|
+ LD a2, 9 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ LD a3, 10 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a4, b2, t4
|
|
+ LD a4, 11 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ LD a1, 12 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ LD a2, 13 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL a3, b3, t3
|
|
+ LD a3, 14 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a4, b3, t4
|
|
+ LD a5, 15 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b4, t1
|
|
+ LD a1, 16 * SIZE(AO)
|
|
+ ldi AO, 16 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ MUL a2, b4, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ MUL a3, b4, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a5, b4, t4
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ bgt L, $L92
|
|
+ .align 4
|
|
+
|
|
+$L95:
|
|
+#ifndef TRMMKERNEL
|
|
+ and K, 3, L
|
|
+#else
|
|
+ and TMP1, 3, L
|
|
+#endif
|
|
+ fldd alpha, ALPHA
|
|
+ unop
|
|
+ ble L, $L98
|
|
+ .align 4
|
|
+
|
|
+$L96:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ldi L, -1(L)
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ldi BO, 1 * SIZE(BO)
|
|
+ MUL a2, b1, t2
|
|
+ LD a2, 5 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL a3, b1, t3
|
|
+ LD a3, 6 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a4, b1, t4
|
|
+ LD a4, 7 * SIZE(AO)
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ bgt L, $L96
|
|
+ .align 4
|
|
+
|
|
+$L98:
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ LD c05, 0 * SIZE(C1)
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ LD c06, 1 * SIZE(C1)
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ LD c07, 2 * SIZE(C1)
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ LD c08, 3 * SIZE(C1)
|
|
+#else
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+#endif
|
|
+
|
|
+ MUL alpha, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL alpha, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL alpha, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL alpha, c04, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c01, c05, b5
|
|
+ fmov b5, c01
|
|
+ ADD c02, c06, b5
|
|
+ fmov b5, c02
|
|
+ ADD c03, c07, b5
|
|
+ fmov b5, c03
|
|
+ ADD c04, c08, b5
|
|
+ fmov b5, c04
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c03, 2 * SIZE(C1)
|
|
+ ST c04, 3 * SIZE(C1)
|
|
+
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+
|
|
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+ subl K, KK, TMP1
|
|
+#ifdef LEFT
|
|
+ subl TMP1, 4, TMP1
|
|
+#else
|
|
+ subl TMP1, 1, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#if defined(TRMMKERNEL) && defined(LEFT)
|
|
+ addl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L91
|
|
+ .align 4
|
|
+
|
|
+$L100:
|
|
+ and M, 2, I
|
|
+ unop
|
|
+ unop
|
|
+ ble I, $L110
|
|
+ .align 4
|
|
+
|
|
+$L101:
|
|
+#if !defined(TRMMKERNEL) || \
|
|
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+
|
|
+#ifdef TRMMKERNEL
|
|
+#ifdef LEFT
|
|
+ addl KK, 2, TMP1
|
|
+#else
|
|
+ addl KK, 1, TMP1
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c04
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ sra K, 2, L
|
|
+#else
|
|
+ sra TMP1, 2, L
|
|
+#endif
|
|
+ mov B, BO
|
|
+ unop
|
|
+ ble L, $L105
|
|
+#else
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 0, TMP2
|
|
+ addl B, TMP2, BO
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c04
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ sra K, 2, L
|
|
+#else
|
|
+ sra TMP1, 2, L
|
|
+#endif
|
|
+ unop
|
|
+ ble L, $L105
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L102:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ldi L, -1(L)
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, b1, t2
|
|
+ LD a2, 5 * SIZE(AO)
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ MUL a3, b2, t3
|
|
+ LD a3, 6 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a4, b2, t4
|
|
+ LD a5, 7 * SIZE(AO)
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b3, t1
|
|
+ LD a1, 8 * SIZE(AO)
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, b3, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ MUL a3, b4, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a5, b4, t4
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ bgt L, $L102
|
|
+ .align 4
|
|
+
|
|
+$L105:
|
|
+#ifndef TRMMKERNEL
|
|
+ and K, 3, L
|
|
+#else
|
|
+ and TMP1, 3, L
|
|
+#endif
|
|
+ fldd alpha, ALPHA
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a3, 0 * SIZE(C1)
|
|
+ LD a4, 1 * SIZE(C1)
|
|
+#endif
|
|
+ ble L, $L108
|
|
+ .align 4
|
|
+
|
|
+$L106:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ldi L, -1(L)
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, b1, t2
|
|
+ LD a2, 3 * SIZE(AO)
|
|
+ LD b1, 1 * SIZE(BO)
|
|
+
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ unop
|
|
+ ldi BO, 1 * SIZE(BO)
|
|
+ bgt L, $L106
|
|
+ .align 4
|
|
+
|
|
+$L108:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ fclr t1
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ fclr t2
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ fclr t3
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ fclr t4
|
|
+
|
|
+ ADD c01, c03, b5
|
|
+ fmov b5, c01
|
|
+ ADD c02, c04, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ MUL alpha, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL alpha, c02, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c01, a3, b5
|
|
+ fmov b5, c01
|
|
+ ADD c02, a4, b5
|
|
+ fmov b5, c02
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ldi C1, 2 * SIZE(C1)
|
|
+
|
|
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+ subl K, KK, TMP1
|
|
+#ifdef LEFT
|
|
+ subl TMP1, 2, TMP1
|
|
+#else
|
|
+ subl TMP1, 1, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#if defined(TRMMKERNEL) && defined(LEFT)
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L110:
|
|
+ and M, 1, I
|
|
+ ble I, $L999
|
|
+ .align 4
|
|
+
|
|
+$L111:
|
|
+#if !defined(TRMMKERNEL) || \
|
|
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+
|
|
+#ifdef TRMMKERNEL
|
|
+#ifdef LEFT
|
|
+ addl KK, 1, TMP1
|
|
+#else
|
|
+ addl KK, 1, TMP1
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c04
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ sra K, 2, L
|
|
+#else
|
|
+ sra TMP1, 2, L
|
|
+#endif
|
|
+ mov B, BO
|
|
+ unop
|
|
+ ble L, $L115
|
|
+#else
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 0, TMP2
|
|
+ addl B, TMP2, BO
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c04
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ sra K, 2, L
|
|
+#else
|
|
+ sra TMP1, 2, L
|
|
+#endif
|
|
+ unop
|
|
+ ble L, $L115
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L112:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, b2, t2
|
|
+ LD a2, 5 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ MUL a3, b3, t3
|
|
+ LD a3, 6 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a4, b4, t4
|
|
+ LD a4, 7 * SIZE(AO)
|
|
+ LD b4, 7 * SIZE(BO)
|
|
+
|
|
+ ldi L, -1(L)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ bgt L, $L112
|
|
+ .align 4
|
|
+
|
|
+$L115:
|
|
+#ifndef TRMMKERNEL
|
|
+ and K, 3, L
|
|
+#else
|
|
+ and TMP1, 3, L
|
|
+#endif
|
|
+ fldd alpha, ALPHA
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a2, 0 * SIZE(C1)
|
|
+#endif
|
|
+ ble L, $L118
|
|
+ .align 4
|
|
+
|
|
+$L116:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, 1 * SIZE(AO)
|
|
+ LD b1, 1 * SIZE(BO)
|
|
+
|
|
+ ldi L, -1(L)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ ldi BO, 1 * SIZE(BO)
|
|
+ bgt L, $L116
|
|
+ .align 4
|
|
+
|
|
+$L118:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ ADD c01, c02, b5
|
|
+ fmov b5, c01
|
|
+ ADD c03, c04, b5
|
|
+ fmov b5, c03
|
|
+ ADD c01, c03, b5
|
|
+ fmov b5, c01
|
|
+
|
|
+ MUL alpha, c01, b5
|
|
+ fmov b5, c01
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c01, a2, b5
|
|
+ fmov b5, c01
|
|
+#endif
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ .align 4
|
|
+
|
|
+$L999:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+ ldl $9, 80($sp)
+ clr $0
+ ldi $sp, STACKSIZE($sp)
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/gemm_kernel_4x4.S.bak b/kernel/sw_64/gemm_kernel_4x4.S.bak
new file mode 100644
index 0000000..10dc98d
--- /dev/null
+++ b/kernel/sw_64/gemm_kernel_4x4.S.bak
@@ -0,0 +1,2844 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#if !defined(SW2B)
+#error "Architecture is not specified."
+#endif
+
+#ifdef SW2B
+#define PREFETCHSIZE 56
+#define UNOP nop
+#endif
+
+
+#define STACKSIZE 80
+
+#define M $16
+#define N $17
+#define K $18
+#define A $20
+#define B $21
+#define C $22
+#define LDC $23
+
+#define C1 $19
+#define C2 $24
+#define C3 $25
+#define C4 $27
+
+#define AO $at
+#define BO $5
+#define I $6
+#define J $7
+#define L $8
+
+#define a1 $f16
+#define a2 $f17
+#define a3 $f18
+#define a4 $f19
+
+#define b1 $f20
+#define b2 $f21
+#define b3 $f22
+#define b4 $f23
+
+#define t1 $f24
+#define t2 $f25
+#define t3 $f26
+#define t4 $f27
+
+#define a5 $f28
+#define a6 $f30
+#define b5 $f29
+
+#define alpha $f30
+
+#define c01 $f0
+#define c02 $f1
+#define c03 $f2
+#define c04 $f3
+
+#define c05 $f4
+#define c06 $f5
+#define c07 $f6
+#define c08 $f7
+
+#define c09 $f8
+#define c10 $f9
+#define c11 $f10
+#define c12 $f11
+
+#define c13 $f12
+#define c14 $f13
+#define c15 $f14
+#define c16 $f15
+
+#define TMP1 $0
+#define TMP2 $1
+#define KK $2
+#define BB $3
+#define OFFSET $4
+
+#define ALPHA 64($sp)
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, STACKSIZE, $26, 0
+
+ ldi $sp, -STACKSIZE($sp)
+
+ ldl C, 0 + STACKSIZE($sp)
+ ldl LDC, 8 + STACKSIZE($sp)
+#ifdef TRMMKERNEL
+ ldl OFFSET, 16 + STACKSIZE($sp)
+#endif
+
+ SXADDQ LDC, 0, LDC
+
+ fstd $f2, 0($sp)
+ fstd $f3, 8($sp)
+ fstd $f4, 16($sp)
+ fstd $f5, 24($sp)
+ fstd $f6, 32($sp)
+ fstd $f7, 40($sp)
+ fstd $f8, 48($sp)
+ fstd $f9, 56($sp)
+ fstd $f19, ALPHA
+
+ cmple M, 0, $0
+ cmple N, 0, $1
+ cmple K, 0, $2
+
+ or $0, $1, $0
+ or $0, $2, $0
+ bne $0, $L999
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ subl $31, OFFSET, KK
+#endif
+
+ sra N, 2, J
+ ble J, $L40
+ .align 4
|
|
+
|
|
+$L01:
|
|
+ mov C, C1
|
|
+ addl C, LDC, C2
|
|
+ mov A, AO
|
|
+ s4addl K, 0, BB
|
|
+
|
|
+#if defined(TRMMKERNEL) && defined(LEFT)
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+ addl C2, LDC, C3
|
|
+ s4addl LDC, C, C
|
|
+
|
|
+ SXADDQ BB, B, BB
|
|
+ fclr t1
|
|
+ addl C3, LDC, C4
|
|
+ fclr t2
|
|
+
|
|
+ sra M, 2, I
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+ ble I, $L20
|
|
+ .align 4
|
|
+
|
|
+$L11:
|
|
+#if defined(EV5) || defined(EV6) || defined(SW2B)
|
|
+ fillcs 0 * SIZE(BB)
|
|
+ fillcs 8 * SIZE(BB)
|
|
+ unop
|
|
+ ldi BB, 16 * SIZE(BB)
|
|
+#endif
|
|
+
|
|
+#if !defined(TRMMKERNEL) || \
|
|
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+
|
|
+#ifdef TRMMKERNEL
|
|
+#ifdef LEFT
|
|
+ addl KK, 4, TMP1
|
|
+#else
|
|
+ addl KK, 4, TMP1
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c11
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c12
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c16
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c15
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c02
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c06
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c05
|
|
+
|
|
+ fillcs 4 * SIZE(C1)
|
|
+ fclr c03
|
|
+#ifndef TRMMKERNEL
|
|
+ ldi L, -2(K)
|
|
+#else
|
|
+ ldi L, -2(TMP1)
|
|
+#endif
|
|
+ fclr c04
|
|
+
|
|
+ fillcs 7 * SIZE(C2)
|
|
+ fclr c08
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+ fclr c13
|
|
+
|
|
+ fillcs 4 * SIZE(C3)
|
|
+ fclr c09
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c10
|
|
+
|
|
+#else
|
|
+ sll KK, BASE_SHIFT + 2, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ addl B, TMP1, BO
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c11
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c12
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c16
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c15
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c02
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c06
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c05
|
|
+
|
|
+ fillcs 4 * SIZE(C1)
|
|
+ fclr c03
|
|
+ ldi L, -2(TMP1)
|
|
+ fclr c04
|
|
+
|
|
+ fillcs 7 * SIZE(C2)
|
|
+ fclr c08
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ fclr c13
|
|
+
|
|
+ fillcs 4 * SIZE(C3)
|
|
+ fclr c09
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c10
|
|
+#endif
|
|
+
|
|
+ fillcs 7 * SIZE(C4)
|
|
+ fclr c14
|
|
+ fclr c07
|
|
+ ble L, $L15
|
|
+ .align 5
|
|
+
|
|
+$L12:
|
|
+/* 1 */
|
|
+ ADD c11, t1, c11
|
|
+#ifndef EV4
|
|
+ fillcs PREFETCHSIZE * SIZE(AO)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+ MUL b1, a1, t1
|
|
+#ifndef EV4
|
|
+ fillcs PREFETCHSIZE * SIZE(BO)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c12, t2, c12
|
|
+ unop
|
|
+ MUL b1, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD c16, t3, c16
|
|
+ unop
|
|
+ MUL b2, a2, t3
|
|
+ LD a5, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c15, t4, c15
|
|
+ unop
|
|
+ MUL b2, a1, t4
|
|
+ LD b5, 0 * SIZE(BO)
|
|
+
|
|
+/* 2 */
|
|
+ ADD c01, t1, c01
|
|
+ UNOP
|
|
+ MUL b1, a3, t1
|
|
+ UNOP
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ UNOP
|
|
+ MUL b1, a4, t2
|
|
+ UNOP
|
|
+
|
|
+ ADD c06, t3, c06
|
|
+ unop
|
|
+ MUL b2, a4, t3
|
|
+ unop
|
|
+
|
|
+ ADD c05, t4, c05
|
|
+ unop
|
|
+ MUL b4, a1, t4
|
|
+ unop
|
|
+
|
|
+/* 3 */
|
|
+ ADD c03, t1, c03
|
|
+ unop
|
|
+ MUL b3, a1, t1
|
|
+ unop
|
|
+
|
|
+ ADD c04, t2, c04
|
|
+ unop
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD c08, t3, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c13, t4, c13
|
|
+ unop
|
|
+ MUL b2, a3, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+/* 4 */
|
|
+ ADD c09, t1, c09
|
|
+ unop
|
|
+ MUL b3, a3, t1
|
|
+ LD a6, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c10, t2, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c14, t3, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ ADD c07, t4, c07
|
|
+ unop
|
|
+ MUL b4, a3, t4
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+
|
|
+/* 5 */
|
|
+ ADD c11, t1, c11
|
|
+ unop
|
|
+ MUL b5, a5, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c12, t2, c12
|
|
+ ldi L, -2(L)
|
|
+ MUL b5, a2, t2
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c16, t3, c16
|
|
+ unop
|
|
+ MUL b2, a2, t3
|
|
+ unop
|
|
+
|
|
+ ADD c15, t4, c15
|
|
+ unop
|
|
+ MUL b2, a5, t4
|
|
+ unop
|
|
+
|
|
+/* 6 */
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL b5, a6, t1
|
|
+ unop
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL b5, a4, t2
|
|
+ unop
|
|
+
|
|
+ ADD c06, t3, c06
|
|
+ unop
|
|
+ MUL b2, a4, t3
|
|
+ unop
|
|
+
|
|
+ ADD c05, t4, c05
|
|
+ unop
|
|
+ MUL b4, a5, t4
|
|
+ unop
|
|
+
|
|
+/* 7 */
|
|
+ ADD c03, t1, c03
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+ MUL b3, a5, t1
|
|
+ unop
|
|
+
|
|
+ ADD c04, t2, c04
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD c08, t3, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, -3 * SIZE(AO)
|
|
+
|
|
+ ADD c13, t4, c13
|
|
+ unop
|
|
+ MUL b2, a6, t4
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+/* 8 */
|
|
+ ADD c09, t1, c09
|
|
+ unop
|
|
+ MUL b3, a6, t1
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c10, t2, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD c14, t3, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c07, t4, c07
|
|
+ MUL b4, a6, t4
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+ bgt L, $L12
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ ADD c11, t1, c11
|
|
+ fldd alpha, ALPHA
|
|
+ MUL b1, a1, t1
|
|
+#ifndef TRMMKERNEL
|
|
+ blbs K, $L18
|
|
+#else
|
|
+ blbs TMP1, $L18
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c12, t2, c12
|
|
+ MUL b1, a2, t2
|
|
+ ADD c16, t3, c16
|
|
+ MUL b2, a2, t3
|
|
+
|
|
+ ADD c15, t4, c15
|
|
+ MUL b2, a1, t4
|
|
+ ADD c01, t1, c01
|
|
+ MUL b1, a3, t1
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL b1, a4, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c06, t3, c06
|
|
+ MUL b2, a4, t3
|
|
+ ADD c05, t4, c05
|
|
+ MUL b4, a1, t4
|
|
+
|
|
+ ADD c03, t1, c03
|
|
+ unop
|
|
+ MUL b3, a1, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t2, c04
|
|
+ unop
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD c08, t3, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c13, t4, c13
|
|
+ unop
|
|
+ MUL b2, a3, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t1, c09
|
|
+ unop
|
|
+ MUL b3, a3, t1
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c10, t2, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c14, t3, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c07, t4, c07
|
|
+ unop
|
|
+ MUL b4, a3, t4
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c11, t1, c11
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL b1, a1, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L18:
|
|
+ ADD c12, t2, c12
|
|
+ unop
|
|
+ MUL b1, a2, t2
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a5, 0 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c16, t3, c16
|
|
+ unop
|
|
+ MUL b2, a2, t3
|
|
+ unop
|
|
+
|
|
+ ADD c15, t4, c15
|
|
+ unop
|
|
+ MUL b2, a1, t4
|
|
+#ifndef TRMMKERNEL
|
|
+ LD b5, 1 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL b1, a3, t1
|
|
+ unop
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL b1, a4, t2
|
|
+#ifndef TRMMKERNEL
|
|
+ LD b1, 0 * SIZE(C2)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c06, t3, c06
|
|
+ unop
|
|
+ MUL b2, a4, t3
|
|
+ unop
|
|
+
|
|
+ ADD c05, t4, c05
|
|
+ unop
|
|
+ MUL b4, a1, t4
|
|
+ unop
|
|
+
|
|
+ ADD c03, t1, c03
|
|
+ unop
|
|
+ MUL b3, a1, t1
|
|
+ unop
|
|
+
|
|
+ ADD c04, t2, c04
|
|
+ unop
|
|
+ MUL b3, a2, t2
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a1, 0 * SIZE(C3)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c08, t3, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a2, 2 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c13, t4, c13
|
|
+ unop
|
|
+ MUL b2, a3, t4
|
|
+#ifndef TRMMKERNEL
|
|
+ LD b2, 3 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c09, t1, c09
|
|
+ ldi I, -1(I)
|
|
+ MUL b3, a3, t1
|
|
+ unop
|
|
+
|
|
+ ADD c10, t2, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+#ifndef TRMMKERNEL
|
|
+ LD b3, 0 * SIZE(C4)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c14, t3, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a4, 1 * SIZE(C2)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c07, t4, c07
|
|
+ unop
|
|
+ MUL b4, a3, t4
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a3, 2 * SIZE(C2)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c11, t1, c11
|
|
+ unop
|
|
+ MUL alpha, c01, c01
|
|
+#ifndef TRMMKERNEL
|
|
+ LD b4, 3 * SIZE(C2)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c12, t2, c12
|
|
+ unop
|
|
+ MUL alpha, c02, c02
|
|
+#ifndef TRMMKERNEL
|
|
+ LD t1, 1 * SIZE(C3)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c16, t3, c16
|
|
+ unop
|
|
+ MUL alpha, c03, c03
|
|
+#ifndef TRMMKERNEL
|
|
+ LD t2, 2 * SIZE(C3)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c15, t4, c15
|
|
+ unop
|
|
+ MUL alpha, c04, c04
|
|
+#ifndef TRMMKERNEL
|
|
+ LD t3, 3 * SIZE(C3)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ MUL alpha, c05, c05
|
|
+ unop
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c01, a5, c01
|
|
+ LD t4, 1 * SIZE(C4)
|
|
+#else
|
|
+ unop
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ MUL alpha, c06, c06
|
|
+#ifndef TRMMKERNEL
|
|
+ unop
|
|
+ ADD c02, b5, c02
|
|
+ LD a5, 2 * SIZE(C4)
|
|
+#endif
|
|
+
|
|
+ MUL alpha, c07, c07
|
|
+#ifndef TRMMKERNEL
|
|
+ unop
|
|
+ ADD c03, a2, c03
|
|
+ LD b5, 3 * SIZE(C4)
|
|
+#endif
|
|
+
|
|
+ MUL alpha, c08, c08
|
|
+#ifndef TRMMKERNEL
|
|
+ unop
|
|
+ ADD c04, b2, c04
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ MUL alpha, c09, c09
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c05, b1, c05
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ MUL alpha, c10, c10
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c06, a4, c06
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ MUL alpha, c11, c11
|
|
+ ST c03, 2 * SIZE(C1)
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c07, a3, c07
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ MUL alpha, c12, c12
|
|
+ ST c04, 3 * SIZE(C1)
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c08, b4, c08
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+
|
|
+ MUL alpha, c13, c13
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c09, a1, c09
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ MUL alpha, c14, c14
|
|
+ ST c06, 1 * SIZE(C2)
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c10, t1, c10
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ MUL alpha, c15, c15
|
|
+ ST c07, 2 * SIZE(C2)
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c11, t2, c11
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ MUL alpha, c16, c16
|
|
+ ST c08, 3 * SIZE(C2)
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c12, t3, c12
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+ ldi C2, 4 * SIZE(C2)
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c13, b3, c13
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+ ST c09, 0 * SIZE(C3)
|
|
+ fclr t1
|
|
+ ldi C4, 4 * SIZE(C4)
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c14, t4, c14
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+ ST c10, 1 * SIZE(C3)
|
|
+ fclr t2
|
|
+ unop
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c15, a5, c15
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+ ST c11, 2 * SIZE(C3)
|
|
+ fclr t3
|
|
+ unop
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c16, b5, c16
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+ ST c12, 3 * SIZE(C3)
|
|
+ fclr t4
|
|
+ ldi C3, 4 * SIZE(C3)
|
|
+
|
|
+ ST c13, -4 * SIZE(C4)
|
|
+ ST c14, -3 * SIZE(C4)
|
|
+ ST c15, -2 * SIZE(C4)
|
|
+ ST c16, -1 * SIZE(C4)
|
|
+
|
|
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+ subl K, KK, TMP1
|
|
+#ifdef LEFT
|
|
+ subl TMP1, 4, TMP1
|
|
+#else
|
|
+ subl TMP1, 4, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ addl BO, TMP1, BO
|
|
+#endif
|
|
+
|
|
+#if defined(TRMMKERNEL) && defined(LEFT)
|
|
+ addl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+ bgt I, $L11
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ and M, 2, I
|
|
+ ble I, $L30
|
|
+
|
|
+#if !defined(TRMMKERNEL) || \
|
|
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+
|
|
+#ifdef TRMMKERNEL
|
|
+#ifdef LEFT
|
|
+ addl KK, 2, TMP1
|
|
+#else
|
|
+ addl KK, 4, TMP1
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c10
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c14
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+#ifndef TRMMKERNEL
|
|
+ ldi L, -2(K)
|
|
+#else
|
|
+ ldi L, -2(TMP1)
|
|
+#endif
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c05
|
|
+
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+ fclr c02
|
|
+ fclr c06
|
|
+ ble L, $L25
|
|
+
|
|
+#else
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 2, TMP2
|
|
+ addl B, TMP2, BO
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c10
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c14
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ ldi L, -2(TMP1)
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c05
|
|
+
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ fclr c02
|
|
+ fclr c06
|
|
+ ble L, $L25
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ ADD c09, t1, c09
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD c10, t2, c10
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t3, c13
|
|
+ unop
|
|
+ MUL a1, b2, t3
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+
|
|
+ ADD c14, t4, c14
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD b2, -7 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ unop
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ LD b3, -6 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, c05
|
|
+ unop
|
|
+ MUL a1, b4, t3
|
|
+ LD a1, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, c06
|
|
+ MUL a2, b4, t4
|
|
+ LD b5, -5 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t1, c09
|
|
+ unop
|
|
+ MUL a3, b1, t1
|
|
+ LD a2, 3 * SIZE(AO)
|
|
+
|
|
+ ADD c10, t2, c10
|
|
+ unop
|
|
+ MUL a4, b1, t2
|
|
+ LD b1, -4 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t3, c13
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c14, t4, c14
|
|
+ MUL a4, b2, t4
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ ldi L, -2(L)
|
|
+ MUL a3, b3, t1
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a4, b3, t2
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, c05
|
|
+ unop
|
|
+ MUL a3, b5, t3
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, c06
|
|
+ MUL a4, b5, t4
|
|
+ LD a4, 1 * SIZE(AO)
|
|
+ bgt L, $L22
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ ADD c09, t1, c09
|
|
+ fldd alpha, ALPHA
|
|
+ MUL a1, b1, t1
|
|
+#ifndef TRMMKERNEL
|
|
+ blbs K, $L28
|
|
+#else
|
|
+ blbs TMP1, $L28
|
|
+#endif
|
|
+
|
|
+ ADD c10, t2, c10
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t3, c13
|
|
+ unop
|
|
+ MUL a1, b2, t3
|
|
+ unop
|
|
+
|
|
+ ADD c14, t4, c14
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, c05
|
|
+ unop
|
|
+ MUL a1, b4, t3
|
|
+ LD a1, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, c06
|
|
+ unop
|
|
+ MUL a2, b4, t4
|
|
+ LD a2, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c09, t1, c09
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L28:
|
|
+ ADD c10, t2, c10
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a3, 0 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c13, t3, c13
|
|
+ unop
|
|
+ MUL a1, b2, t3
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a4, 1 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c14, t4, c14
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a5, 0 * SIZE(C2)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+#ifndef TRMMKERNEL
|
|
+ LD b5, 1 * SIZE(C2)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+#ifndef TRMMKERNEL
|
|
+ LD b1, 0 * SIZE(C3)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c05, t3, c05
|
|
+ unop
|
|
+ MUL a1, b4, t3
|
|
+#ifndef TRMMKERNEL
|
|
+ LD b2, 1 * SIZE(C3)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c06, t4, c06
|
|
+ unop
|
|
+ MUL a2, b4, t4
|
|
+#ifndef TRMMKERNEL
|
|
+ LD b3, 0 * SIZE(C4)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c09, t1, c09
|
|
+ unop
|
|
+ MUL alpha, c01, c01
|
|
+#ifndef TRMMKERNEL
|
|
+ LD b4, 1 * SIZE(C4)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c10, t2, c10
|
|
+ unop
|
|
+ MUL alpha, c02, c02
|
|
+ unop
|
|
+
|
|
+ ADD c13, t3, c13
|
|
+ MUL alpha, c05, c05
|
|
+ ADD c14, t4, c14
|
|
+ MUL alpha, c06, c06
|
|
+
|
|
+ MUL alpha, c09, c09
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c01, a3, c01
|
|
+#endif
|
|
+ MUL alpha, c10, c10
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c02, a4, c02
|
|
+#endif
|
|
+
|
|
+ MUL alpha, c13, c13
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c05, a5, c05
|
|
+#endif
|
|
+ MUL alpha, c14, c14
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c06, b5, c06
|
|
+#endif
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c09, b1, c09
|
|
+ unop
|
|
+#endif
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ fclr t1
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c10, b2, c10
|
|
+ unop
|
|
+#endif
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ fclr t2
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c13, b3, c13
|
|
+ unop
|
|
+#endif
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+ fclr t3
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c14, b4, c14
|
|
+ unop
|
|
+#endif
|
|
+ ST c06, 1 * SIZE(C2)
|
|
+ fclr t4
|
|
+
|
|
+ ST c09, 0 * SIZE(C3)
|
|
+ ldi C1, 2 * SIZE(C1)
|
|
+ ST c10, 1 * SIZE(C3)
|
|
+ ldi C2, 2 * SIZE(C2)
|
|
+
|
|
+ ST c13, 0 * SIZE(C4)
|
|
+ ldi C3, 2 * SIZE(C3)
|
|
+ ST c14, 1 * SIZE(C4)
|
|
+ ldi C4, 2 * SIZE(C4)
|
|
+
|
|
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+ subl K, KK, TMP1
|
|
+#ifdef LEFT
|
|
+ subl TMP1, 2, TMP1
|
|
+#else
|
|
+ subl TMP1, 4, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#if defined(TRMMKERNEL) && defined(LEFT)
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L30:
|
|
+ and M, 1, I
|
|
+ ble I, $L39
|
|
+
|
|
+#if !defined(TRMMKERNEL) || \
|
|
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+
|
|
+#ifdef TRMMKERNEL
|
|
+#ifdef LEFT
|
|
+ addl KK, 1, TMP1
|
|
+#else
|
|
+ addl KK, 4, TMP1
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+#ifndef TRMMKERNEL
|
|
+ ldi L, -2(K)
|
|
+#else
|
|
+ ldi L, -2(TMP1)
|
|
+#endif
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c09
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c13
|
|
+
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+ ble L, $L35
|
|
+#else
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 2, TMP2
|
|
+ addl B, TMP2, BO
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ ldi L, -2(TMP1)
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c09
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c13
|
|
+
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ ble L, $L35
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L32:
|
|
+ ADD c01, t1, c01
|
|
+ ldi L, -2(L)
|
|
+ MUL a1, b1, t1
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t2, c05
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a1, b2, t2
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t3, c09
|
|
+ LD b5, 3 * SIZE(BO)
|
|
+ MUL a1, b3, t3
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t4, c13
|
|
+ MUL a1, b4, t4
|
|
+ LD a1, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ MUL a2, b1, t1
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t2, c05
|
|
+ MUL a2, b2, t2
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t3, c09
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+ MUL a2, b3, t3
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t4, c13
|
|
+ MUL a2, b5, t4
|
|
+ LD a2, 0 * SIZE(AO)
|
|
+ bgt L, $L32
|
|
+ .align 4
|
|
+
|
|
+$L35:
|
|
+ ADD c01, t1, c01
|
|
+ fldd alpha, ALPHA
|
|
+ MUL a1, b1, t1
|
|
+#ifndef TRMMKERNEL
|
|
+ blbs K, $L38
|
|
+#else
|
|
+ blbs TMP1, $L38
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c05, t2, c05
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ MUL a1, b2, t2
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t3, c09
|
|
+ MUL a1, b3, t3
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t4, c13
|
|
+ MUL a1, b4, t4
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L38:
|
|
+ ADD c05, t2, c05
|
|
+ unop
|
|
+ MUL a1, b2, t2
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a5, 0 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c09, t3, c09
|
|
+ unop
|
|
+ MUL a1, b3, t3
|
|
+#ifndef TRMMKERNEL
|
|
+ LD b5, 0 * SIZE(C2)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c13, t4, c13
|
|
+ unop
|
|
+ MUL a1, b4, t4
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a2, 0 * SIZE(C3)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL alpha, c01, c01
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a3, 0 * SIZE(C4)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c05, t2, c05
|
|
+ unop
|
|
+ MUL alpha, c05, c05
|
|
+ unop
|
|
+
|
|
+ ADD c09, t3, c09
|
|
+ MUL alpha, c09, c09
|
|
+ ADD c13, t4, c13
|
|
+ MUL alpha, c13, c13
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c01, a5, c01
|
|
+ ADD c05, b5, c05
|
|
+ ADD c09, a2, c09
|
|
+ ADD c13, a3, c13
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+ ST c09, 0 * SIZE(C3)
|
|
+ ST c13, 0 * SIZE(C4)
|
|
+
|
|
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+ subl K, KK, TMP1
|
|
+#ifdef LEFT
|
|
+ subl TMP1, 1, TMP1
|
|
+#else
|
|
+ subl TMP1, 4, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#if defined(TRMMKERNEL) && defined(LEFT)
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L39:
|
|
+ mov BO, B
|
|
+ ldi J, -1(J)
|
|
+#if defined(TRMMKERNEL) && !defined(LEFT)
|
|
+ addl KK, 4, KK
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+ bgt J, $L01
|
|
+ .align 4
|
|
+
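+# Remaining columns: the N & 2 case is handled from $L40 below, and the last
+# single column (N & 1) is handled from $L80.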
|
|
+$L40:
|
|
+ and N, 2, J
|
|
+ ble J, $L80
|
|
+
|
|
+ mov C, C1
|
|
+ addl C, LDC, C2
|
|
+ mov A, AO
|
|
+ fclr t1
|
|
+ addl C2, LDC, C
|
|
+ fclr t2
|
|
+
|
|
+#if defined(TRMMKERNEL) && defined(LEFT)
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+ sra M, 2, I
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+ ble I, $L60
|
|
+ .align 4
|
|
+
|
|
+$L51:
|
|
+#if !defined(TRMMKERNEL) || \
|
|
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+
|
|
+#ifdef TRMMKERNEL
|
|
+#ifdef LEFT
|
|
+ addl KK, 4, TMP1
|
|
+#else
|
|
+ addl KK, 2, TMP1
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c03
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c07
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c04
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c08
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c05
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c06
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ldi L, -2(K)
|
|
+#else
|
|
+ ldi L, -2(TMP1)
|
|
+#endif
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ ble L, $L55
|
|
+#else
|
|
+ sll KK, BASE_SHIFT + 2, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 1, TMP2
|
|
+ addl B, TMP2, BO
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c03
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c07
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c04
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c08
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c05
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c06
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ ble L, $L55
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L52:
|
|
+ ADD c05, t1, c05
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD c06, t2, c06
|
|
+ ldi L, -2(L)
|
|
+ MUL a2, b1, t2
|
|
+ unop
|
|
+
|
|
+ ADD c07, t3, c07
|
|
+ unop
|
|
+ MUL a3, b1, t3
|
|
+ unop
|
|
+
|
|
+ ADD c08, t4, c08
|
|
+ unop
|
|
+ MUL a4, b1, t4
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b2, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ MUL a2, b2, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ unop
|
|
+ MUL a4, b2, t4
|
|
+ LD a5, 3 * SIZE(AO)
|
|
+
|
|
+ ADD c05, t1, c05
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ LD b2, -1 * SIZE(BO)
|
|
+
|
|
+ ADD c06, t2, c06
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ unop
|
|
+
|
|
+ ADD c07, t3, c07
|
|
+ unop
|
|
+ MUL a3, b3, t3
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+
|
|
+ ADD c08, t4, c08
|
|
+ unop
|
|
+ MUL a5, b3, t4
|
|
+ LD b3, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b4, t1
|
|
+ LD a1, -4 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b4, t2
|
|
+ LD a2, -3 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+ MUL a3, b4, t3
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a5, b4, t4
|
|
+ LD b4, 1 * SIZE(BO)
|
|
+ bgt L, $L52
|
|
+ .align 4
|
|
+
|
|
+$L55:
|
|
+ ADD c05, t1, c05
|
|
+ fldd alpha, ALPHA
|
|
+ MUL a1, b1, t1
|
|
+#ifndef TRMMKERNEL
|
|
+ blbs K, $L58
|
|
+#else
|
|
+ blbs TMP1, $L58
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c06, t2, c06
|
|
+ MUL a2, b1, t2
|
|
+ ADD c07, t3, c07
|
|
+ MUL a3, b1, t3
|
|
+
|
|
+ ADD c08, t4, c08
|
|
+ unop
|
|
+ MUL a4, b1, t4
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b2, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b2, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a4, b2, t4
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c05, t1, c05
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L58:
|
|
+ ADD c06, t2, c06
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c09, 0 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c07, t3, c07
|
|
+ unop
|
|
+ MUL a3, b1, t3
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c10, 1 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c08, t4, c08
|
|
+ unop
|
|
+ MUL a4, b1, t4
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c11, 2 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b2, t1
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c12, 3 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b2, t2
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c13, 0 * SIZE(C2)
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c14, 1 * SIZE(C2)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ unop
|
|
+ MUL a4, b2, t4
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c15, 2 * SIZE(C2)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c05, t1, c05
|
|
+ unop
|
|
+ MUL alpha, c01, c01
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c16, 3 * SIZE(C2)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c06, t2, c06
|
|
+ ldi I, -1(I)
|
|
+ MUL alpha, c02, c02
|
|
+ unop
|
|
+
|
|
+ ADD c07, t3, c07
|
|
+ MUL alpha, c03, c03
|
|
+ ADD c08, t4, c08
|
|
+ MUL alpha, c04, c04
|
|
+
|
|
+ MUL alpha, c05, c05
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c01, c09, c01
|
|
+#endif
|
|
+ MUL alpha, c06, c06
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c02, c10, c02
|
|
+#endif
|
|
+
|
|
+ MUL alpha, c07, c07
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c03, c11, c03
|
|
+#endif
|
|
+ MUL alpha, c08, c08
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c04, c12, c04
|
|
+#endif
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c05, c13, c05
|
|
+#endif
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c06, c14, c06
|
|
+#endif
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c07, c15, c07
|
|
+#endif
|
|
+ ST c03, 2 * SIZE(C1)
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c08, c16, c08
|
|
+#endif
|
|
+ ST c04, 3 * SIZE(C1)
|
|
+
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+ fclr t1
|
|
+ ST c06, 1 * SIZE(C2)
|
|
+ fclr t2
|
|
+ ST c07, 2 * SIZE(C2)
|
|
+ fclr t3
|
|
+ ST c08, 3 * SIZE(C2)
|
|
+ fclr t4
|
|
+
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+ ldi C2, 4 * SIZE(C2)
|
|
+
|
|
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+ subl K, KK, TMP1
|
|
+#ifdef LEFT
|
|
+ subl TMP1, 4, TMP1
|
|
+#else
|
|
+ subl TMP1, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#if defined(TRMMKERNEL) && defined(LEFT)
|
|
+ addl KK, 4, KK
|
|
+#endif
|
|
+ bgt I, $L51
|
|
+ .align 4
|
|
+
|
|
+$L60:
|
|
+ and M, 2, I
|
|
+ ble I, $L70
|
|
+
|
|
+#if !defined(TRMMKERNEL) || \
|
|
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+
|
|
+#ifdef TRMMKERNEL
|
|
+#ifdef LEFT
|
|
+ addl KK, 2, TMP1
|
|
+#else
|
|
+ addl KK, 2, TMP1
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+#ifndef TRMMKERNEL
|
|
+ ldi L, -2(K)
|
|
+#else
|
|
+ ldi L, -2(TMP1)
|
|
+#endif
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+ ble L, $L65
|
|
+#else
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 1, TMP2
|
|
+ addl B, TMP2, BO
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ ldi L, -2(TMP1)
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ ble L, $L65
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L62:
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, c05
|
|
+ ldi L, -2(L)
|
|
+ MUL a1, b2, t3
|
|
+ LD a1, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, c06
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD a2, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ LD b2, 3 * SIZE(BO)
|
|
+ MUL a3, b3, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a4, b3, t2
|
|
+ LD b3, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, c05
|
|
+ unop
|
|
+ MUL a3, b4, t3
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, c06
|
|
+ MUL a4, b4, t4
|
|
+ LD b4, 1 * SIZE(BO)
|
|
+ unop
|
|
+
|
|
+ LD a4, 1 * SIZE(AO)
|
|
+ unop
|
|
+ unop
|
|
+ bgt L, $L62
|
|
+ .align 4
|
|
+
|
|
+$L65:
|
|
+ ADD c01, t1, c01
|
|
+ fldd alpha, ALPHA
|
|
+ MUL a1, b1, t1
|
|
+#ifndef TRMMKERNEL
|
|
+ blbs K, $L68
|
|
+#else
|
|
+ blbs TMP1, $L68
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, c05
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ MUL a1, b2, t3
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, c06
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ LD b2, -1 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ .align 4
|
|
+
|
|
+$L68:
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c09, 0 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c05, t3, c05
|
|
+ unop
|
|
+ MUL a1, b2, t3
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c10, 1 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c06, t4, c06
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c11, 0 * SIZE(C2)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL alpha, c01, c01
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c12, 1 * SIZE(C2)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ ldi C1, 2 * SIZE(C1)
|
|
+ MUL alpha, c02, c02
|
|
+ ldi C2, 2 * SIZE(C2)
|
|
+
|
|
+ ADD c05, t3, c05
|
|
+ MUL alpha, c05, c05
|
|
+ ADD c06, t4, c06
|
|
+ MUL alpha, c06, c06
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c01, c09, c01
|
|
+ ADD c02, c10, c02
|
|
+ ADD c05, c11, c05
|
|
+ ADD c06, c12, c06
|
|
+#endif
|
|
+
|
|
+ ST c01, -2 * SIZE(C1)
|
|
+ fclr t1
|
|
+ ST c02, -1 * SIZE(C1)
|
|
+ fclr t2
|
|
+ ST c05, -2 * SIZE(C2)
|
|
+ fclr t3
|
|
+ ST c06, -1 * SIZE(C2)
|
|
+ fclr t4
|
|
+
|
|
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+ subl K, KK, TMP1
|
|
+#ifdef LEFT
|
|
+ subl TMP1, 2, TMP1
|
|
+#else
|
|
+ subl TMP1, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#if defined(TRMMKERNEL) && defined(LEFT)
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L70:
|
|
+ and M, 1, I
|
|
+ ble I, $L79
|
|
+
|
|
+#if !defined(TRMMKERNEL) || \
|
|
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+
|
|
+#ifdef TRMMKERNEL
|
|
+#ifdef LEFT
|
|
+ addl KK, 1, TMP1
|
|
+#else
|
|
+ addl KK, 2, TMP1
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c06
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ldi L, -2(K)
|
|
+#else
|
|
+ ldi L, -2(TMP1)
|
|
+#endif
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+ ble L, $L75
|
|
+#else
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 1, TMP2
|
|
+ addl B, TMP2, BO
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c06
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ldi L, -2(K)
|
|
+#else
|
|
+ ldi L, -2(TMP1)
|
|
+#endif
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ ble L, $L75
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L72:
|
|
+ ADD c01, t1, c01
|
|
+ ldi L, -2(L)
|
|
+ MUL a1, b1, t1
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t2, c05
|
|
+ MUL a1, b2, t2
|
|
+ LD a1, 1 * SIZE(AO)
|
|
+ LD b2, 3 * SIZE(BO)
|
|
+
|
|
+ ADD c02, t3, c02
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a2, b3, t3
|
|
+ LD b3, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c06, t4, c06
|
|
+ MUL a2, b4, t4
|
|
+ LD a2, 0 * SIZE(AO)
|
|
+ LD b4, 5 * SIZE(BO)
|
|
+
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ unop
|
|
+ unop
|
|
+ bgt L, $L72
|
|
+ .align 4
|
|
+
|
|
+$L75:
|
|
+ ADD c01, t1, c01
|
|
+ fldd alpha, ALPHA
|
|
+ MUL a1, b1, t1
|
|
+#ifndef TRMMKERNEL
|
|
+ blbs K, $L78
|
|
+#else
|
|
+ blbs TMP1, $L78
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c05, t2, c05
|
|
+ MUL a1, b2, t2
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L78:
|
|
+ ADD c05, t2, c05
|
|
+ MUL a1, b2, t2
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a5, 0 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c02, t3, c02
|
|
+ ADD c06, t4, c06
|
|
+#ifndef TRMMKERNEL
|
|
+ LD b5, 0 * SIZE(C2)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c01, c02, c01
|
|
+ ADD c05, c06, c05
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ ADD c05, t2, c05
|
|
+
|
|
+ MUL alpha, c01, c01
|
|
+ MUL alpha, c05, c05
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c01, a5, c01
|
|
+ ADD c05, b5, c05
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+
|
|
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+ subl K, KK, TMP1
|
|
+#ifdef LEFT
|
|
+ subl TMP1, 1, TMP1
|
|
+#else
|
|
+ subl TMP1, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#if defined(TRMMKERNEL) && defined(LEFT)
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L79:
|
|
+ mov BO, B
|
|
+#if defined(TRMMKERNEL) && !defined(LEFT)
|
|
+ addl KK, 2, KK
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+ unop
|
|
+ unop
|
|
+ .align 4
|
|
+
|
|
+$L80:
|
|
+ and N, 1, J
|
|
+ ble J, $L999
|
|
+
|
|
+ mov C, C1
|
|
+ mov A, AO
|
|
+
|
|
+#if defined(TRMMKERNEL) && defined(LEFT)
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+ sra M, 2, I
|
|
+ ble I, $L100
|
|
+ .align 4
|
|
+
|
|
+$L91:
|
|
+#if !defined(TRMMKERNEL) || \
|
|
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+
|
|
+#ifdef TRMMKERNEL
|
|
+#ifdef LEFT
|
|
+ addl KK, 4, TMP1
|
|
+#else
|
|
+ addl KK, 1, TMP1
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c04
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ sra K, 2, L
|
|
+#else
|
|
+ sra TMP1, 2, L
|
|
+#endif
|
|
+ mov B, BO
|
|
+ unop
|
|
+ ble L, $L95
|
|
+#else
|
|
+ sll KK, BASE_SHIFT + 2, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 0, TMP2
|
|
+ addl B, TMP2, BO
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c04
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ sra K, 2, L
|
|
+#else
|
|
+ sra TMP1, 2, L
|
|
+#endif
|
|
+ unop
|
|
+ ble L, $L95
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L92:
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ ldi L, -1(L)
|
|
+ MUL a2, b1, t2
|
|
+ LD a2, 5 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ unop
|
|
+ MUL a3, b1, t3
|
|
+ LD a3, 6 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a4, b1, t4
|
|
+ LD a4, 7 * SIZE(AO)
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b2, t1
|
|
+ LD a1, 8 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b2, t2
|
|
+ LD a2, 9 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ LD a3, 10 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a4, b2, t4
|
|
+ LD a4, 11 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ LD a1, 12 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ LD a2, 13 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ unop
|
|
+ MUL a3, b3, t3
|
|
+ LD a3, 14 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a4, b3, t4
|
|
+ LD a5, 15 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ MUL a1, b4, t1
|
|
+ LD a1, 16 * SIZE(AO)
|
|
+ ldi AO, 16 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ MUL a2, b4, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ MUL a3, b4, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a5, b4, t4
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ bgt L, $L92
|
|
+ .align 4
|
|
+
|
|
+$L95:
|
|
+#ifndef TRMMKERNEL
|
|
+ and K, 3, L
|
|
+#else
|
|
+ and TMP1, 3, L
|
|
+#endif
|
|
+ fldd alpha, ALPHA
|
|
+ unop
|
|
+ ble L, $L98
|
|
+ .align 4
|
|
+
|
|
+$L96:
|
|
+ ADD c01, t1, c01
|
|
+ ldi L, -1(L)
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ ldi BO, 1 * SIZE(BO)
|
|
+ MUL a2, b1, t2
|
|
+ LD a2, 5 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ unop
|
|
+ MUL a3, b1, t3
|
|
+ LD a3, 6 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a4, b1, t4
|
|
+ LD a4, 7 * SIZE(AO)
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ bgt L, $L96
|
|
+ .align 4
|
|
+
|
|
+$L98:
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c01, t1, c01
|
|
+ LD c05, 0 * SIZE(C1)
|
|
+ ADD c02, t2, c02
|
|
+ LD c06, 1 * SIZE(C1)
|
|
+ ADD c03, t3, c03
|
|
+ LD c07, 2 * SIZE(C1)
|
|
+ ADD c04, t4, c04
|
|
+ LD c08, 3 * SIZE(C1)
|
|
+#else
|
|
+ ADD c01, t1, c01
|
|
+ ADD c02, t2, c02
|
|
+ ADD c03, t3, c03
|
|
+ ADD c04, t4, c04
|
|
+#endif
|
|
+
|
|
+ MUL alpha, c01, c01
|
|
+ MUL alpha, c02, c02
|
|
+ MUL alpha, c03, c03
|
|
+ MUL alpha, c04, c04
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c01, c05, c01
|
|
+ ADD c02, c06, c02
|
|
+ ADD c03, c07, c03
|
|
+ ADD c04, c08, c04
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c03, 2 * SIZE(C1)
|
|
+ ST c04, 3 * SIZE(C1)
|
|
+
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+
|
|
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+ subl K, KK, TMP1
|
|
+#ifdef LEFT
|
|
+ subl TMP1, 4, TMP1
|
|
+#else
|
|
+ subl TMP1, 1, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#if defined(TRMMKERNEL) && defined(LEFT)
|
|
+ addl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L91
|
|
+ .align 4
|
|
+
|
|
+$L100:
|
|
+ and M, 2, I
|
|
+ unop
|
|
+ unop
|
|
+ ble I, $L110
|
|
+ .align 4
|
|
+
|
|
+$L101:
|
|
+#if !defined(TRMMKERNEL) || \
|
|
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+
|
|
+#ifdef TRMMKERNEL
|
|
+#ifdef LEFT
|
|
+ addl KK, 2, TMP1
|
|
+#else
|
|
+ addl KK, 1, TMP1
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c04
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ sra K, 2, L
|
|
+#else
|
|
+ sra TMP1, 2, L
|
|
+#endif
|
|
+ mov B, BO
|
|
+ unop
|
|
+ ble L, $L105
|
|
+#else
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 0, TMP2
|
|
+ addl B, TMP2, BO
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c04
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ sra K, 2, L
|
|
+#else
|
|
+ sra TMP1, 2, L
|
|
+#endif
|
|
+ unop
|
|
+ ble L, $L105
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L102:
|
|
+ ADD c01, t1, c01
|
|
+ ldi L, -1(L)
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ MUL a2, b1, t2
|
|
+ LD a2, 5 * SIZE(AO)
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ MUL a3, b2, t3
|
|
+ LD a3, 6 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a4, b2, t4
|
|
+ LD a5, 7 * SIZE(AO)
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ MUL a1, b3, t1
|
|
+ LD a1, 8 * SIZE(AO)
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ MUL a2, b3, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ MUL a3, b4, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a5, b4, t4
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ bgt L, $L102
|
|
+ .align 4
|
|
+
|
|
+$L105:
|
|
+#ifndef TRMMKERNEL
|
|
+ and K, 3, L
|
|
+#else
|
|
+ and TMP1, 3, L
|
|
+#endif
|
|
+ fldd alpha, ALPHA
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a3, 0 * SIZE(C1)
|
|
+ LD a4, 1 * SIZE(C1)
|
|
+#endif
|
|
+ ble L, $L108
|
|
+ .align 4
|
|
+
|
|
+$L106:
|
|
+ ADD c01, t1, c01
|
|
+ ldi L, -1(L)
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ MUL a2, b1, t2
|
|
+ LD a2, 3 * SIZE(AO)
|
|
+ LD b1, 1 * SIZE(BO)
|
|
+
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ unop
|
|
+ ldi BO, 1 * SIZE(BO)
|
|
+ bgt L, $L106
|
|
+ .align 4
|
|
+
|
|
+$L108:
|
|
+ ADD c01, t1, c01
|
|
+ fclr t1
|
|
+ ADD c02, t2, c02
|
|
+ fclr t2
|
|
+ ADD c03, t3, c03
|
|
+ fclr t3
|
|
+ ADD c04, t4, c04
|
|
+ fclr t4
|
|
+
|
|
+ ADD c01, c03, c01
|
|
+ ADD c02, c04, c02
|
|
+
|
|
+ MUL alpha, c01, c01
|
|
+ MUL alpha, c02, c02
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c01, a3, c01
|
|
+ ADD c02, a4, c02
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ldi C1, 2 * SIZE(C1)
|
|
+
|
|
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+ subl K, KK, TMP1
|
|
+#ifdef LEFT
|
|
+ subl TMP1, 2, TMP1
|
|
+#else
|
|
+ subl TMP1, 1, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#if defined(TRMMKERNEL) && defined(LEFT)
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L110:
|
|
+ and M, 1, I
|
|
+ ble I, $L999
|
|
+ .align 4
|
|
+
|
|
+$L111:
|
|
+#if !defined(TRMMKERNEL) || \
|
|
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+
|
|
+#ifdef TRMMKERNEL
|
|
+#ifdef LEFT
|
|
+ addl KK, 1, TMP1
|
|
+#else
|
|
+ addl KK, 1, TMP1
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c04
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ sra K, 2, L
|
|
+#else
|
|
+ sra TMP1, 2, L
|
|
+#endif
|
|
+ mov B, BO
|
|
+ unop
|
|
+ ble L, $L115
|
|
+#else
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 0, TMP2
|
|
+ addl B, TMP2, BO
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c04
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ sra K, 2, L
|
|
+#else
|
|
+ sra TMP1, 2, L
|
|
+#endif
|
|
+ unop
|
|
+ ble L, $L115
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L112:
|
|
+ ADD c01, t1, c01
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ MUL a2, b2, t2
|
|
+ LD a2, 5 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ MUL a3, b3, t3
|
|
+ LD a3, 6 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a4, b4, t4
|
|
+ LD a4, 7 * SIZE(AO)
|
|
+ LD b4, 7 * SIZE(BO)
|
|
+
|
|
+ ldi L, -1(L)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ bgt L, $L112
|
|
+ .align 4
|
|
+
|
|
+$L115:
|
|
+#ifndef TRMMKERNEL
|
|
+ and K, 3, L
|
|
+#else
|
|
+ and TMP1, 3, L
|
|
+#endif
|
|
+ fldd alpha, ALPHA
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a2, 0 * SIZE(C1)
|
|
+#endif
|
|
+ ble L, $L118
|
|
+ .align 4
|
|
+
|
|
+$L116:
|
|
+ ADD c01, t1, c01
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, 1 * SIZE(AO)
|
|
+ LD b1, 1 * SIZE(BO)
|
|
+
|
|
+ ldi L, -1(L)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ ldi BO, 1 * SIZE(BO)
|
|
+ bgt L, $L116
|
|
+ .align 4
|
|
+
|
|
+$L118:
|
|
+ ADD c01, t1, c01
|
|
+ ADD c02, t2, c02
|
|
+ ADD c03, t3, c03
|
|
+ ADD c04, t4, c04
|
|
+
|
|
+ ADD c01, c02, c01
|
|
+ ADD c03, c04, c03
|
|
+ ADD c01, c03, c01
|
|
+
|
|
+ MUL alpha, c01, c01
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c01, a2, c01
|
|
+#endif
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 8($sp)
|
|
+ fldd $f4, 16($sp)
|
|
+ fldd $f5, 24($sp)
|
|
+ fldd $f6, 32($sp)
|
|
+ fldd $f7, 40($sp)
|
|
+ fldd $f8, 48($sp)
|
|
+ fldd $f9, 56($sp)
|
|
+ clr $0
|
|
+ ldi $sp, STACKSIZE($sp)
|
|
+ ret
|
|
+ EPILOGUE
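
The tail of every tile in the kernel above follows the same pattern: the K loop
accumulates partial products in registers, the accumulators are scaled by alpha,
and, unless TRMMKERNEL is defined, the existing C entries are loaded and added
before the store (TRMM overwrites its output rather than accumulating into it).
Below is a minimal C sketch of that per-tile epilogue, mirroring epilogue blocks
such as $L38, $L58 and $L68 above, shown for a 2-row by 4-column tile (the other
tile sizes differ only in loop bounds); the names acc, c_cols and trmmkernel are
illustrative stand-ins, not the macros used in the assembly.

    /* Sketch only: scale by alpha, optionally load-and-add C, then store,
       for one 2-row by 4-column tile addressed through column pointers. */
    void tile_epilogue_2x4(double alpha, double acc[4][2],
                           double *c_cols[4], int trmmkernel)
    {
        for (int j = 0; j < 4; j++) {          /* columns C1..C4               */
            for (int i = 0; i < 2; i++) {      /* rows within the tile         */
                double v = alpha * acc[j][i];  /* MUL alpha, cXX, cXX          */
                if (!trmmkernel)
                    v += c_cols[j][i];         /* ADD cXX, <value loaded from C> */
                c_cols[j][i] = v;              /* ST  cXX, i * SIZE(Cj)        */
            }
        }
    }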
|
|
diff --git a/kernel/sw_64/gemm_kernel_simd_16x4.S b/kernel/sw_64/gemm_kernel_simd_16x4.S
|
|
new file mode 100644
|
|
index 0000000..1acf679
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/gemm_kernel_simd_16x4.S
|
|
@@ -0,0 +1,4054 @@
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#if !defined(SW2B)
|
|
+#error "Architecture is not specified."
|
|
+#endif
|
|
+
|
|
+
|
|
+#define STACKSIZE 336
|
|
+
|
|
+#define CO $1
|
|
+#define C1 $2
|
|
+#define C2 $3
|
|
+#define C3 $4
|
|
+
|
|
+#define LDM $5
|
|
+
|
|
+#define PREB $7
|
|
+#define SPANA $8
|
|
+#define SPANB $9
|
|
+#define NC1 $10
|
|
+#define KC1 $11
|
|
+#define MC1 $12
|
|
+#define PREA $13
|
|
+
|
|
+#define A $20
|
|
+#define B $21
|
|
+#define C $19
|
|
+#define MC $16
|
|
+#define NC $17
|
|
+#define KC $18
|
|
+
|
|
+#define A1 $22
|
|
+#define B1 $23
|
|
+
|
|
+#define ALPHA $f8
|
|
+
|
|
+#define a0 $f0
|
|
+#define a4 $f1
|
|
+#define a8 $f2
|
|
+#define a12 $f3
|
|
+
|
|
+#define b0 $f4
|
|
+#define b1 $f5
|
|
+#define b2 $f6
|
|
+#define b3 $f7
|
|
+
|
|
+#define na0 $f0
|
|
+#define na4 $f8
|
|
+#define na8 $f9
|
|
+#define na12 $f10
|
|
+
|
|
+#define nb0 $f11
|
|
+#define nb1 $f12
|
|
+#define nb2 $f13
|
|
+#define nb3 $f14
|
|
+
|
|
+#define t00 $f15
|
|
+#define t01 $f16
|
|
+#define t02 $f17
|
|
+#define t03 $f18
|
|
+#define t04 $f19
|
|
+#define t05 $f20
|
|
+#define t06 $f21
|
|
+#define t07 $f22
|
|
+#define t08 $f23
|
|
+#define t09 $f24
|
|
+#define t10 $f25
|
|
+#define t11 $f26
|
|
+#define t12 $f27
|
|
+#define t13 $f28
|
|
+#define t14 $f29
|
|
+#define t15 $f30
|
|
+
|
|
+#define c00 $f1
|
|
+#define c01 $f2
|
|
+#define c02 $f3
|
|
+#define c03 $f4
|
|
+
|
|
+#define c04 $f5
|
|
+#define c05 $f6
|
|
+#define c06 $f7
|
|
+#define c07 $f9
|
|
+
|
|
+#define c08 $f10
|
|
+#define c09 $f11
|
|
+#define c10 $f12
|
|
+#define c11 $f13
|
|
+
|
|
+#define c12 $f1
|
|
+#define c13 $f2
|
|
+#define c14 $f3
|
|
+#define c15 $f4
|
|
+
|
|
+#if defined(TRMMKERNEL)
|
|
+#define TEMP $14
|
|
+#define KK $24
|
|
+#define OFFSET $25
|
|
+#endif
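+
+# Register roles in this kernel: MC/NC/KC arrive holding m, n and k and are
+# backed up into MC1/NC1/KC1 before being reused as loop counters; A, B and C
+# are the matrix pointers (C and LDM are read from the stack); CO and C1-C3
+# walk four consecutive columns of C; t00-t15 accumulate the 16x4 result tile
+# and c00-c15 stage values loaded from C; PREA/PREB and SPANA/SPANB drive the
+# software prefetch (fillcs) of A and B.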
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+
|
|
+.frame $30,STACKSIZE,$26,0
|
|
+ldi $sp,-STACKSIZE($sp) # allocate the stack frame
|
|
+
|
|
+ stl $9,328($sp) # Integer Saved Register
|
|
+ stl $10,320($sp)
|
|
+ stl $11,312($sp)
|
|
+ stl $12,304($sp)
|
|
+ stl $13,296($sp)
|
|
+ stl $14,288($sp)
|
|
+
|
|
+
|
|
+ ST $f2,280($sp) # Float Saved Register
|
|
+ ST $f3,272($sp)
|
|
+ ST $f4,264($sp)
|
|
+ ST $f5,256($sp)
|
|
+ ST $f6,248($sp)
|
|
+ ST $f7,240($sp)
|
|
+ ST $f8,232($sp)
|
|
+ ST $f9,224($sp)
|
|
+
|
|
+
|
|
+
|
|
+ .align 5
|
|
+
|
|
+$Begin_NC_Unroll4:
|
|
+ ldl C, 0 + STACKSIZE($sp) # load C
|
|
+ ldl LDM, 8 + STACKSIZE($sp) # load ldm
|
|
+
|
|
+#ifdef TRMMKERNEL
|
|
+ ldl OFFSET, 16 + STACKSIZE($sp) # load offset
|
|
+ nop
|
|
+#endif
|
|
+
|
|
+ ST $f19, 192($sp) # store alpha
|
|
+ SXADDQ LDM, 0, LDM # ldm*X+0
|
|
+
|
|
+ mov NC, NC1 # backup nc
|
|
+ mov KC, KC1 # backup kc
|
|
+ mov MC, MC1 # backup mc
|
|
+
|
|
+ mov B, B1 # backup the initial address of b
|
|
+ sra NC1,2,NC # NC=NC1/4 Unroll N 4
|
|
+
|
|
+#if defined(TRMMKERNEL) && !defined(LEFT)
|
|
+ subl $31, OFFSET, KK # KK = -OFFSET when the triangular matrix is on the right
|
|
+ nop
|
|
+#endif
|
|
+
|
|
+ mov A, A1 # backup the initial address of a
|
|
+ sll KC1,1+BASE_SHIFT,SPANB # kc*2nr
|
|
+
|
|
+ sll KC1,4+BASE_SHIFT,SPANA # kc*16mr
|
|
+ beq NC,$Begin_NC_Unroll2
|
|
+
|
|
+
|
|
+ .align 5
|
|
+
|
|
+.L0:
|
|
+ sra MC1,4,MC # MC=MC1/16
|
|
+ mov C, CO # compute c pointer
|
|
+
|
|
+ addl B1,SPANB,PREB # prefetch B
|
|
+ addl A1,SPANA,PREA # prefetch A
|
|
+
|
|
+ addl C, LDM, C1
|
|
+ addl C1,LDM, C2
|
|
+
|
|
+#if defined(TRMMKERNEL) && defined(LEFT)
|
|
+ mov OFFSET,KK # Reset the left offset
|
|
+ nop
|
|
+#endif
|
|
+
|
|
+ subl PREA,16*SIZE,PREA # prea=kc1*mc-mc
|
|
+ addl C2,LDM, C3
|
|
+
|
|
+ s4addl LDM,C,C # C=ldm*4+C
|
|
+ beq MC,.L15 # MC=0:MC1<16
|
|
+
|
|
+
|
|
+ .align 5 # nr=4,mr=4-----------------------------
|
|
+
|
|
+#if defined(TRMMKERNEL)
|
|
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
|
+ mov B1, B # LL && RU reset B
|
|
+ nop
|
|
+#else
|
|
+ sll KK, 4 + BASE_SHIFT, KC # KK*16
|
|
+ sll KK, 2 + BASE_SHIFT, TEMP # KK*4
|
|
+
|
|
+ addl A, KC, A # move A to point to the data part
|
|
+ addl B1,TEMP,B # move B to point to the data part
|
|
+#endif
|
|
+
|
|
+ vcpys $f31,$f31,t00 # CLEAR Results Register
|
|
+ fillcs 0(CO) # prefetch C
|
|
+ fillcs 0(C1)
|
|
+
|
|
+ vcpys $f31,$f31,t01 # 64 results
|
|
+ fillcs 0(C2)
|
|
+ fillcs 0(C3)
|
|
+
|
|
+ vcpys $f31,$f31,t02
|
|
+ LDDE b0,0*SIZE(B)
|
|
+ LDDE b1,1*SIZE(B)
|
|
+
|
|
+ vcpys $f31,$f31,t03
|
|
+ LDDE b2,2*SIZE(B)
|
|
+ LDDE b3,3*SIZE(B)
|
|
+
|
|
+ vcpys $f31,$f31,t04
|
|
+ fillcs 4(CO) # prefetch C
|
|
+ fillcs 4(C1)
|
|
+
|
|
+ vcpys $f31,$f31,t05
|
|
+ fillcs 4(C2)
|
|
+ fillcs 4(C3)
|
|
+
|
|
+ vcpys $f31,$f31,t06
|
|
+ VLD a0, 0*SIZE(A)
|
|
+ VLD a4, 4*SIZE(A)
|
|
+
|
|
+ vcpys $f31,$f31,t07
|
|
+ VLD a8, 8*SIZE(A)
|
|
+ VLD a12,12*SIZE(A)
|
|
+
|
|
+ vcpys $f31,$f31,t08
|
|
+ fillcs 8*SIZE(CO)
|
|
+ fillcs 8*SIZE(C1)
|
|
+
|
|
+ vcpys $f31,$f31,t09
|
|
+ fillcs 8*SIZE(C2)
|
|
+ fillcs 8*SIZE(C3)
|
|
+
|
|
+ vcpys $f31,$f31,t10
|
|
+ fillcs 12*SIZE(CO)
|
|
+ fillcs 12*SIZE(C1)
|
|
+
|
|
+ vcpys $f31,$f31,t11
|
|
+ fillcs 12*SIZE(C2)
|
|
+ fillcs 12*SIZE(C3)
|
|
+
|
|
+ vcpys $f31,$f31,t12
|
|
+ vcpys $f31,$f31,t13
|
|
+ vcpys $f31,$f31,t14
|
|
+ vcpys $f31,$f31,t15
|
|
+
|
|
+
|
|
+#if (defined(LEFT) && !defined(TRANSA)) \
|
|
+ ||(!defined(LEFT) && defined(TRANSA))
|
|
+ subl KC1, KK, TEMP # TEMP is the length of the data part
|
|
+#elif defined(LEFT)
|
|
+ addl KK, 16, TEMP # mr=16
|
|
+#else
|
|
+ addl KK, 4, TEMP # right nr=4
|
|
+#endif
|
|
+ sra TEMP, 1, KC # KC=TEMP/2
|
|
+
|
|
+ nop
|
|
+ beq KC, $Rest_16x4x1
|
|
+
|
|
+#else
|
|
+
|
|
+ vcpys $f31,$f31,t00 # CLEAR Results Register
|
|
+ mov B1,B # Reset B
|
|
+ sra KC1,1,KC # Unroll Kr=2, KC=KC1/2
|
|
+
|
|
+ vcpys $f31,$f31,t01 # 64 results
|
|
+ fillcs 0(CO) # prefetch C
|
|
+ fillcs 0(C1)
|
|
+
|
|
+ vcpys $f31,$f31,t02
|
|
+ fillcs 0(C2)
|
|
+ fillcs 0(C3)
|
|
+
|
|
+ vcpys $f31,$f31,t03
|
|
+ LDDE b0,0*SIZE(B)
|
|
+ LDDE b1,1*SIZE(B)
|
|
+
|
|
+ vcpys $f31,$f31,t04
|
|
+ LDDE b2,2*SIZE(B)
|
|
+ LDDE b3,3*SIZE(B)
|
|
+
|
|
+ vcpys $f31,$f31,t05
|
|
+ fillcs 4(CO) # prefetch C
|
|
+ fillcs 4(C1)
|
|
+
|
|
+ vcpys $f31,$f31,t06
|
|
+ fillcs 4(C2)
|
|
+ fillcs 4(C3)
|
|
+
|
|
+ vcpys $f31,$f31,t07
|
|
+ VLD a0, 0*SIZE(A)
|
|
+ VLD a4, 4*SIZE(A)
|
|
+
|
|
+ vcpys $f31,$f31,t08
|
|
+ VLD a8, 8*SIZE(A)
|
|
+ VLD a12,12*SIZE(A)
|
|
+
|
|
+ vcpys $f31,$f31,t09
|
|
+ fillcs 8(CO) # prefetch C
|
|
+ fillcs 8(C1)
|
|
+
|
|
+ vcpys $f31,$f31,t10
|
|
+ fillcs 8(C2)
|
|
+ fillcs 8(C3)
|
|
+
|
|
+ vcpys $f31,$f31,t11
|
|
+ fillcs 12*SIZE(CO)
|
|
+ fillcs 12*SIZE(C1)
|
|
+
|
|
+ vcpys $f31,$f31,t12
|
|
+ fillcs 12*SIZE(C2)
|
|
+ fillcs 12*SIZE(C3)
|
|
+
|
|
+ vcpys $f31,$f31,t13
|
|
+ vcpys $f31,$f31,t14
|
|
+
|
|
+ vcpys $f31,$f31,t15
|
|
+ beq KC,$Rest_16x4x1 # KC1<2 goto $Rest_16x4x1
|
|
+
|
|
+#endif
|
|
+
|
|
+ .align 5
|
|
+
|
|
+$Panel_16x4x2: # nr=4,mr=4,kr=2------------------------
|
|
+
|
|
+ VMAD a0,b0,t00,t00
|
|
+ addl A,16*SIZE,A # 16a*1k
|
|
+ LDDE nb0,4*SIZE(B) # get next 4b
|
|
+
|
|
+ VMAD a0,b1,t04,t04
|
|
+ LDDE nb1,5*SIZE(B)
|
|
+
|
|
+ VMAD a4,b0,t01,t01
|
|
+ VLD na12,12*SIZE(A)
|
|
+
|
|
+ VMAD a4,b1,t05,t05
|
|
+ VLD na8,8*SIZE(A)
|
|
+
|
|
+ VMAD a0,b2,t08,t08
|
|
+ LDDE nb2,6*SIZE(B)
|
|
+
|
|
+ VMAD a0,b3,t12,t12
|
|
+ LDDE nb3,7*SIZE(B)
|
|
+
|
|
+ VMAD a8,b0,t02,t02
|
|
+ VMAD a8,b1,t06,t06
|
|
+
|
|
+ VMAD a4,b2,t09,t09
|
|
+ addl B,8*SIZE,B # 4b*2k
|
|
+ VLD na0,0*SIZE(A) # careful: na0 and a0 share the same register
|
|
+
|
|
+ VMAD a4,b3,t13,t13
|
|
+ VLD na4,4*SIZE(A) # get next 16a
|
|
+
|
|
+ VMAD a12,b0,t03,t03
|
|
+ VMAD a12,b1,t07,t07
|
|
+
|
|
+ VMAD a8,b2,t10,t10
|
|
+ fillcs 0(PREB)
|
|
+
|
|
+ VMAD a8,b3,t14,t14
|
|
+ fillcs 0(PREA)
|
|
+
|
|
+ VMAD a12,b2,t11,t11
|
|
+ fillcs 8*SIZE(PREA)
|
|
+
|
|
+ VMAD a12,b3,t15,t15
|
|
+ subl KC,1,KC # loop k --
|
|
+
|
|
+
|
|
+ VMAD na12,nb0,t03,t03
|
|
+ addl A,16*SIZE,A # ### next k ###
|
|
+ LDDE b0,0(B) # get 3rd 4b
|
|
+
|
|
+ VMAD na12,nb1,t07,t07
|
|
+ LDDE b1,1*SIZE(B)
|
|
+
|
|
+ VMAD na8,nb0,t02,t02
|
|
+ VLD a12,12*SIZE(A)
|
|
+
|
|
+ VMAD na8,nb1,t06,t06
|
|
+ VLD a8,8*SIZE(A)
|
|
+
|
|
+ VMAD na0,nb0,t00,t00
|
|
+ subl PREA,16*SIZE,PREA # prea-=16
|
|
+ LDDE b2,2*SIZE(B)
|
|
+
|
|
+ VMAD na0,nb1,t04,t04
|
|
+ LDDE b3,3*SIZE(B)
|
|
+
|
|
+ VMAD na12,nb2,t11,t11
|
|
+ VMAD na12,nb3,t15,t15
|
|
+ VMAD na8,nb2,t10,t10
|
|
+ VMAD na8,nb3,t14,t14
|
|
+
|
|
+ VMAD na0,nb2,t08,t08
|
|
+ fillcs 0(PREA)
|
|
+
|
|
+ VMAD na0,nb3,t12,t12
|
|
+ fillcs 4*SIZE(PREB)
|
|
+
|
|
+ VMAD na4,nb0,t01,t01
|
|
+ VLD a0,0(A) # get 3rd 16a
|
|
+
|
|
+ VMAD na4,nb1,t05,t05
|
|
+ VLD a4,4*SIZE(A)
|
|
+
|
|
+ VMAD na4,nb2,t09,t09
|
|
+ fillcs 8*SIZE(PREA)
|
|
+ addl PREB,8*SIZE,PREB # preb+=8
|
|
+
|
|
+ VMAD na4,nb3,t13,t13
|
|
+ subl PREA,16*SIZE,PREA # prea-=16
|
|
+ bne KC,$Panel_16x4x2
|
|
+
|
|
+
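+# The loop above unrolls K by two and double-buffers its operands: the first
+# half multiplies a0-a12 by b0-b3 while loading the next 4 B values into
+# nb0-nb3 and the next 16 A values into na0-na12; the second half consumes
+# na*/nb* while reloading a*/b* for the following iteration, so loads,
+# prefetches (fillcs) and VMADs overlap.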
|
|
+$Rest_16x4x1:
|
|
+ LDDE ALPHA, 192($sp) # get alpha
|
|
+#ifndef TRMMKERNEL
|
|
+ blbc KC1, $Write_16x4
|
|
+#else
|
|
+ blbc TEMP,$Write_16x4
|
|
+#endif
|
|
+
|
|
+ VMAD a0,b0,t00,t00
|
|
+ addl A,16*SIZE,A # 16a*1k
|
|
+
|
|
+ VMAD a0,b1,t04,t04
|
|
+ addl B,4*SIZE,B # 4b*1k
|
|
+
|
|
+ VMAD a0,b2,t08,t08
|
|
+ VMAD a0,b3,t12,t12
|
|
+
|
|
+
|
|
+ VMAD a4,b0,t01,t01
|
|
+ VMAD a4,b1,t05,t05
|
|
+ VMAD a4,b2,t09,t09
|
|
+ VMAD a4,b3,t13,t13
|
|
+
|
|
+ VMAD a8,b0,t02,t02
|
|
+ VMAD a8,b1,t06,t06
|
|
+ VMAD a8,b2,t10,t10
|
|
+ VMAD a8,b3,t14,t14
|
|
+
|
|
+ VMAD a12,b0,t03,t03
|
|
+ VMAD a12,b1,t07,t07
|
|
+ VMAD a12,b2,t11,t11
|
|
+ VMAD a12,b3,t15,t15
|
|
+
|
|
+
|
|
+ .align 5
|
|
+
|
|
+$Write_16x4:
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ and CO, (VEC_LEN*SIZE-1), $6 ### gemm part ####
|
|
+ bne $6, $UnAlign_CO_Access_16x4
|
|
+
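+# The and/bne above tests whether CO is aligned to a full vector
+# (VEC_LEN*SIZE bytes): the aligned path uses plain VLD/VST on C, while the
+# unaligned path pairs VLD_UL/VLD_UH (and VST_UL/VST_UH) and merges the two
+# halves with vbisw before applying the alpha update.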
|
|
+$Align_CO_Access_16x4:
|
|
+ VLD c00,0(CO)
|
|
+ VLD c01,4*SIZE(CO)
|
|
+ VLD c02,8*SIZE(CO)
|
|
+ VLD c03,12*SIZE(CO)
|
|
+
|
|
+ VMAD t00,ALPHA,c00,t00
|
|
+ VMAD t01,ALPHA,c01,t01
|
|
+ VMAD t02,ALPHA,c02,t02
|
|
+ VMAD t03,ALPHA,c03,t03
|
|
+
|
|
+ VST t00,0(CO)
|
|
+ VST t01,4*SIZE(CO)
|
|
+ VST t02,8*SIZE(CO)
|
|
+ VST t03,12*SIZE(CO)
|
|
+ jmp $Access_C1_16x4
|
|
+
|
|
+$UnAlign_CO_Access_16x4:
|
|
+ VLD_UL c00, 0*VEC_LEN*SIZE(CO)
|
|
+ VLD_UH c04, 1*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ VLD_UL c01, 1*VEC_LEN*SIZE(CO)
|
|
+ VLD_UH c05, 2*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ vbisw c00,c04,c00
|
|
+ VLD_UL c02, 2*VEC_LEN*SIZE(CO)
|
|
+ VLD_UH c06, 3*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ vbisw c01,c05,c01
|
|
+ VLD_UL c03, 3*VEC_LEN*SIZE(CO)
|
|
+ VLD_UH c07, 4*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ vbisw c02,c06,c02
|
|
+ vbisw c03,c07,c03
|
|
+
|
|
+ VMAD t00,ALPHA,c00,t00
|
|
+ VMAD t01,ALPHA,c01,t01
|
|
+
|
|
+ VMAD t02,ALPHA,c02,t02
|
|
+ VST_UL t00, 0*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t00, 1*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ VMAD t03,ALPHA,c03,t03
|
|
+ VST_UL t01, 1*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t01, 2*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ VST_UL t02, 2*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t02, 3*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ VST_UL t03, 3*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t03, 4*VEC_LEN*SIZE(CO)
|
|
+
|
|
+
|
|
+$Access_C1_16x4:
|
|
+ and C1, (VEC_LEN*SIZE-1),$6
|
|
+ bne $6,$UnAlign_C1_Access_16x4
|
|
+
|
|
+$Align_C1_Access_16x4:
|
|
+ VLD c04,0(C1)
|
|
+ VLD c05,4*SIZE(C1)
|
|
+ VLD c06,8*SIZE(C1)
|
|
+ VLD c07,12*SIZE(C1)
|
|
+
|
|
+ VMAD t04,ALPHA,c04,t04
|
|
+ VMAD t05,ALPHA,c05,t05
|
|
+ VMAD t06,ALPHA,c06,t06
|
|
+ VMAD t07,ALPHA,c07,t07
|
|
+
|
|
+ VST t04,0(C1)
|
|
+ VST t05,4*SIZE(C1)
|
|
+ VST t06,8*SIZE(C1)
|
|
+ VST t07,12*SIZE(C1)
|
|
+ jmp $Access_C2_16x4
|
|
+
|
|
+$UnAlign_C1_Access_16x4:
|
|
+ VLD_UL c04, 0*VEC_LEN*SIZE(C1)
|
|
+ VLD_UH t00, 1*VEC_LEN*SIZE(C1)
|
|
+
|
|
+ VLD_UL c05, 1*VEC_LEN*SIZE(C1)
|
|
+ VLD_UH t01, 2*VEC_LEN*SIZE(C1)
|
|
+
|
|
+ vbisw c04,t00,c04
|
|
+ VLD_UL c06, 2*VEC_LEN*SIZE(C1)
|
|
+ VLD_UH t02, 3*VEC_LEN*SIZE(C1)
|
|
+
|
|
+ vbisw c05,t01,c05
|
|
+ VLD_UL c07, 3*VEC_LEN*SIZE(C1)
|
|
+ VLD_UH t03, 4*VEC_LEN*SIZE(C1)
|
|
+
|
|
+ vbisw c06,t02,c06
|
|
+ vbisw c07,t03,c07
|
|
+
|
|
+ VMAD t04,ALPHA,c04,t04
|
|
+ VMAD t05,ALPHA,c05,t05
|
|
+
|
|
+ VMAD t06,ALPHA,c06,t06
|
|
+ VST_UL t04, 0*VEC_LEN*SIZE(C1)
|
|
+ VST_UH t04, 1*VEC_LEN*SIZE(C1)
|
|
+
|
|
+ VMAD t07,ALPHA,c07,t07
|
|
+ VST_UL t05, 1*VEC_LEN*SIZE(C1)
|
|
+ VST_UH t05, 2*VEC_LEN*SIZE(C1)
|
|
+
|
|
+ VST_UL t06, 2*VEC_LEN*SIZE(C1)
|
|
+ VST_UH t06, 3*VEC_LEN*SIZE(C1)
|
|
+
|
|
+ VST_UL t07, 3*VEC_LEN*SIZE(C1)
|
|
+ VST_UH t07, 4*VEC_LEN*SIZE(C1)
|
|
+
|
|
+
|
|
+$Access_C2_16x4:
|
|
+ and C2, (VEC_LEN*SIZE-1),$6
|
|
+ bne $6,$UnAlign_C2_Access_16x4
|
|
+
|
|
+$Align_C2_Access_16x4:
|
|
+ VLD c08,0(C2)
|
|
+ VLD c09,4*SIZE(C2)
|
|
+ VLD c10,8*SIZE(C2)
|
|
+ VLD c11,12*SIZE(C2)
|
|
+
|
|
+ VMAD t08,ALPHA,c08,t08
|
|
+ VMAD t09,ALPHA,c09,t09
|
|
+ VMAD t10,ALPHA,c10,t10
|
|
+ VMAD t11,ALPHA,c11,t11
|
|
+
|
|
+ VST t08,0(C2)
|
|
+ VST t09,4*SIZE(C2)
|
|
+ VST t10,8*SIZE(C2)
|
|
+ VST t11,12*SIZE(C2)
|
|
+ jmp $Access_C3_16x4
|
|
+
|
|
+$UnAlign_C2_Access_16x4:
|
|
+ VLD_UL c08, 0*VEC_LEN*SIZE(C2)
|
|
+ VLD_UH t00, 1*VEC_LEN*SIZE(C2)
|
|
+
|
|
+ VLD_UL c09, 1*VEC_LEN*SIZE(C2)
|
|
+ VLD_UH t01, 2*VEC_LEN*SIZE(C2)
|
|
+
|
|
+ vbisw c08,t00,c08
|
|
+ VLD_UL c10, 2*VEC_LEN*SIZE(C2)
|
|
+ VLD_UH t02, 3*VEC_LEN*SIZE(C2)
|
|
+
|
|
+ vbisw c09,t01,c09
|
|
+ VLD_UL c11, 3*VEC_LEN*SIZE(C2)
|
|
+ VLD_UH t03, 4*VEC_LEN*SIZE(C2)
|
|
+
|
|
+ vbisw c10,t02,c10
|
|
+ vbisw c11,t03,c11
|
|
+
|
|
+ VMAD t08,ALPHA,c08,t08
|
|
+ VMAD t09,ALPHA,c09,t09
|
|
+
|
|
+ VMAD t10,ALPHA,c10,t10
|
|
+ VST_UL t08, 0*VEC_LEN*SIZE(C2)
|
|
+ VST_UH t08, 1*VEC_LEN*SIZE(C2)
|
|
+
|
|
+ VMAD t11,ALPHA,c11,t11
|
|
+ VST_UL t09, 1*VEC_LEN*SIZE(C2)
|
|
+ VST_UH t09, 2*VEC_LEN*SIZE(C2)
|
|
+
|
|
+ VST_UL t10, 2*VEC_LEN*SIZE(C2)
|
|
+ VST_UH t10, 3*VEC_LEN*SIZE(C2)
|
|
+
|
|
+ VST_UL t11, 3*VEC_LEN*SIZE(C2)
|
|
+ VST_UH t11, 4*VEC_LEN*SIZE(C2)
|
|
+
|
|
+
|
|
+$Access_C3_16x4:
|
|
+ and C3, (VEC_LEN*SIZE-1),$6
|
|
+ bne $6,$UnAlign_C3_Access_16x4
|
|
+
|
|
+$Align_C3_Access_16x4:
|
|
+ VLD c12,0(C3)
|
|
+ VLD c13,4*SIZE(C3)
|
|
+ VLD c14,8*SIZE(C3)
|
|
+ VLD c15,12*SIZE(C3)
|
|
+
|
|
+ VMAD t12,ALPHA,c12,t12
|
|
+ VMAD t13,ALPHA,c13,t13
|
|
+ VMAD t14,ALPHA,c14,t14
|
|
+ VMAD t15,ALPHA,c15,t15
|
|
+
|
|
+ VST t12,0(C3)
|
|
+ VST t13,4*SIZE(C3)
|
|
+ VST t14,8*SIZE(C3)
|
|
+ VST t15,12*SIZE(C3)
|
|
+ jmp $End_NC_Unroll4
|
|
+
|
|
+$UnAlign_C3_Access_16x4:
|
|
+ VLD_UL c12, 0*VEC_LEN*SIZE(C3)
|
|
+ VLD_UH t04, 1*VEC_LEN*SIZE(C3)
|
|
+
|
|
+ VLD_UL c13, 1*VEC_LEN*SIZE(C3)
|
|
+ VLD_UH t05, 2*VEC_LEN*SIZE(C3)
|
|
+
|
|
+ vbisw c12,t04,c12
|
|
+ VLD_UL c14, 2*VEC_LEN*SIZE(C3)
|
|
+ VLD_UH t06, 3*VEC_LEN*SIZE(C3)
|
|
+
|
|
+ vbisw c13,t05,c13
|
|
+ VLD_UL c15, 3*VEC_LEN*SIZE(C3)
|
|
+ VLD_UH t07, 4*VEC_LEN*SIZE(C3)
|
|
+
|
|
+ vbisw c14,t06,c14
|
|
+ vbisw c15,t07,c15
|
|
+
|
|
+ VMAD t12,ALPHA,c12,t12
|
|
+ VMAD t13,ALPHA,c13,t13
|
|
+
|
|
+ VMAD t14,ALPHA,c14,t14
|
|
+ VST_UL t12, 0*VEC_LEN*SIZE(C3)
|
|
+ VST_UH t12, 1*VEC_LEN*SIZE(C3)
|
|
+
|
|
+ VMAD t15,ALPHA,c15,t15
|
|
+ VST_UL t13, 1*VEC_LEN*SIZE(C3)
|
|
+ VST_UH t13, 2*VEC_LEN*SIZE(C3)
|
|
+
|
|
+ VST_UL t14, 2*VEC_LEN*SIZE(C3)
|
|
+ VST_UH t14, 3*VEC_LEN*SIZE(C3)
|
|
+
|
|
+ VST_UL t15, 3*VEC_LEN*SIZE(C3)
|
|
+ VST_UH t15, 4*VEC_LEN*SIZE(C3)
|
|
+ jmp $End_NC_Unroll4
|
|
+
|
|
+#else
|
|
+ and CO, (VEC_LEN*SIZE-1),$6 ### trmm part ###
|
|
+ bne $6,$UnAlign_CO_Access_16x4
|
|
+
|
|
+$Align_CO_Access_16x4:
|
|
+ VMUL t00,ALPHA,t00
|
|
+ VMUL t01,ALPHA,t01
|
|
+ VMUL t02,ALPHA,t02
|
|
+ VMUL t03,ALPHA,t03
|
|
+
|
|
+ VST t00,0(CO)
|
|
+ VST t01,4*SIZE(CO)
|
|
+ VST t02,8*SIZE(CO)
|
|
+ VST t03,12*SIZE(CO)
|
|
+ jmp $Access_C1_16x4
|
|
+
|
|
+$UnAlign_CO_Access_16x4:
|
|
+ VMUL t00,ALPHA,t00
|
|
+ VMUL t01,ALPHA,t01
|
|
+
|
|
+ VMUL t02,ALPHA,t02
|
|
+ VST_UL t00, 0*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t00, 1*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ VMUL t03,ALPHA,t03
|
|
+ VST_UL t01, 1*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t01, 2*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ VST_UL t02, 2*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t02, 3*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ VST_UL t03, 3*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t03, 4*VEC_LEN*SIZE(CO)
|
|
+
|
|
+
|
|
+$Access_C1_16x4:
|
|
+ and C1, (VEC_LEN*SIZE-1),$6
|
|
+ bne $6,$UnAlign_C1_Access_16x4
|
|
+
|
|
+$Align_C1_Access_16x4:
|
|
+ VMUL t04,ALPHA,t04
|
|
+ VMUL t05,ALPHA,t05
|
|
+ VMUL t06,ALPHA,t06
|
|
+ VMUL t07,ALPHA,t07
|
|
+
|
|
+ VST t04,0(C1)
|
|
+ VST t05,4*SIZE(C1)
|
|
+ VST t06,8*SIZE(C1)
|
|
+ VST t07,12*SIZE(C1)
|
|
+ jmp $Access_C2_16x4
|
|
+
|
|
+$UnAlign_C1_Access_16x4:
|
|
+ VMUL t04,ALPHA,t04
|
|
+ VMUL t05,ALPHA,t05
|
|
+
|
|
+ VMUL t06,ALPHA,t06
|
|
+ VST_UL t04, 0*VEC_LEN*SIZE(C1)
|
|
+ VST_UH t04, 1*VEC_LEN*SIZE(C1)
|
|
+
|
|
+ VMUL t07,ALPHA,t07
|
|
+ VST_UL t05, 1*VEC_LEN*SIZE(C1)
|
|
+ VST_UH t05, 2*VEC_LEN*SIZE(C1)
|
|
+
|
|
+ VST_UL t06, 2*VEC_LEN*SIZE(C1)
|
|
+ VST_UH t06, 3*VEC_LEN*SIZE(C1)
|
|
+
|
|
+ VST_UL t07, 3*VEC_LEN*SIZE(C1)
|
|
+ VST_UH t07, 4*VEC_LEN*SIZE(C1)
|
|
+
|
|
+
|
|
+$Access_C2_16x4:
|
|
+ and C2, (VEC_LEN*SIZE-1),$6
|
|
+ bne $6,$UnAlign_C2_Access_16x4
|
|
+
|
|
+$Align_C2_Access_16x4:
|
|
+ VMUL t08,ALPHA,t08
|
|
+ VMUL t09,ALPHA,t09
|
|
+ VMUL t10,ALPHA,t10
|
|
+ VMUL t11,ALPHA,t11
|
|
+
|
|
+ VST t08,0(C2)
|
|
+ VST t09,4*SIZE(C2)
|
|
+ VST t10,8*SIZE(C2)
|
|
+ VST t11,12*SIZE(C2)
|
|
+ jmp $Access_C3_16x4
|
|
+
|
|
+$UnAlign_C2_Access_16x4:
|
|
+ VMUL t08,ALPHA,t08
|
|
+ VMUL t09,ALPHA,t09
|
|
+
|
|
+ VMUL t10,ALPHA,t10
|
|
+ VST_UL t08, 0*VEC_LEN*SIZE(C2)
|
|
+ VST_UH t08, 1*VEC_LEN*SIZE(C2)
|
|
+
|
|
+ VMUL t11,ALPHA,t11
|
|
+ VST_UL t09, 1*VEC_LEN*SIZE(C2)
|
|
+ VST_UH t09, 2*VEC_LEN*SIZE(C2)
|
|
+
|
|
+ VST_UL t10, 2*VEC_LEN*SIZE(C2)
|
|
+ VST_UH t10, 3*VEC_LEN*SIZE(C2)
|
|
+
|
|
+ VST_UL t11, 3*VEC_LEN*SIZE(C2)
|
|
+ VST_UH t11, 4*VEC_LEN*SIZE(C2)
|
|
+
|
|
+
|
|
+$Access_C3_16x4:
|
|
+ and C3, (VEC_LEN*SIZE-1),$6
|
|
+ bne $6,$UnAlign_C3_Access_16x4
|
|
+
|
|
+$Align_C3_Access_16x4:
|
|
+ VMUL t12,ALPHA,t12
|
|
+ VMUL t13,ALPHA,t13
|
|
+ VMUL t14,ALPHA,t14
|
|
+ VMUL t15,ALPHA,t15
|
|
+
|
|
+ VST t12,0(C3)
|
|
+ VST t13,4*SIZE(C3)
|
|
+ VST t14,8*SIZE(C3)
|
|
+ VST t15,12*SIZE(C3)
|
|
+ jmp $TRMMKERNEL_16x4
|
|
+
|
|
+$UnAlign_C3_Access_16x4:
|
|
+ VMUL t12,ALPHA,t12
|
|
+ VMUL t13,ALPHA,t13
|
|
+
|
|
+ VMUL t14,ALPHA,t14
|
|
+ VST_UL t12, 0*VEC_LEN*SIZE(C3)
|
|
+ VST_UH t12, 1*VEC_LEN*SIZE(C3)
|
|
+
|
|
+ VMUL t15,ALPHA,t15
|
|
+ VST_UL t13, 1*VEC_LEN*SIZE(C3)
|
|
+ VST_UH t13, 2*VEC_LEN*SIZE(C3)
|
|
+
|
|
+ VST_UL t14, 2*VEC_LEN*SIZE(C3)
|
|
+ VST_UH t14, 3*VEC_LEN*SIZE(C3)
|
|
+
|
|
+ VST_UL t15, 3*VEC_LEN*SIZE(C3)
|
|
+ VST_UH t15, 4*VEC_LEN*SIZE(C3)
|
|
+
|
|
+
|
|
+$TRMMKERNEL_16x4:
|
|
+#if ( defined(LEFT) && defined(TRANSA)) || \
|
|
+ (!defined(LEFT) && !defined(TRANSA))
|
|
+ subl KC1, KK, TEMP # TEMP = length of the skipped (no-data) part
|
|
+#ifdef LEFT
|
|
+ subl TEMP, 16, TEMP # mr=16
|
|
+#else
|
|
+ subl TEMP, 4, TEMP # nr=4
|
|
+#endif
|
|
+
|
|
+ sll TEMP, 4 + BASE_SHIFT,KC # mr=16
|
|
+ sll TEMP, 2 + BASE_SHIFT,TEMP # nr=4
|
|
+
|
|
+ addl A, KC, A # move A to the end of this panel
|
|
+ addl B, TEMP,B # move B to the end of this panel
|
|
+#endif
|
|
+
|
|
+#ifdef LEFT
|
|
+ addl KK, 16 ,KK
|
|
+#endif
|
|
+ nop
|
|
+ jmp $End_NC_Unroll4
|
|
+#endif
|
|
+
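+# TRMM bookkeeping for the 16x4 panel just written back: in the
+# (LEFT && TRANSA) and (!LEFT && !TRANSA) cases TEMP = KC1 - KK minus mr (16)
+# or nr (4) is the length of the K range that was not multiplied, and A/B are
+# advanced past it so they end up at the end of this panel; with LEFT defined,
+# KK then grows by mr = 16 so the next row panel starts deeper in K.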
|
|
+
|
|
+ .align 5
|
|
+
|
|
+.L15: # n=4,m=8-----------------------------
|
|
+ and MC1,8,MC
|
|
+ sll KC1,3+BASE_SHIFT,SPANA # spana=kc1*mc
|
|
+ nop
|
|
+ beq MC,.L16
|
|
+
|
|
+ addl A1,SPANA,PREA
|
|
+ subl PREA,8*SIZE,PREA # PREA-=MC
|
|
+
|
|
+
|
|
+#if defined(TRMMKERNEL)
|
|
+#if (defined(LEFT) && defined(TRANSA))\
|
|
+ || (!defined(LEFT) && !defined(TRANSA))
|
|
+ mov B1, B # set B
|
|
+ nop
|
|
+#else
|
|
+ sll KK, 3 + BASE_SHIFT,KC # mr=8
|
|
+ sll KK, 2 + BASE_SHIFT,TEMP # nr=4
|
|
+
|
|
+ addl A,KC,A
|
|
+ addl B1,TEMP,B
|
|
+#endif
|
|
+
|
|
+ vcpys $f31,$f31,t00 # clear (32 results)
|
|
+ vcpys $f31,$f31,t01
|
|
+ vcpys $f31,$f31,t04
|
|
+ vcpys $f31,$f31,t05
|
|
+
|
|
+ LDDE b0,0(B)
|
|
+ LDDE b1,1*SIZE(B)
|
|
+ LDDE b2,2*SIZE(B)
|
|
+ LDDE b3,3*SIZE(B)
|
|
+
|
|
+ vcpys $f31,$f31,t08
|
|
+ vcpys $f31,$f31,t09
|
|
+ vcpys $f31,$f31,t12
|
|
+ vcpys $f31,$f31,t13
|
|
+
|
|
+ VLD a0,0(A) # get 8 A
|
|
+ VLD a4,4*SIZE(A)
|
|
+
|
|
+ fillcs 0(CO) # fetch C
|
|
+ fillcs 0(C1)
|
|
+ fillcs 0(C2)
|
|
+ fillcs 0(C3)
|
|
+
|
|
+ fillcs 4*SIZE(CO) #
|
|
+ fillcs 4*SIZE(C1)
|
|
+ fillcs 4*SIZE(C2)
|
|
+ fillcs 4*SIZE(C3)
|
|
+
|
|
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
|
+ subl KC1, KK, TEMP # temp is the length of the data part
|
|
+#elif defined(LEFT)
|
|
+ addl KK, 8, TEMP # mr=8
|
|
+#else
|
|
+ addl KK, 4, TEMP # nr=4
|
|
+#endif
|
|
+ sra TEMP,1, KC # kc/2
|
|
+ beq KC,$Rest_8x4x1
|
|
+
|
|
+#else
|
|
+
|
|
+ mov B1,B # Reset B
|
|
+ sra KC1,1,KC # unroll kc as 2, kc=kc1/2
|
|
+ vcpys $f31,$f31,t00 # clear (32 results)
|
|
+ vcpys $f31,$f31,t01
|
|
+ vcpys $f31,$f31,t04
|
|
+ vcpys $f31,$f31,t05
|
|
+
|
|
+ LDDE b0,0(B)
|
|
+ LDDE b1,1*SIZE(B)
|
|
+ LDDE b2,2*SIZE(B)
|
|
+ LDDE b3,3*SIZE(B)
|
|
+
|
|
+ vcpys $f31,$f31,t08
|
|
+ vcpys $f31,$f31,t09
|
|
+ vcpys $f31,$f31,t12
|
|
+ vcpys $f31,$f31,t13
|
|
+
|
|
+ VLD a0,0(A) # get 8 A
|
|
+ VLD a4,4*SIZE(A)
|
|
+
|
|
+ fillcs 0(CO) # fetch C
|
|
+ fillcs 0(C1)
|
|
+ fillcs 0(C2)
|
|
+ fillcs 0(C3)
|
|
+
|
|
+ fillcs 4*SIZE(CO) #
|
|
+ fillcs 4*SIZE(C1)
|
|
+ fillcs 4*SIZE(C2)
|
|
+ fillcs 4*SIZE(C3)
|
|
+
|
|
+ beq KC,$Rest_8x4x1
|
|
+#endif
|
|
+
|
|
+ .align 5
|
|
+
|
|
+$Panel_8x4x2:
|
|
+ VMAD a0,b0,t00,t00
|
|
+ VMAD a0,b1,t04,t04
|
|
+ VMAD a0,b2,t08,t08
|
|
+ VMAD a0,b3,t12,t12
|
|
+
|
|
+ LDDE nb0,4*SIZE(B) # get next 4b
|
|
+ LDDE nb1,5*SIZE(B)
|
|
+ LDDE nb2,6*SIZE(B)
|
|
+ LDDE nb3,7*SIZE(B)
|
|
+
|
|
+ addl B,8*SIZE,B # 4n*2k
|
|
+ VMAD a4,b0,t01,t01
|
|
+ VMAD a4,b1,t05,t05
|
|
+ VMAD a4,b2,t09,t09
|
|
+ VMAD a4,b3,t13,t13
|
|
+
|
|
+ VLD na8,8*SIZE(A) # get next 8a
|
|
+ VLD na12,12*SIZE(A)
|
|
+
|
|
+ fillcs 0(PREA)
|
|
+ fillcs 4*SIZE(PREA)
|
|
+ subl PREA,8*SIZE,PREA # prea -= 8
|
|
+
|
|
+ subl KC,1,KC
|
|
+ addl A,16*SIZE,A # ### next k ###8m*2k
|
|
+ VMAD na8,nb0,t00,t00
|
|
+ VMAD na8,nb1,t04,t04
|
|
+ VMAD na8,nb2,t08,t08
|
|
+ VMAD na8,nb3,t12,t12
|
|
+
|
|
+ LDDE b0,0(B) # get 3rd 4b
|
|
+ LDDE b1,1*SIZE(B)
|
|
+ LDDE b2,2*SIZE(B)
|
|
+ LDDE b3,3*SIZE(B)
|
|
+
|
|
+ VMAD na12,nb0,t01,t01
|
|
+ VMAD na12,nb1,t05,t05
|
|
+ VMAD na12,nb2,t09,t09
|
|
+ VMAD na12,nb3,t13,t13
|
|
+
|
|
+ VLD a0,0(A) # get 3rd 8a
|
|
+ VLD a4,4*SIZE(A)
|
|
+
|
|
+ fillcs 0(PREA)
|
|
+ fillcs 4*SIZE(PREA)
|
|
+ subl PREA,8*SIZE,PREA # prea -= mc
|
|
+ bne KC,$Panel_8x4x2 # loop k--
|
|
+
|
|
+$Rest_8x4x1:
|
|
+ LDDE ALPHA, 192($sp) # get alpha
|
|
+#ifndef TRMMKERNEL
|
|
+ blbc KC1, $Write_8x4
|
|
+#else
|
|
+ blbc TEMP, $Write_8x4
|
|
+#endif
|
|
+
|
|
+ addl A,8*SIZE,A # 8a*1k
|
|
+ addl B,4*SIZE,B # 4b*1K
|
|
+
|
|
+ VMAD a0,b0,t00,t00
|
|
+ VMAD a0,b1,t04,t04
|
|
+ VMAD a0,b2,t08,t08
|
|
+ VMAD a0,b3,t12,t12
|
|
+
|
|
+ fillcs 0(PREA)
|
|
+ fillcs 4*SIZE(PREA)
|
|
+ subl PREA,8*SIZE,PREA
|
|
+
|
|
+ VMAD a4,b0,t01,t01
|
|
+ VMAD a4,b1,t05,t05
|
|
+ VMAD a4,b2,t09,t09
|
|
+ VMAD a4,b3,t13,t13
|
|
+
|
|
+$Write_8x4:
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ and CO, (VEC_LEN*SIZE-1),$6
|
|
+ bne $6,$UnAlign_CO_Access_8x4
|
|
+
|
|
+$Align_CO_Access_8x4:
|
|
+ VLD c00,0(CO) # get the 1st column of C (8 values)
|
|
+ VLD c01,4*SIZE(CO)
|
|
+
|
|
+ VMAD t00,ALPHA,c00,t00
|
|
+ VMAD t01,ALPHA,c01,t01
|
|
+
|
|
+ VST t00,0(CO)
|
|
+ VST t01,4*SIZE(CO)
|
|
+ jmp $Access_C1_8x4
|
|
+
|
|
+$UnAlign_CO_Access_8x4:
|
|
+ VLD_UL c00, 0*VEC_LEN*SIZE(CO)
|
|
+ VLD_UH c02, 1*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ VLD_UL c01, 1*VEC_LEN*SIZE(CO)
|
|
+ VLD_UH c03, 2*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ vbisw c00,c02,c00
|
|
+ vbisw c01,c03,c01
|
|
+
|
|
+ VMAD t00,ALPHA,c00,t00
|
|
+ VMAD t01,ALPHA,c01,t01
|
|
+
|
|
+ VST_UL t00, 0*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t00, 1*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ VST_UL t01, 1*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t01, 2*VEC_LEN*SIZE(CO)
|
|
+
|
|
+
|
|
+$Access_C1_8x4:
|
|
+ and C1, (VEC_LEN*SIZE-1),$6
|
|
+ addl CO,8*SIZE,CO
|
|
+ nop
|
|
+ bne $6,$UnAlign_C1_Access_8x4
|
|
+
|
|
+$Align_C1_Access_8x4:
|
|
+ VLD c04,0(C1)
|
|
+ VLD c05,4*SIZE(C1)
|
|
+
|
|
+ VMAD t04,ALPHA,c04,t04
|
|
+ VMAD t05,ALPHA,c05,t05
|
|
+
|
|
+ VST t04,0(C1)
|
|
+ VST t05,4*SIZE(C1)
|
|
+ jmp $Access_C2_8x4
|
|
+
|
|
+$UnAlign_C1_Access_8x4:
|
|
+ VLD_UL c04, 0*VEC_LEN*SIZE(C1)
|
|
+ VLD_UH c06, 1*VEC_LEN*SIZE(C1)
|
|
+
|
|
+ VLD_UL c05, 1*VEC_LEN*SIZE(C1)
|
|
+ VLD_UH c07, 2*VEC_LEN*SIZE(C1)
|
|
+
|
|
+ vbisw c04,c06,c04
|
|
+ vbisw c05,c07,c05
|
|
+
|
|
+ VMAD t04,ALPHA,c04,t04
|
|
+ VMAD t05,ALPHA,c05,t05
|
|
+
|
|
+ VST_UL t04, 0*VEC_LEN*SIZE(C1)
|
|
+ VST_UH t04, 1*VEC_LEN*SIZE(C1)
|
|
+
|
|
+ VST_UL t05, 1*VEC_LEN*SIZE(C1)
|
|
+ VST_UH t05, 2*VEC_LEN*SIZE(C1)
|
|
+
|
|
+
|
|
+$Access_C2_8x4:
|
|
+ and C2, (VEC_LEN*SIZE-1),$6
|
|
+ addl C1,8*SIZE,C1
|
|
+ nop
|
|
+ bne $6,$UnAlign_C2_Access_8x4
|
|
+
|
|
+$Align_C2_Access_8x4:
|
|
+ VLD c08,0(C2)
|
|
+ VLD c09,4*SIZE(C2)
|
|
+
|
|
+ VMAD t08,ALPHA,c08,t08
|
|
+ VMAD t09,ALPHA,c09,t09
|
|
+
|
|
+ VST t08,0(C2)
|
|
+ VST t09,4*SIZE(C2)
|
|
+ jmp $Access_C3_8x4
|
|
+
|
|
+$UnAlign_C2_Access_8x4:
|
|
+ VLD_UL c08, 0*VEC_LEN*SIZE(C2)
|
|
+ VLD_UH c10, 1*VEC_LEN*SIZE(C2)
|
|
+
|
|
+ VLD_UL c09, 1*VEC_LEN*SIZE(C2)
|
|
+ VLD_UH c11, 2*VEC_LEN*SIZE(C2)
|
|
+
|
|
+ vbisw c08,c10,c08
|
|
+ vbisw c09,c11,c09
|
|
+
|
|
+ VMAD t08,ALPHA,c08,t08
|
|
+ VMAD t09,ALPHA,c09,t09
|
|
+
|
|
+ VST_UL t08, 0*VEC_LEN*SIZE(C2)
|
|
+ VST_UH t08, 1*VEC_LEN*SIZE(C2)
|
|
+
|
|
+ VST_UL t09, 1*VEC_LEN*SIZE(C2)
|
|
+ VST_UH t09, 2*VEC_LEN*SIZE(C2)
|
|
+
|
|
+
|
|
+$Access_C3_8x4:
|
|
+ and C3, (VEC_LEN*SIZE-1),$6
|
|
+ addl C2,8*SIZE,C2
|
|
+ nop
|
|
+ bne $6,$UnAlign_C3_Access_8x4
|
|
+
|
|
+$Align_C3_Access_8x4:
|
|
+ VLD c12,0(C3)
|
|
+ VLD c13,4*SIZE(C3)
|
|
+
|
|
+ VMAD t12,ALPHA,c12,t12
|
|
+ VMAD t13,ALPHA,c13,t13
|
|
+
|
|
+ VST t12,0(C3)
|
|
+ VST t13,4*SIZE(C3)
|
|
+ addl C3,8*SIZE,C3
|
|
+ jmp .L16
|
|
+
|
|
+
|
|
+$UnAlign_C3_Access_8x4:
|
|
+ VLD_UL c12, 0*VEC_LEN*SIZE(C3)
|
|
+ VLD_UH c14, 1*VEC_LEN*SIZE(C3)
|
|
+
|
|
+ VLD_UL c13, 1*VEC_LEN*SIZE(C3)
|
|
+ VLD_UH c15, 2*VEC_LEN*SIZE(C3)
|
|
+
|
|
+ vbisw c12,c14,c12
|
|
+ vbisw c13,c15,c13
|
|
+
|
|
+ VMAD t12,ALPHA,c12,t12
|
|
+ VMAD t13,ALPHA,c13,t13
|
|
+
|
|
+ VST_UL t12, 0*VEC_LEN*SIZE(C3)
|
|
+ VST_UH t12, 1*VEC_LEN*SIZE(C3)
|
|
+
|
|
+ VST_UL t13, 1*VEC_LEN*SIZE(C3)
|
|
+ VST_UH t13, 2*VEC_LEN*SIZE(C3)
|
|
+ addl C3,8*SIZE,C3
|
|
+
|
|
+#else
|
|
+
|
|
+ and CO, (VEC_LEN*SIZE-1),$6
|
|
+ bne $6,$UnAlign_CO_Access_8x4
|
|
+
|
|
+$Align_CO_Access_8x4:
|
|
+ VMUL t00,ALPHA,t00
|
|
+ VMUL t01,ALPHA,t01
|
|
+
|
|
+ VST t00,0(CO)
|
|
+ VST t01,4*SIZE(CO)
|
|
+ jmp $Access_C1_8x4
|
|
+
|
|
+$UnAlign_CO_Access_8x4:
|
|
+ VMUL t00,ALPHA,t00
|
|
+ VMUL t01,ALPHA,t01
|
|
+
|
|
+ VST_UL t00, 0*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t00, 1*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ VST_UL t01, 1*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t01, 2*VEC_LEN*SIZE(CO)
|
|
+
|
|
+
|
|
+$Access_C1_8x4:
|
|
+ and C1, (VEC_LEN*SIZE-1),$6
|
|
+ addl CO,8*SIZE,CO # 8c
|
|
+ nop
|
|
+ bne $6,$UnAlign_C1_Access_8x4
|
|
+
|
|
+$Align_C1_Access_8x4:
|
|
+ VMUL t04,ALPHA,t04
|
|
+ VMUL t05,ALPHA,t05
|
|
+
|
|
+ VST t04,0(C1)
|
|
+ VST t05,4*SIZE(C1)
|
|
+ jmp $Access_C2_8x4
|
|
+
|
|
+$UnAlign_C1_Access_8x4:
|
|
+ VMUL t04,ALPHA,t04
|
|
+ VMUL t05,ALPHA,t05
|
|
+
|
|
+ VST_UL t04, 0*VEC_LEN*SIZE(C1)
|
|
+ VST_UH t04, 1*VEC_LEN*SIZE(C1)
|
|
+
|
|
+ VST_UL t05, 1*VEC_LEN*SIZE(C1)
|
|
+ VST_UH t05, 2*VEC_LEN*SIZE(C1)
|
|
+
|
|
+
|
|
+$Access_C2_8x4:
|
|
+ and C2, (VEC_LEN*SIZE-1),$6
|
|
+ addl C1,8*SIZE,C1 # 8c
|
|
+ nop
|
|
+ bne $6,$UnAlign_C2_Access_8x4
|
|
+
|
|
+$Align_C2_Access_8x4:
|
|
+ VMUL t08,ALPHA,t08
|
|
+ VMUL t09,ALPHA,t09
|
|
+
|
|
+ VST t08,0(C2)
|
|
+ VST t09,4*SIZE(C2)
|
|
+ jmp $Access_C3_8x4
|
|
+
|
|
+$UnAlign_C2_Access_8x4:
|
|
+ VMUL t08,ALPHA,t08
|
|
+ VMUL t09,ALPHA,t09
|
|
+
|
|
+ VST_UL t08, 0*VEC_LEN*SIZE(C2)
|
|
+ VST_UH t08, 1*VEC_LEN*SIZE(C2)
|
|
+
|
|
+ VST_UL t09, 1*VEC_LEN*SIZE(C2)
|
|
+ VST_UH t09, 2*VEC_LEN*SIZE(C2)
|
|
+
|
|
+
|
|
+$Access_C3_8x4:
|
|
+ and C3, (VEC_LEN*SIZE-1),$6
|
|
+ addl C2,8*SIZE,C2 # 8c
|
|
+ nop
|
|
+ bne $6,$UnAlign_C3_Access_8x4
|
|
+
|
|
+$Align_C3_Access_8x4:
|
|
+ VMUL t12,ALPHA,t12
|
|
+ VMUL t13,ALPHA,t13
|
|
+
|
|
+ VST t12,0(C3)
|
|
+ VST t13,4*SIZE(C3)
|
|
+ addl C3,8*SIZE,C3
|
|
+ jmp $TRMMKERNEL_8x4
|
|
+
|
|
+$UnAlign_C3_Access_8x4:
|
|
+ VMUL t12,ALPHA,t12
|
|
+ VMUL t13,ALPHA,t13
|
|
+
|
|
+ VST_UL t12, 0*VEC_LEN*SIZE(C3)
|
|
+ VST_UH t12, 1*VEC_LEN*SIZE(C3)
|
|
+
|
|
+ VST_UL t13, 1*VEC_LEN*SIZE(C3)
|
|
+ VST_UH t13, 2*VEC_LEN*SIZE(C3)
|
|
+ addl C3,8*SIZE,C3
|
|
+
|
|
+$TRMMKERNEL_8x4:
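+# TRMM bookkeeping: when needed, step A and B past the rest of this 8x4 panel; KK advances by MR (8) in the LEFT case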
|
|
+#if ( defined(LEFT) && defined(TRANSA)) || \
|
|
+ (!defined(LEFT) && !defined(TRANSA))
|
|
+ subl KC1, KK, TEMP
|
|
+#ifdef LEFT
|
|
+ subl TEMP, 8,TEMP # mr=8
|
|
+#else
|
|
+ subl TEMP, 4,TEMP # nr=4
|
|
+#endif
|
|
+
|
|
+ sll TEMP, 3 + BASE_SHIFT,KC
|
|
+ sll TEMP, 2 + BASE_SHIFT,TEMP
|
|
+
|
|
+ addl A, KC, A # move A, B to the end of this panel
|
|
+ addl B, TEMP, B
|
|
+#endif
|
|
+
|
|
+#ifdef LEFT
|
|
+ addl KK, 8, KK
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+
|
|
+
|
|
+ .align 5
|
|
+
|
|
+.L16:
|
|
+ and MC1,4,MC # nr=4,mr=4----------------------------
|
|
+ sll KC1,2+BASE_SHIFT,SPANA # spana=kc1*mc
|
|
+ nop
|
|
+ beq MC,.L17
|
|
+
|
|
+ addl A1,SPANA,PREA
|
|
+ subl PREA,4*SIZE,PREA # PREA-=MC
|
|
+
|
|
+#if defined(TRMMKERNEL)
|
|
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
|
+ mov B1,B # Set B
|
|
+ nop
|
|
+#else
|
|
+ sll KK, 2 + BASE_SHIFT,KC # mr=nr=4
|
|
+ nop
|
|
+
|
|
+ addl A, KC, A
|
|
+ addl B1,KC, B
|
|
+#endif
|
|
+
|
|
+	vcpys	$f31,$f31,t00		# clear 4 vector registers (16 results)
|
|
+ vcpys $f31,$f31,t04
|
|
+ vcpys $f31,$f31,t08
|
|
+ vcpys $f31,$f31,t12
|
|
+
|
|
+ LDDE b0,0(B) # get 4b
|
|
+ LDDE b1,1*SIZE(B)
|
|
+ LDDE b2,2*SIZE(B)
|
|
+ LDDE b3,3*SIZE(B)
|
|
+
|
|
+ VLD a0,0(A) # get 4a
|
|
+
|
|
+ fillcs 0(CO) # prefetch C
|
|
+ fillcs 0(C1)
|
|
+ fillcs 0(C2)
|
|
+ fillcs 0(C3)
|
|
+
|
|
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
|
+ subl KC1, KK, TEMP
|
|
+#else
|
|
+ addl KK, 4, TEMP
|
|
+#endif
|
|
+ sra TEMP,1,KC
|
|
+ nop
|
|
+ beq KC,$Rest_4x4x1
|
|
+
|
|
+#else
|
|
+ mov B1,B # Reset B
|
|
+ sra KC1,1,KC # Unroll KC 2, KC=KC1/2
|
|
+	vcpys	$f31,$f31,t00		# clear 4 vector registers (16 results)
|
|
+ vcpys $f31,$f31,t04
|
|
+ vcpys $f31,$f31,t08
|
|
+ vcpys $f31,$f31,t12
|
|
+
|
|
+ LDDE b0,0(B) # get 4b
|
|
+ LDDE b1,1*SIZE(B)
|
|
+ LDDE b2,2*SIZE(B)
|
|
+ LDDE b3,3*SIZE(B)
|
|
+
|
|
+ VLD a0,0(A) # get 4a
|
|
+
|
|
+ fillcs 0(CO) # prefetch C
|
|
+ fillcs 0(C1)
|
|
+ fillcs 0(C2)
|
|
+ fillcs 0(C3)
|
|
+
|
|
+ beq KC,$Rest_4x4x1
|
|
+
|
|
+#endif
|
|
+
|
|
+
|
|
+$Panel_4x4x2:
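+# main K loop, unrolled by 2: one A vector (4 elements) times 4 B values per k step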
|
|
+ VMAD a0,b0,t00,t00
|
|
+ VMAD a0,b1,t04,t04
|
|
+ VMAD a0,b2,t08,t08
|
|
+ VMAD a0,b3,t12,t12
|
|
+
|
|
+ VLD a4,4*SIZE(A)
|
|
+ LDDE nb0,4*SIZE(B) # get next 4b and 4a
|
|
+ LDDE nb1,5*SIZE(B)
|
|
+ LDDE nb2,6*SIZE(B)
|
|
+ LDDE nb3,7*SIZE(B)
|
|
+ addl B,8*SIZE,B # 4b*2k
|
|
+
|
|
+ fillcs 0(PREA)
|
|
+ subl PREA,4*SIZE,PREA
|
|
+
|
|
+ subl KC,1,KC
|
|
+ VMAD a4,nb0,t00,t00
|
|
+ VMAD a4,nb1,t04,t04
|
|
+ VMAD a4,nb2,t08,t08
|
|
+ VMAD a4,nb3,t12,t12
|
|
+
|
|
+ addl A,8*SIZE,A # 4a*2k
|
|
+ LDDE b0,0(B) # get 3rd 4b and 4a
|
|
+ LDDE b1,1*SIZE(B)
|
|
+ LDDE b2,2*SIZE(B)
|
|
+ LDDE b3,3*SIZE(B)
|
|
+ VLD a0,0(A)
|
|
+
|
|
+ fillcs 0(PREA)
|
|
+ subl PREA,4*SIZE,PREA
|
|
+ bne KC,$Panel_4x4x2
|
|
+
|
|
+
|
|
+$Rest_4x4x1:
|
|
+ LDDE ALPHA, 192($sp) # Get ALPHA
|
|
+#ifndef TRMMKERNEL
|
|
+ blbc KC1, $Write_4x4
|
|
+#else
|
|
+ blbc TEMP, $Write_4x4
|
|
+#endif
|
|
+
|
|
+ addl A,4*SIZE,A # 4a*1k
|
|
+ addl B,4*SIZE,B # 4b*1K
|
|
+
|
|
+ fillcs 0(PREA)
|
|
+ subl PREA,4*SIZE,PREA
|
|
+
|
|
+ VMAD a0,b0,t00,t00
|
|
+ VMAD a0,b1,t04,t04
|
|
+ VMAD a0,b2,t08,t08
|
|
+ VMAD a0,b3,t12,t12
|
|
+
|
|
+
|
|
+$Write_4x4:
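+# write back the 4x4 tile: choose the aligned or unaligned C path from the low bits of each C pointer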
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ and CO, (VEC_LEN*SIZE-1),$6
|
|
+ bne $6,$UnAlign_CO_Access_4x4
|
|
+
|
|
+$Align_CO_Access_4x4:
|
|
+	VLD	c00,0(CO)		# get 1st column of C
|
|
+ VMAD t00,ALPHA,c00,t00
|
|
+ VST t00,0(CO)
|
|
+ jmp $Access_C1_4x4
|
|
+
|
|
+$UnAlign_CO_Access_4x4:
|
|
+ VLD_UL c00, 0*VEC_LEN*SIZE(CO)
|
|
+ VLD_UH c02, 1*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ vbisw c00,c02,c00
|
|
+
|
|
+ VMAD t00,ALPHA,c00,t00
|
|
+
|
|
+ VST_UL t00, 0*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t00, 1*VEC_LEN*SIZE(CO)
|
|
+
|
|
+
|
|
+$Access_C1_4x4:
|
|
+ and C1, (VEC_LEN*SIZE-1),$6
|
|
+ addl CO,4*SIZE,CO # 4c
|
|
+ nop
|
|
+ bne $6,$UnAlign_C1_Access_4x4
|
|
+
|
|
+$Align_C1_Access_4x4:
|
|
+ VLD c04,0(C1)
|
|
+ VMAD t04,ALPHA,c04,t04
|
|
+ VST t04,0(C1)
|
|
+ jmp $Access_C2_4x4
|
|
+
|
|
+$UnAlign_C1_Access_4x4:
|
|
+ VLD_UL c04, 0*VEC_LEN*SIZE(C1)
|
|
+ VLD_UH c06, 1*VEC_LEN*SIZE(C1)
|
|
+
|
|
+ vbisw c04,c06,c04
|
|
+
|
|
+ VMAD t04,ALPHA,c04,t04
|
|
+
|
|
+ VST_UL t04, 0*VEC_LEN*SIZE(C1)
|
|
+ VST_UH t04, 1*VEC_LEN*SIZE(C1)
|
|
+
|
|
+
|
|
+$Access_C2_4x4:
|
|
+ and C2, (VEC_LEN*SIZE-1),$6
|
|
+ addl C1,4*SIZE,C1 # 4c
|
|
+ nop
|
|
+ bne $6,$UnAlign_C2_Access_4x4
|
|
+
|
|
+$Align_C2_Access_4x4:
|
|
+ VLD c08,0(C2)
|
|
+ VMAD t08,ALPHA,c08,t08
|
|
+ VST t08,0(C2)
|
|
+ jmp $Access_C3_4x4
|
|
+
|
|
+$UnAlign_C2_Access_4x4:
|
|
+ VLD_UL c08, 0*VEC_LEN*SIZE(C2)
|
|
+ VLD_UH c10, 1*VEC_LEN*SIZE(C2)
|
|
+
|
|
+ vbisw c08,c10,c08
|
|
+
|
|
+ VMAD t08,ALPHA,c08,t08
|
|
+
|
|
+ VST_UL t08, 0*VEC_LEN*SIZE(C2)
|
|
+ VST_UH t08, 1*VEC_LEN*SIZE(C2)
|
|
+
|
|
+
|
|
+$Access_C3_4x4:
|
|
+ and C3, (VEC_LEN*SIZE-1),$6
|
|
+ addl C2,4*SIZE,C2 # 4c
|
|
+ nop
|
|
+ bne $6,$UnAlign_C3_Access_4x4
|
|
+
|
|
+$Align_C3_Access_4x4:
|
|
+ VLD c12,0(C3)
|
|
+ VMAD t12,ALPHA,c12,t12
|
|
+ VST t12,0(C3)
|
|
+ addl C3,4*SIZE,C3
|
|
+ jmp .L17
|
|
+
|
|
+$UnAlign_C3_Access_4x4:
|
|
+ VLD_UL c12, 0*VEC_LEN*SIZE(C3)
|
|
+ VLD_UH c14, 1*VEC_LEN*SIZE(C3)
|
|
+
|
|
+ vbisw c12,c14,c12
|
|
+
|
|
+ VMAD t12,ALPHA,c12,t12
|
|
+
|
|
+ VST_UL t12, 0*VEC_LEN*SIZE(C3)
|
|
+ VST_UH t12, 1*VEC_LEN*SIZE(C3)
|
|
+ addl C3,4*SIZE,C3
|
|
+
|
|
+
|
|
+#else
|
|
+
|
|
+ and CO, (VEC_LEN*SIZE-1),$6
|
|
+ bne $6,$UnAlign_CO_Access_4x4
|
|
+
|
|
+$Align_CO_Access_4x4:
|
|
+ VMUL t00,ALPHA,t00
|
|
+ VST t00,0(CO)
|
|
+ jmp $Access_C1_4x4
|
|
+
|
|
+$UnAlign_CO_Access_4x4:
|
|
+ VMUL t00,ALPHA,t00 # careful: c00~c03 use the same register
|
|
+ VST_UL t00, 0*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t00, 1*VEC_LEN*SIZE(CO)
|
|
+
|
|
+
|
|
+$Access_C1_4x4:
|
|
+ and C1, (VEC_LEN*SIZE-1),$6
|
|
+ addl CO,4*SIZE,CO # 4c
|
|
+ nop
|
|
+ bne $6,$UnAlign_C1_Access_4x4
|
|
+
|
|
+$Align_C1_Access_4x4:
|
|
+ VMUL t04,ALPHA,t04
|
|
+ VST t04,0(C1)
|
|
+ jmp $Access_C2_4x4
|
|
+
|
|
+$UnAlign_C1_Access_4x4:
|
|
+ VMUL t04,ALPHA,t04
|
|
+ VST_UL t04, 0*VEC_LEN*SIZE(C1)
|
|
+ VST_UH t04, 1*VEC_LEN*SIZE(C1)
|
|
+
|
|
+
|
|
+$Access_C2_4x4:
|
|
+ and C2, (VEC_LEN*SIZE-1),$6
|
|
+ addl C1,4*SIZE,C1 # 4c
|
|
+ nop
|
|
+ bne $6,$UnAlign_C2_Access_4x4
|
|
+
|
|
+$Align_C2_Access_4x4:
|
|
+ VMUL t08,ALPHA,t08
|
|
+ VST t08,0(C2)
|
|
+ jmp $Access_C3_4x4
|
|
+
|
|
+$UnAlign_C2_Access_4x4:
|
|
+ VMUL t08,ALPHA,t08
|
|
+ VST_UL t08, 0*VEC_LEN*SIZE(C2)
|
|
+ VST_UH t08, 1*VEC_LEN*SIZE(C2)
|
|
+
|
|
+
|
|
+$Access_C3_4x4:
|
|
+ and C3, (VEC_LEN*SIZE-1),$6
|
|
+ addl C2,4*SIZE,C2 # 4c
|
|
+ nop
|
|
+ bne $6,$UnAlign_C3_Access_4x4
|
|
+
|
|
+$Align_C3_Access_4x4:
|
|
+ VMUL t12,ALPHA,t12
|
|
+ VST t12,0(C3)
|
|
+ addl C3,4*SIZE,C3
|
|
+ jmp $TRMMKERNEL_4x4
|
|
+
|
|
+$UnAlign_C3_Access_4x4:
|
|
+ VMUL t12,ALPHA,t12
|
|
+ VST_UL t12, 0*VEC_LEN*SIZE(C3)
|
|
+ VST_UH t12, 1*VEC_LEN*SIZE(C3)
|
|
+ addl C3,4*SIZE,C3
|
|
+
|
|
+$TRMMKERNEL_4x4:
|
|
+#if ( defined(LEFT) && defined(TRANSA)) || \
|
|
+ (!defined(LEFT) && !defined(TRANSA))
|
|
+ subl KC1, KK, TEMP
|
|
+ subl TEMP, 4, TEMP # mr=nr=4
|
|
+
|
|
+ sll TEMP, 2 + BASE_SHIFT,KC
|
|
+ nop
|
|
+
|
|
+	addl	A, KC, A		# move A and B to the end of this panel
|
|
+ addl B, KC, B
|
|
+#endif
|
|
+
|
|
+#ifdef LEFT
|
|
+ addl KK, 4, KK
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+ .align 5
|
|
+.L17: # nr=4,mr=2--------------------
|
|
+ and MC1,2,MC
|
|
+ beq MC,.L18
|
|
+
|
|
+#if defined(TRMMKERNEL)
|
|
+#if (defined(LEFT) && defined(TRANSA))\
|
|
+ || (!defined(LEFT) && !defined(TRANSA))
|
|
+ mov B1, B
|
|
+#else
|
|
+ sll KK, 1 + BASE_SHIFT, KC # mr=2
|
|
+ sll KK, 2 + BASE_SHIFT, TEMP # nr=4
|
|
+
|
|
+ addl A, KC, A
|
|
+ addl B1,TEMP, B
|
|
+#endif
|
|
+
|
|
+	fclr	t00			# clear 8 registers
|
|
+ fclr t01
|
|
+ fclr t04
|
|
+ fclr t05
|
|
+ fclr t08
|
|
+ fclr t09
|
|
+ fclr t12
|
|
+ fclr t13
|
|
+
|
|
+ LD b0,0(B) # get 4b
|
|
+ LD b1,1*SIZE(B)
|
|
+ LD a0,0(A) # get 2a
|
|
+ LD b2,2*SIZE(B)
|
|
+ LD b3,3*SIZE(B)
|
|
+ LD a4,1*SIZE(A)
|
|
+
|
|
+ fillcs 0(CO) # prefetch C
|
|
+ fillcs 0(C1)
|
|
+ fillcs 0(C2)
|
|
+ fillcs 0(C3)
|
|
+
|
|
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
|
+ subl KC1, KK, TEMP
|
|
+#elif defined(LEFT)
|
|
+ addl KK, 2, TEMP # mr=2
|
|
+#else
|
|
+ addl KK, 4, TEMP # nr=4
|
|
+#endif
|
|
+ sra TEMP, 1, KC
|
|
+ beq KC,$Rest_2x4x1
|
|
+
|
|
+#else
|
|
+ mov B1,B # reset B
|
|
+ sra KC1,1,KC # Unroll KC 2, KC=KC1/2
|
|
+	fclr	t00			# clear 8 registers
|
|
+ fclr t01
|
|
+ fclr t04
|
|
+ fclr t05
|
|
+ fclr t08
|
|
+ fclr t09
|
|
+ fclr t12
|
|
+ fclr t13
|
|
+
|
|
+ LD b0,0(B) # get 4b
|
|
+ LD b1,1*SIZE(B)
|
|
+ LD a0,0(A) # get 2a
|
|
+ LD b2,2*SIZE(B)
|
|
+ LD b3,3*SIZE(B)
|
|
+ LD a4,1*SIZE(A)
|
|
+
|
|
+ fillcs 0(CO) # prefetch C
|
|
+ fillcs 0(C1)
|
|
+ fillcs 0(C2)
|
|
+ fillcs 0(C3)
|
|
+
|
|
+ beq KC,$Rest_2x4x1
|
|
+#endif
|
|
+
|
|
+
|
|
+$Panel_2x4x2:
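+# K loop, unrolled by 2: scalar path, 2 A elements times 4 B values per k step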
|
|
+ MAD a0,b0,t00,t00
|
|
+ MAD a0,b1,t04,t04
|
|
+ MAD a0,b2,t08,t08
|
|
+ MAD a0,b3,t12,t12
|
|
+
|
|
+ LD nb0,4*SIZE(B) # get next 4b and 2a
|
|
+ LD nb1,5*SIZE(B)
|
|
+ LD a8,2*SIZE(A)
|
|
+ LD nb2,6*SIZE(B)
|
|
+ LD nb3,7*SIZE(B)
|
|
+ LD a12,3*SIZE(A)
|
|
+ addl B,8*SIZE,B # 4b*2k
|
|
+
|
|
+ MAD a4,b0,t01,t01
|
|
+ MAD a4,b1,t05,t05
|
|
+ MAD a4,b2,t09,t09
|
|
+ MAD a4,b3,t13,t13
|
|
+
|
|
+ subl KC,1,KC
|
|
+ MAD a8,nb0,t00,t00
|
|
+ MAD a8,nb1,t04,t04
|
|
+ MAD a8,nb2,t08,t08
|
|
+ MAD a8,nb3,t12,t12
|
|
+
|
|
+ addl A,4*SIZE,A # 2a*2k
|
|
+ LD b0,0(B) # get 3rd 4b and 2a
|
|
+ LD b1,1*SIZE(B)
|
|
+ LD a0,0(A)
|
|
+ LD b2,2*SIZE(B)
|
|
+ LD b3,3*SIZE(B)
|
|
+ LD a4,1*SIZE(A)
|
|
+
|
|
+ MAD a12,nb0,t01,t01
|
|
+ MAD a12,nb1,t05,t05
|
|
+ MAD a12,nb2,t09,t09
|
|
+ MAD a12,nb3,t13,t13
|
|
+
|
|
+ bne KC,$Panel_2x4x2
|
|
+
|
|
+
|
|
+$Rest_2x4x1:
|
|
+ LD ALPHA, 192($sp) # get alpha
|
|
+#ifndef TRMMKERNEL
|
|
+ blbc KC1, $Write_2x4
|
|
+#else
|
|
+ blbc TEMP, $Write_2x4
|
|
+#endif
|
|
+
|
|
+ addl A,2*SIZE,A # 2a*1k
|
|
+ addl B,4*SIZE,B # 4b*1K
|
|
+
|
|
+ MAD a0,b0,t00,t00
|
|
+ MAD a0,b1,t04,t04
|
|
+ MAD a0,b2,t08,t08
|
|
+ MAD a0,b3,t12,t12
|
|
+
|
|
+ MAD a4,b0,t01,t01
|
|
+ MAD a4,b1,t05,t05
|
|
+ MAD a4,b2,t09,t09
|
|
+ MAD a4,b3,t13,t13
|
|
+
|
|
+$Write_2x4:
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c00,0(CO)
|
|
+ LD c01,1*SIZE(CO)
|
|
+ LD c04,0(C1)
|
|
+ LD c05,1*SIZE(C1)
|
|
+
|
|
+ MAD t00,ALPHA,c00,t00
|
|
+ MAD t01,ALPHA,c01,t01
|
|
+
|
|
+ LD c08,0(C2)
|
|
+ LD c09,1*SIZE(C2)
|
|
+
|
|
+ MAD t04,ALPHA,c04,t04
|
|
+ MAD t05,ALPHA,c05,t05
|
|
+
|
|
+ LD c12,0(C3)
|
|
+ LD c13,1*SIZE(C3)
|
|
+
|
|
+ MAD t08,ALPHA,c08,t08
|
|
+ MAD t09,ALPHA,c09,t09
|
|
+
|
|
+ addl CO,2*SIZE,CO # 2c
|
|
+ addl C1,2*SIZE,C1
|
|
+ addl C2,2*SIZE,C2
|
|
+ addl C3,2*SIZE,C3
|
|
+
|
|
+ ST t00,-2*SIZE(CO) # 2c
|
|
+ ST t01,-1*SIZE(CO)
|
|
+
|
|
+ MAD t12,ALPHA,c12,t12
|
|
+ MAD t13,ALPHA,c13,t13
|
|
+
|
|
+ ST t04,-2*SIZE(C1)
|
|
+ ST t05,-1*SIZE(C1)
|
|
+
|
|
+ ST t08,-2*SIZE(C2)
|
|
+ ST t09,-1*SIZE(C2)
|
|
+
|
|
+ ST t12,-2*SIZE(C3)
|
|
+ ST t13,-1*SIZE(C3)
|
|
+
|
|
+#else
|
|
+ MUL t00,ALPHA,t00
|
|
+ MUL t01,ALPHA,t01
|
|
+
|
|
+ MUL t04,ALPHA,t04
|
|
+ MUL t05,ALPHA,t05
|
|
+
|
|
+ MUL t08,ALPHA,t08
|
|
+ MUL t09,ALPHA,t09
|
|
+
|
|
+ addl CO,2*SIZE,CO # 2c
|
|
+ addl C1,2*SIZE,C1
|
|
+ addl C2,2*SIZE,C2
|
|
+ addl C3,2*SIZE,C3
|
|
+
|
|
+ ST t00,-2*SIZE(CO) # 2c
|
|
+ ST t01,-1*SIZE(CO)
|
|
+
|
|
+ MUL t12,ALPHA,t12
|
|
+ MUL t13,ALPHA,t13
|
|
+
|
|
+ ST t04,-2*SIZE(C1)
|
|
+ ST t05,-1*SIZE(C1)
|
|
+
|
|
+ ST t08,-2*SIZE(C2)
|
|
+ ST t09,-1*SIZE(C2)
|
|
+
|
|
+ ST t12,-2*SIZE(C3)
|
|
+ ST t13,-1*SIZE(C3)
|
|
+
|
|
+#if ( defined(LEFT) && defined(TRANSA)) || \
|
|
+ (!defined(LEFT) && !defined(TRANSA))
|
|
+ subl KC1, KK, TEMP
|
|
+#ifdef LEFT
|
|
+ subl TEMP, 2, TEMP
|
|
+#else
|
|
+ subl TEMP, 4, TEMP
|
|
+#endif
|
|
+
|
|
+ sll TEMP, 1 + BASE_SHIFT,KC
|
|
+ sll TEMP, 2 + BASE_SHIFT,TEMP
|
|
+
|
|
+ addl A, KC, A
|
|
+ addl B, TEMP, B
|
|
+#endif
|
|
+
|
|
+#ifdef LEFT
|
|
+ addl KK,2,KK
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+
|
|
+
|
|
+.align 5
|
|
+.L18: # nr=4,mr=1---------------------------
|
|
+ and MC1,1,MC
|
|
+ beq MC,$End_NC_Unroll4
|
|
+
|
|
+#if defined(TRMMKERNEL)
|
|
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
|
+ mov B1, B
|
|
+ nop
|
|
+#else
|
|
+ sll KK, BASE_SHIFT, KC # mr=1
|
|
+ sll KK, 2 + BASE_SHIFT,TEMP # nr=4
|
|
+
|
|
+ addl A, KC, A
|
|
+ addl B1,TEMP, B
|
|
+#endif
|
|
+
|
|
+	fclr	t00			# clear 4 registers
|
|
+ fclr t04
|
|
+ fclr t08
|
|
+ fclr t12
|
|
+
|
|
+ LD b0,0(B) # get 4b
|
|
+ LD b1,1*SIZE(B)
|
|
+ LD b2,2*SIZE(B)
|
|
+ LD b3,3*SIZE(B)
|
|
+
|
|
+ LD a0,0(A) # get 1 a
|
|
+
|
|
+ fillcs 0(CO) # prefetch C
|
|
+ fillcs 0(C1)
|
|
+ fillcs 0(C2)
|
|
+ fillcs 0(C3)
|
|
+
|
|
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
|
+ subl KC1, KK, TEMP
|
|
+#elif defined(LEFT)
|
|
+ addl KK, 1, TEMP # mr=1
|
|
+#else
|
|
+ addl KK, 4,TEMP # nr=4
|
|
+#endif
|
|
+ sra TEMP,1,KC
|
|
+ beq KC,$Rest_1x4x1
|
|
+
|
|
+#else
|
|
+ mov B1,B # Reset B
|
|
+	fclr	t00			# clear 4 registers
|
|
+ fclr t04
|
|
+ fclr t08
|
|
+ fclr t12
|
|
+ sra KC1,1,KC # Unroll KC 2, KC=KC1/2
|
|
+
|
|
+ LD b0,0(B) # get 4b
|
|
+ LD b1,1*SIZE(B)
|
|
+ LD b2,2*SIZE(B)
|
|
+ LD b3,3*SIZE(B)
|
|
+
|
|
+ LD a0,0(A) # get 1 a
|
|
+
|
|
+ fillcs 0(CO) # prefetch C
|
|
+ fillcs 0(C1)
|
|
+ fillcs 0(C2)
|
|
+ fillcs 0(C3)
|
|
+
|
|
+ beq KC,$Rest_1x4x1
|
|
+
|
|
+#endif
|
|
+
|
|
+
|
|
+$Panel_1x4x2:
|
|
+ MAD a0,b0,t00,t00
|
|
+ MAD a0,b1,t04,t04
|
|
+ MAD a0,b2,t08,t08
|
|
+ MAD a0,b3,t12,t12
|
|
+
|
|
+ LD a8,1*SIZE(A)
|
|
+ LD nb0,4*SIZE(B)
|
|
+ LD nb1,5*SIZE(B)
|
|
+ LD nb2,6*SIZE(B)
|
|
+ LD nb3,7*SIZE(B)
|
|
+
|
|
+ addl B,8*SIZE,B # 4b*2k
|
|
+
|
|
+ subl KC,1,KC
|
|
+ MAD a8,nb0,t00,t00
|
|
+ MAD a8,nb1,t04,t04
|
|
+ MAD a8,nb2,t08,t08
|
|
+ MAD a8,nb3,t12,t12
|
|
+
|
|
+ addl A,2*SIZE,A # 1a*2k
|
|
+ LD a0,0(A) # get 3rd 4b and 1a
|
|
+ LD b0,0(B)
|
|
+ LD b1,1*SIZE(B)
|
|
+ LD b2,2*SIZE(B)
|
|
+ LD b3,3*SIZE(B)
|
|
+ bne KC,$Panel_1x4x2
|
|
+
|
|
+
|
|
+$Rest_1x4x1:
|
|
+ LD ALPHA,192($sp) # get alpha
|
|
+#ifndef TRMMKERNEL
|
|
+ blbc KC1, $Write_1x4
|
|
+#else
|
|
+ blbc TEMP, $Write_1x4
|
|
+#endif
|
|
+
|
|
+ addl A,1*SIZE,A # 1m*1k*8Byte
|
|
+ addl B,4*SIZE,B # 4n*1K*8Byte
|
|
+
|
|
+ MAD a0,b0,t00,t00
|
|
+ MAD a0,b1,t04,t04
|
|
+ MAD a0,b2,t08,t08
|
|
+ MAD a0,b3,t12,t12
|
|
+
|
|
+
|
|
+$Write_1x4:
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c00,0(CO)
|
|
+ LD c04,0(C1)
|
|
+ MAD t00,ALPHA,c00,t00
|
|
+ MAD t04,ALPHA,c04,t04
|
|
+ LD c08,0(C2)
|
|
+ LD c12,0(C3)
|
|
+ MAD t08,ALPHA,c08,t08
|
|
+ MAD t12,ALPHA,c12,t12
|
|
+ ST t00,0(CO)
|
|
+ ST t04,0(C1)
|
|
+ ST t08,0(C2)
|
|
+ ST t12,0(C3)
|
|
+
|
|
+#else
|
|
+ MUL t00,ALPHA,t00
|
|
+ MUL t04,ALPHA,t04
|
|
+ MUL t08,ALPHA,t08
|
|
+ MUL t12,ALPHA,t12
|
|
+
|
|
+ ST t00,0(CO)
|
|
+ ST t04,0(C1)
|
|
+ ST t08,0(C2)
|
|
+ ST t12,0(C3)
|
|
+
|
|
+#if ( defined(LEFT) && defined(TRANSA)) || \
|
|
+ (!defined(LEFT) && !defined(TRANSA))
|
|
+ subl KC1, KK, TEMP
|
|
+#ifdef LEFT
|
|
+ subl TEMP, 1, TEMP
|
|
+#else
|
|
+ subl TEMP, 4, TEMP
|
|
+#endif
|
|
+
|
|
+ sll TEMP, BASE_SHIFT, KC
|
|
+ sll TEMP, 2 + BASE_SHIFT, TEMP
|
|
+
|
|
+ addl A, KC, A
|
|
+ addl B, TEMP,B
|
|
+#endif
|
|
+
|
|
+#ifdef LEFT
|
|
+ addl KK, 1,KK
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+
|
|
+ .align 5
|
|
+
|
|
+$End_NC_Unroll4:
|
|
+ subl NC,1,NC # Loop N --
|
|
+#if defined(TRMMKERNEL) && !defined(LEFT)
|
|
+ addl KK, 4, KK
|
|
+ nop
|
|
+#endif
|
|
+ mov A1,A # Reset A
|
|
+ mov B, B1 # mov B1 to the next panel
|
|
+ bne NC,.L0
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+ .align 5
|
|
+$Begin_NC_Unroll2:
|
|
+
|
|
+ and NC1, 2, NC
|
|
+ beq NC, $Begin_NC_Unroll1
|
|
+
|
|
+#if defined(TRMMKERNEL) && defined(LEFT)
|
|
+ mov OFFSET, KK # reset KK
|
|
+#endif
|
|
+
|
|
+ mov C,CO
|
|
+ addl C,LDM,C1
|
|
+
|
|
+ sra MC1,4,MC # MC=MC1/16
|
|
+ sll KC1,4+BASE_SHIFT,SPANA # SPANA=KC1*MC
|
|
+
|
|
+ addl A1,SPANA,PREA
|
|
+ subl PREA,16*SIZE,PREA
|
|
+
|
|
+ addl C1,LDM,C # C=C1+LDM, Mov C to Next Panel
|
|
+ beq MC,.L25 # MC=0:MC1<16
|
|
+
|
|
+
|
|
+ .align 5
|
|
+.L2: # nr=2,mr=16-------------------
|
|
+
|
|
+#if defined(TRMMKERNEL)
|
|
+#if (defined(LEFT) && defined(TRANSA))\
|
|
+ || (!defined(LEFT) && !defined(TRANSA))
|
|
+ mov B1,B
|
|
+#else
|
|
+ sll KK, 4 + BASE_SHIFT,KC # mr=16
|
|
+ sll KK, 1 + BASE_SHIFT,TEMP # nr=2
|
|
+
|
|
+ addl A,KC,A
|
|
+ addl B1,TEMP,B
|
|
+#endif
|
|
+
|
|
+	vcpys	$f31,$f31,t00		# clear result registers
|
|
+ vcpys $f31,$f31,t01
|
|
+ vcpys $f31,$f31,t02
|
|
+ vcpys $f31,$f31,t03
|
|
+
|
|
+ LDDE b0,0(B)
|
|
+ LDDE b1,1*SIZE(B)
|
|
+
|
|
+ VLD a0,0(A) # Get 16 A and 2 B
|
|
+ VLD a4,4*SIZE(A)
|
|
+ VLD a8,8*SIZE(A)
|
|
+ VLD a12,12*SIZE(A)
|
|
+
|
|
+ vcpys $f31,$f31,t04
|
|
+ vcpys $f31,$f31,t06
|
|
+ vcpys $f31,$f31,t05
|
|
+ vcpys $f31,$f31,t07
|
|
+
|
|
+ fillcs 0(CO) # fetch C
|
|
+ fillcs 0(C1)
|
|
+ fillcs 8*SIZE(CO)
|
|
+ fillcs 8*SIZE(C1)
|
|
+
|
|
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
|
+ subl KC1, KK, TEMP
|
|
+#elif defined(LEFT)
|
|
+ addl KK, 16, TEMP # mr=16
|
|
+#else
|
|
+ addl KK, 2, TEMP # nr=2
|
|
+#endif
|
|
+ sra TEMP, 1, KC
|
|
+ nop
|
|
+ beq KC,$Rest_16x2x1
|
|
+
|
|
+#else
|
|
+
|
|
+ mov B1,B # Set B
|
|
+ sra KC1,1,KC # Unroll KC 2, KC=KC1/2
|
|
+	vcpys	$f31,$f31,t00		# clear result registers
|
|
+ vcpys $f31,$f31,t01
|
|
+ vcpys $f31,$f31,t02
|
|
+ vcpys $f31,$f31,t03
|
|
+
|
|
+ LDDE b0,0(B)
|
|
+ LDDE b1,1*SIZE(B)
|
|
+
|
|
+ VLD a0,0(A) # Get 16 A and 2 B
|
|
+ VLD a4,4*SIZE(A)
|
|
+ VLD a8,8*SIZE(A)
|
|
+ VLD a12,12*SIZE(A)
|
|
+
|
|
+ vcpys $f31,$f31,t04
|
|
+ vcpys $f31,$f31,t06
|
|
+ vcpys $f31,$f31,t05
|
|
+ vcpys $f31,$f31,t07
|
|
+
|
|
+ fillcs 0(CO) # fetch C
|
|
+ fillcs 0(C1)
|
|
+ fillcs 8*SIZE(CO)
|
|
+ fillcs 8*SIZE(C1)
|
|
+
|
|
+ beq KC,$Rest_16x2x1
|
|
+
|
|
+#endif
|
|
+
|
|
+
|
|
+$Panel_16x2x2:
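+# K loop, unrolled by 2: 4 A vectors (16 elements) times 2 B values per k step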
|
|
+ VMAD a0,b0,t00,t00
|
|
+ VMAD a0,b1,t04,t04
|
|
+
|
|
+ addl A,16*SIZE,A # 16m*1k
|
|
+ LDDE nb0,2*SIZE(B)
|
|
+ LDDE nb1,3*SIZE(B)
|
|
+
|
|
+ VMAD a4,b0,t01,t01
|
|
+ VMAD a4,b1,t05,t05
|
|
+
|
|
+ addl B,4*SIZE,B # 2n*2k
|
|
+ VLD na0,0(A)
|
|
+ VLD na4,4*SIZE(A)
|
|
+ VLD na8,8*SIZE(A)
|
|
+ VLD na12,12*SIZE(A)
|
|
+
|
|
+ VMAD a8,b0,t02,t02
|
|
+ VMAD a8,b1,t06,t06
|
|
+
|
|
+ VMAD a12,b0,t03,t03
|
|
+ VMAD a12,b1,t07,t07
|
|
+
|
|
+ fillcs 0(PREA)
|
|
+ fillcs 8*SIZE(PREA)
|
|
+ subl PREA,16*SIZE,PREA
|
|
+
|
|
+ subl KC,1,KC
|
|
+ VMAD na0,nb0,t00,t00
|
|
+ VMAD na0,nb1,t04,t04
|
|
+
|
|
+ addl A,16*SIZE,A # 16m*1k
|
|
+ LDDE b0,0(B)
|
|
+ LDDE b1,1*SIZE(B)
|
|
+
|
|
+ VMAD na4,nb0,t01,t01
|
|
+ VMAD na4,nb1,t05,t05
|
|
+
|
|
+ VLD a0,0(A) # get 3rd 16a
|
|
+ VLD a4,4*SIZE(A)
|
|
+ VLD a8,8*SIZE(A)
|
|
+ VLD a12,12*SIZE(A)
|
|
+
|
|
+ VMAD na8,nb0,t02,t02
|
|
+ VMAD na8,nb1,t06,t06
|
|
+
|
|
+ VMAD na12,nb0,t03,t03
|
|
+ VMAD na12,nb1,t07,t07
|
|
+
|
|
+ fillcs 0(PREA)
|
|
+ fillcs 8*SIZE(PREA)
|
|
+ subl PREA,16*SIZE,PREA
|
|
+ bne KC,$Panel_16x2x2
|
|
+
|
|
+
|
|
+$Rest_16x2x1:
|
|
+ LDDE ALPHA, 192($sp) # get alpha
|
|
+#ifndef TRMMKERNEL
|
|
+ blbc KC1, $Write_16x2
|
|
+#else
|
|
+ blbc TEMP, $Write_16x2
|
|
+#endif
|
|
+
|
|
+ addl A,16*SIZE,A # 16m*1k
|
|
+ addl B,2*SIZE,B # 2n*1k
|
|
+
|
|
+ VMAD a0,b0,t00,t00
|
|
+ VMAD a0,b1,t04,t04
|
|
+
|
|
+ fillcs 0(PREA)
|
|
+ fillcs 8*SIZE(PREA)
|
|
+ subl PREA,16*SIZE,PREA
|
|
+
|
|
+ VMAD a4,b0,t01,t01
|
|
+ VMAD a4,b1,t05,t05
|
|
+ VMAD a8,b0,t02,t02
|
|
+ VMAD a8,b1,t06,t06
|
|
+ VMAD a12,b0,t03,t03
|
|
+ VMAD a12,b1,t07,t07
|
|
+
|
|
+
|
|
+$Write_16x2:
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ and CO, (VEC_LEN*SIZE-1),$6
|
|
+ bne $6,$UnAlign_CO_Access_16x2
|
|
+
|
|
+$Align_CO_Access_16x2:
|
|
+	VLD	c00,0(CO)		# get 1st column of C
|
|
+ VLD c01,4*SIZE(CO)
|
|
+ VLD c02,8*SIZE(CO)
|
|
+ VLD c03,12*SIZE(CO)
|
|
+
|
|
+ VMAD t00,ALPHA,c00,t00
|
|
+ VMAD t01,ALPHA,c01,t01
|
|
+ VMAD t02,ALPHA,c02,t02
|
|
+ VMAD t03,ALPHA,c03,t03
|
|
+
|
|
+ VST t00,0(CO)
|
|
+ VST t01,4*SIZE(CO)
|
|
+ VST t02,8*SIZE(CO)
|
|
+ VST t03,12*SIZE(CO)
|
|
+ jmp $Access_C1_16x2
|
|
+
|
|
+$UnAlign_CO_Access_16x2:
|
|
+ VLD_UL c00, 0*VEC_LEN*SIZE(CO)
|
|
+ VLD_UH c04, 1*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ VLD_UL c01, 1*VEC_LEN*SIZE(CO)
|
|
+ VLD_UH c05, 2*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ VLD_UL c02, 2*VEC_LEN*SIZE(CO)
|
|
+ VLD_UH c06, 3*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ VLD_UL c03, 3*VEC_LEN*SIZE(CO)
|
|
+ VLD_UH c07, 4*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ vbisw c00,c04,c00
|
|
+ vbisw c01,c05,c01
|
|
+ vbisw c02,c06,c02
|
|
+ vbisw c03,c07,c03
|
|
+
|
|
+ VMAD t00,ALPHA,c00,t00
|
|
+ VMAD t01,ALPHA,c01,t01
|
|
+ VMAD t02,ALPHA,c02,t02
|
|
+ VMAD t03,ALPHA,c03,t03
|
|
+
|
|
+ VST_UL t00, 0*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t00, 1*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ VST_UL t01, 1*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t01, 2*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ VST_UL t02, 2*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t02, 3*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ VST_UL t03, 3*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t03, 4*VEC_LEN*SIZE(CO)
|
|
+
|
|
+
|
|
+$Access_C1_16x2:
|
|
+ and C1, (VEC_LEN*SIZE-1),$6
|
|
+ bne $6,$UnAlign_C1_Access_16x2
|
|
+
|
|
+$Align_C1_Access_16x2:
|
|
+ VLD c04,0(C1)
|
|
+ VLD c05,4*SIZE(C1)
|
|
+ VLD c06,8*SIZE(C1)
|
|
+ VLD c07,12*SIZE(C1)
|
|
+
|
|
+ VMAD t04,ALPHA,c04,t04
|
|
+ VMAD t05,ALPHA,c05,t05
|
|
+ VMAD t06,ALPHA,c06,t06
|
|
+ VMAD t07,ALPHA,c07,t07
|
|
+
|
|
+ VST t04,0(C1)
|
|
+ VST t05,4*SIZE(C1)
|
|
+ VST t06,8*SIZE(C1)
|
|
+ VST t07,12*SIZE(C1)
|
|
+ jmp $End_NC_Unroll2
|
|
+
|
|
+$UnAlign_C1_Access_16x2:
|
|
+ VLD_UL c04, 0*VEC_LEN*SIZE(C1)
|
|
+ VLD_UH t00, 1*VEC_LEN*SIZE(C1)
|
|
+
|
|
+ VLD_UL c05, 1*VEC_LEN*SIZE(C1)
|
|
+ VLD_UH t01, 2*VEC_LEN*SIZE(C1)
|
|
+
|
|
+ VLD_UL c06, 2*VEC_LEN*SIZE(C1)
|
|
+ VLD_UH t02, 3*VEC_LEN*SIZE(C1)
|
|
+
|
|
+ VLD_UL c07, 3*VEC_LEN*SIZE(C1)
|
|
+ VLD_UH t03, 4*VEC_LEN*SIZE(C1)
|
|
+
|
|
+ vbisw c04,t00,c04
|
|
+ vbisw c05,t01,c05
|
|
+ vbisw c06,t02,c06
|
|
+ vbisw c07,t03,c07
|
|
+
|
|
+ VMAD t04,ALPHA,c04,t04
|
|
+ VMAD t05,ALPHA,c05,t05
|
|
+ VMAD t06,ALPHA,c06,t06
|
|
+ VMAD t07,ALPHA,c07,t07
|
|
+
|
|
+ VST_UL t04, 0*VEC_LEN*SIZE(C1)
|
|
+ VST_UH t04, 1*VEC_LEN*SIZE(C1)
|
|
+
|
|
+ VST_UL t05, 1*VEC_LEN*SIZE(C1)
|
|
+ VST_UH t05, 2*VEC_LEN*SIZE(C1)
|
|
+
|
|
+ VST_UL t06, 2*VEC_LEN*SIZE(C1)
|
|
+ VST_UH t06, 3*VEC_LEN*SIZE(C1)
|
|
+
|
|
+ VST_UL t07, 3*VEC_LEN*SIZE(C1)
|
|
+ VST_UH t07, 4*VEC_LEN*SIZE(C1)
|
|
+ jmp $End_NC_Unroll2 # loop m finished
|
|
+
|
|
+
|
|
+#else
|
|
+
|
|
+ and CO, (VEC_LEN*SIZE-1),$6
|
|
+ bne $6,$UnAlign_CO_Access_16x2
|
|
+
|
|
+$Align_CO_Access_16x2:
|
|
+ VMUL t00,ALPHA,t00
|
|
+ VMUL t01,ALPHA,t01
|
|
+ VMUL t02,ALPHA,t02
|
|
+ VMUL t03,ALPHA,t03
|
|
+
|
|
+ VST t00,0(CO)
|
|
+ VST t01,4*SIZE(CO)
|
|
+ VST t02,8*SIZE(CO)
|
|
+ VST t03,12*SIZE(CO)
|
|
+ jmp $Access_C1_16x2
|
|
+
|
|
+$UnAlign_CO_Access_16x2:
|
|
+ VMUL t00,ALPHA,t00
|
|
+ VMUL t01,ALPHA,t01
|
|
+ VMUL t02,ALPHA,t02
|
|
+ VMUL t03,ALPHA,t03
|
|
+
|
|
+ VST_UL t00, 0*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t00, 1*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ VST_UL t01, 1*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t01, 2*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ VST_UL t02, 2*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t02, 3*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ VST_UL t03, 3*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t03, 4*VEC_LEN*SIZE(CO)
|
|
+
|
|
+
|
|
+$Access_C1_16x2:
|
|
+ and C1, (VEC_LEN*SIZE-1),$6
|
|
+ bne $6,$UnAlign_C1_Access_16x2
|
|
+
|
|
+$Align_C1_Access_16x2:
|
|
+ VMUL t04,ALPHA,t04
|
|
+ VMUL t05,ALPHA,t05
|
|
+ VMUL t06,ALPHA,t06
|
|
+ VMUL t07,ALPHA,t07
|
|
+
|
|
+ VST t04,0(C1)
|
|
+ VST t05,4*SIZE(C1)
|
|
+ VST t06,8*SIZE(C1)
|
|
+ VST t07,12*SIZE(C1)
|
|
+ jmp $TRMMKERNEL_16x2
|
|
+
|
|
+$UnAlign_C1_Access_16x2:
|
|
+ VMUL t04,ALPHA,t04
|
|
+ VMUL t05,ALPHA,t05
|
|
+ VMUL t06,ALPHA,t06
|
|
+ VMUL t07,ALPHA,t07
|
|
+
|
|
+ VST_UL t04, 0*VEC_LEN*SIZE(C1)
|
|
+ VST_UH t04, 1*VEC_LEN*SIZE(C1)
|
|
+
|
|
+ VST_UL t05, 1*VEC_LEN*SIZE(C1)
|
|
+ VST_UH t05, 2*VEC_LEN*SIZE(C1)
|
|
+
|
|
+ VST_UL t06, 2*VEC_LEN*SIZE(C1)
|
|
+ VST_UH t06, 3*VEC_LEN*SIZE(C1)
|
|
+
|
|
+ VST_UL t07, 3*VEC_LEN*SIZE(C1)
|
|
+ VST_UH t07, 4*VEC_LEN*SIZE(C1)
|
|
+
|
|
+$TRMMKERNEL_16x2:
|
|
+#if ( defined(LEFT) && defined(TRANSA)) || \
|
|
+ (!defined(LEFT) && !defined(TRANSA))
|
|
+ subl KC1, KK, TEMP
|
|
+#ifdef LEFT
|
|
+ subl TEMP, 16, TEMP
|
|
+#else
|
|
+ subl TEMP, 2, TEMP
|
|
+#endif
|
|
+
|
|
+ sll TEMP, 4 + BASE_SHIFT,KC
|
|
+ sll TEMP, 1 + BASE_SHIFT,TEMP
|
|
+
|
|
+ addl A, KC, A
|
|
+ addl B, TEMP,B
|
|
+#endif
|
|
+
|
|
+#ifdef LEFT
|
|
+ addl KK, 16, KK
|
|
+ nop
|
|
+#endif
|
|
+
|
|
+ jmp $End_NC_Unroll2 # loop m finished
|
|
+#endif
|
|
+
|
|
+
|
|
+
|
|
+ .align 5
|
|
+
|
|
+.L25:
|
|
+ and MC1,8,MC
|
|
+ sll KC1,3+BASE_SHIFT,SPANA # spana=kc1*mc
|
|
+ nop
|
|
+ beq MC,.L26
|
|
+
|
|
+ addl A1,SPANA,PREA
|
|
+ subl PREA,8*SIZE,PREA # PREA-=MC
|
|
+
|
|
+
|
|
+ .align 5
|
|
+
|
|
+#if defined(TRMMKERNEL)
|
|
+#if (defined(LEFT) && defined(TRANSA))\
|
|
+ || (!defined(LEFT) && !defined(TRANSA))
|
|
+ mov B1, B
|
|
+#else
|
|
+ sll KK, 3 + BASE_SHIFT,KC # mr=8
|
|
+ sll KK, 1 + BASE_SHIFT,TEMP # nr=2
|
|
+
|
|
+ addl A,KC, A
|
|
+ addl B1,TEMP,B
|
|
+#endif
|
|
+
|
|
+	vcpys	$f31,$f31,t00		# clear 4 vector registers (16 results)
|
|
+ vcpys $f31,$f31,t01
|
|
+
|
|
+ LDDE b0,0(B) # Get 2b
|
|
+ LDDE b1,1*SIZE(B)
|
|
+
|
|
+ vcpys $f31,$f31,t04
|
|
+ vcpys $f31,$f31,t05
|
|
+
|
|
+ VLD a0,0(A) # Get 8a
|
|
+ VLD a4,4*SIZE(A)
|
|
+
|
|
+ fillcs 0(CO) # fetch C
|
|
+ fillcs 0(C1)
|
|
+ fillcs 4*SIZE(CO)
|
|
+ fillcs 4*SIZE(C1)
|
|
+
|
|
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
|
+ subl KC1, KK, TEMP
|
|
+#elif defined(LEFT)
|
|
+ addl KK, 8, TEMP # mr=8
|
|
+#else
|
|
+ addl KK, 2, TEMP # nr=2
|
|
+#endif
|
|
+ sra TEMP, 1,KC
|
|
+ nop
|
|
+ beq KC,$Rest_8x2x1
|
|
+
|
|
+#else
|
|
+
|
|
+ mov B1, B
|
|
+ sra KC1,1,KC
|
|
+	vcpys	$f31,$f31,t00		# clear 4 vector registers (16 results)
|
|
+ vcpys $f31,$f31,t01
|
|
+
|
|
+ LDDE b0,0(B) # Get 2b
|
|
+ LDDE b1,1*SIZE(B)
|
|
+
|
|
+ vcpys $f31,$f31,t04
|
|
+ vcpys $f31,$f31,t05
|
|
+
|
|
+ VLD a0,0(A) # Get 8a
|
|
+ VLD a4,4*SIZE(A)
|
|
+
|
|
+ fillcs 0(CO) # fetch C
|
|
+ fillcs 0(C1)
|
|
+ fillcs 4*SIZE(CO)
|
|
+ fillcs 4*SIZE(C1)
|
|
+
|
|
+ beq KC,$Rest_8x2x1
|
|
+#endif
|
|
+
|
|
+
|
|
+$Panel_8x2x2:
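+# K loop, unrolled by 2: 2 A vectors (8 elements) times 2 B values per k step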
|
|
+ VMAD a0,b0,t00,t00
|
|
+ VMAD a0,b1,t04,t04
|
|
+
|
|
+ LDDE nb0,2*SIZE(B) # get next 2b
|
|
+ LDDE nb1,3*SIZE(B)
|
|
+
|
|
+ VMAD a4,b0,t01,t01
|
|
+ VMAD a4,b1,t05,t05
|
|
+
|
|
+ addl B,4*SIZE,B # 2n*2k
|
|
+ VLD na8,8*SIZE(A) # get next 8a
|
|
+ VLD na12,12*SIZE(A)
|
|
+
|
|
+ fillcs 0(PREA)
|
|
+ fillcs 4*SIZE(PREA)
|
|
+ subl PREA,8*SIZE,PREA
|
|
+
|
|
+ subl KC,1,KC
|
|
+ VMAD na8,nb0,t00,t00
|
|
+ VMAD na8,nb1,t04,t04
|
|
+
|
|
+ addl A,16*SIZE,A # 8m*2k
|
|
+ LDDE b0,0(B)
|
|
+ LDDE b1,1*SIZE(B) # get 3rd 2b
|
|
+
|
|
+ VMAD na12,nb0,t01,t01
|
|
+ VMAD na12,nb1,t05,t05
|
|
+
|
|
+ VLD a0,0(A) # get 3rd 8a
|
|
+ VLD a4,4*SIZE(A)
|
|
+
|
|
+ fillcs 0(PREA)
|
|
+ fillcs 4*SIZE(PREA)
|
|
+ subl PREA,8*SIZE,PREA
|
|
+ bne KC,$Panel_8x2x2
|
|
+
|
|
+
|
|
+$Rest_8x2x1:
|
|
+ LDDE ALPHA,192($sp) # get alpha
|
|
+#ifndef TRMMKERNEL
|
|
+ blbc KC1,$Write_8x2
|
|
+#else
|
|
+ blbc TEMP,$Write_8x2
|
|
+#endif
|
|
+
|
|
+ addl A,8*SIZE,A # 8m*1k
|
|
+ addl B,2*SIZE,B # 2n*1K
|
|
+
|
|
+ fillcs 0(PREA)
|
|
+ fillcs 4*SIZE(PREA)
|
|
+ subl PREA,8*SIZE,PREA
|
|
+
|
|
+ VMAD a0,b0,t00,t00
|
|
+ VMAD a0,b1,t04,t04
|
|
+ VMAD a4,b0,t01,t01
|
|
+ VMAD a4,b1,t05,t05
|
|
+
|
|
+
|
|
+$Write_8x2:
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ and CO, (VEC_LEN*SIZE-1),$6
|
|
+ bne $6,$UnAlign_CO_Access_8x2
|
|
+
|
|
+$Align_CO_Access_8x2:
|
|
+	VLD	c00,0(CO)		# get 1st column of C
|
|
+ VLD c01,4*SIZE(CO)
|
|
+
|
|
+ VMAD t00,ALPHA,c00,t00
|
|
+ VMAD t01,ALPHA,c01,t01
|
|
+
|
|
+ VST t00,0(CO)
|
|
+ VST t01,4*SIZE(CO)
|
|
+ jmp $Access_C1_8x2
|
|
+
|
|
+$UnAlign_CO_Access_8x2:
|
|
+ VLD_UL c00, 0*VEC_LEN*SIZE(CO)
|
|
+ VLD_UH c02, 1*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ VLD_UL c01, 1*VEC_LEN*SIZE(CO)
|
|
+ VLD_UH c03, 2*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ vbisw c00,c02,c00
|
|
+ vbisw c01,c03,c01
|
|
+
|
|
+ VMAD t00,ALPHA,c00,t00
|
|
+ VMAD t01,ALPHA,c01,t01
|
|
+
|
|
+ VST_UL t00, 0*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t00, 1*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ VST_UL t01, 1*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t01, 2*VEC_LEN*SIZE(CO)
|
|
+
|
|
+
|
|
+$Access_C1_8x2:
|
|
+ and C1, (VEC_LEN*SIZE-1),$6
|
|
+ addl CO,8*SIZE,CO # 8c
|
|
+ nop
|
|
+ bne $6,$UnAlign_C1_Access_8x2
|
|
+
|
|
+$Align_C1_Access_8x2:
|
|
+ VLD c04,0(C1)
|
|
+ VLD c05,4*SIZE(C1)
|
|
+
|
|
+ VMAD t04,ALPHA,c04,t04
|
|
+ VMAD t05,ALPHA,c05,t05
|
|
+
|
|
+ VST t04,0(C1)
|
|
+ VST t05,4*SIZE(C1)
|
|
+ addl C1,8*SIZE,C1
|
|
+ jmp .L26
|
|
+
|
|
+$UnAlign_C1_Access_8x2:
|
|
+ VLD_UL c04, 0*VEC_LEN*SIZE(C1)
|
|
+ VLD_UH c06, 1*VEC_LEN*SIZE(C1)
|
|
+
|
|
+ VLD_UL c05, 1*VEC_LEN*SIZE(C1)
|
|
+ VLD_UH c07, 2*VEC_LEN*SIZE(C1)
|
|
+
|
|
+ vbisw c04,c06,c04
|
|
+ vbisw c05,c07,c05
|
|
+
|
|
+ VMAD t04,ALPHA,c04,t04
|
|
+ VMAD t05,ALPHA,c05,t05
|
|
+
|
|
+ VST_UL t04, 0*VEC_LEN*SIZE(C1)
|
|
+ VST_UH t04, 1*VEC_LEN*SIZE(C1)
|
|
+
|
|
+ VST_UL t05, 1*VEC_LEN*SIZE(C1)
|
|
+ VST_UH t05, 2*VEC_LEN*SIZE(C1)
|
|
+ addl C1,8*SIZE,C1
|
|
+
|
|
+#else
|
|
+
|
|
+ and CO, (VEC_LEN*SIZE-1),$6
|
|
+ bne $6,$UnAlign_CO_Access_8x2
|
|
+
|
|
+$Align_CO_Access_8x2:
|
|
+ VMUL t00,ALPHA,t00
|
|
+ VMUL t01,ALPHA,t01
|
|
+
|
|
+ VST t00,0(CO)
|
|
+ VST t01,4*SIZE(CO)
|
|
+ jmp $Access_C1_8x2
|
|
+
|
|
+$UnAlign_CO_Access_8x2:
|
|
+ VMUL t00,ALPHA,t00
|
|
+ VMUL t01,ALPHA,t01
|
|
+
|
|
+ VST_UL t00, 0*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t00, 1*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ VST_UL t01, 1*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t01, 2*VEC_LEN*SIZE(CO)
|
|
+
|
|
+
|
|
+$Access_C1_8x2:
|
|
+ and C1, (VEC_LEN*SIZE-1),$6
|
|
+ addl CO,8*SIZE,CO # 8c
|
|
+ nop
|
|
+ bne $6,$UnAlign_C1_Access_8x2
|
|
+
|
|
+$Align_C1_Access_8x2:
|
|
+ VMUL t04,ALPHA,t04
|
|
+ VMUL t05,ALPHA,t05
|
|
+
|
|
+ VST t04,0(C1)
|
|
+ VST t05,4*SIZE(C1)
|
|
+ addl C1,8*SIZE,C1
|
|
+ jmp $TRMMKERNEL_8x2
|
|
+
|
|
+$UnAlign_C1_Access_8x2:
|
|
+ VMUL t04,ALPHA,t04
|
|
+ VMUL t05,ALPHA,t05
|
|
+
|
|
+ VST_UL t04, 0*VEC_LEN*SIZE(C1)
|
|
+ VST_UH t04, 1*VEC_LEN*SIZE(C1)
|
|
+
|
|
+ VST_UL t05, 1*VEC_LEN*SIZE(C1)
|
|
+ VST_UH t05, 2*VEC_LEN*SIZE(C1)
|
|
+ addl C1,8*SIZE,C1
|
|
+
|
|
+$TRMMKERNEL_8x2:
|
|
+
|
|
+#if ( defined(LEFT) && defined(TRANSA)) || \
|
|
+ (!defined(LEFT) && !defined(TRANSA))
|
|
+ subl KC1, KK,TEMP
|
|
+#ifdef LEFT
|
|
+ subl TEMP, 8,TEMP # mr=8
|
|
+#else
|
|
+ subl TEMP, 2,TEMP # nr=2
|
|
+#endif
|
|
+
|
|
+ sll TEMP, 3 + BASE_SHIFT,KC
|
|
+ sll TEMP, 1 + BASE_SHIFT,TEMP
|
|
+
|
|
+ addl A,KC,A
|
|
+ addl B,TEMP,B
|
|
+#endif
|
|
+
|
|
+#ifdef LEFT
|
|
+ addl KK,8,KK
|
|
+ nop
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+
|
|
+
|
|
+ .align 5
|
|
+
|
|
+.L26: # nr=2,mr=4------------------
|
|
+ and MC1,4,MC # MC1&4
|
|
+ beq MC,.L27
|
|
+
|
|
+#if defined(TRMMKERNEL)
|
|
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
|
+ mov B1, B
|
|
+ nop
|
|
+#else
|
|
+ sll KK, 2 + BASE_SHIFT,KC # mr=4
|
|
+ sll KK, 1 + BASE_SHIFT,TEMP # nr=2
|
|
+
|
|
+ addl A,KC,A
|
|
+ addl B1,TEMP,B
|
|
+#endif
|
|
+
|
|
+	vcpys	$f31,$f31,t00		# clear 2 vector registers
|
|
+ vcpys $f31,$f31,t04
|
|
+
|
|
+ LDDE b0,0(B) # get 2b
|
|
+ LDDE b1,1*SIZE(B)
|
|
+
|
|
+ VLD a0,0(A) # Get 4 a
|
|
+
|
|
+ fillcs 0(CO) # fetch C
|
|
+ fillcs 0(C1)
|
|
+
|
|
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
|
+ subl KC1, KK, TEMP
|
|
+#elif defined(LEFT)
|
|
+ addl KK, 4, TEMP
|
|
+#else
|
|
+ addl KK, 2, TEMP
|
|
+#endif
|
|
+ sra TEMP,1,KC
|
|
+ beq KC,$Rest_4x2x1
|
|
+
|
|
+#else
|
|
+
|
|
+ mov B1,B
|
|
+ sra KC1,1,KC
|
|
+	vcpys	$f31,$f31,t00		# clear 2 vector registers
|
|
+ vcpys $f31,$f31,t04
|
|
+
|
|
+ LDDE b0,0(B) # get 2b
|
|
+ LDDE b1,1*SIZE(B)
|
|
+
|
|
+ VLD a0,0(A) # Get 4 a
|
|
+
|
|
+ fillcs 0(CO) # fetch C
|
|
+ fillcs 0(C1)
|
|
+
|
|
+ beq KC,$Rest_4x2x1
|
|
+#endif
|
|
+
|
|
+$Panel_4x2x2:
|
|
+ VMAD a0,b0,t00,t00
|
|
+ VMAD a0,b1,t04,t04
|
|
+
|
|
+ LDDE nb0,2*SIZE(B) # get next 2b
|
|
+ LDDE nb1,3*SIZE(B)
|
|
+
|
|
+ addl B,4*SIZE,B # 2n*2K
|
|
+ VLD a4,4*SIZE(A) # get next 4a
|
|
+
|
|
+ subl KC,1,KC
|
|
+ VMAD a4,nb0,t00,t00
|
|
+ VMAD a4,nb1,t04,t04
|
|
+
|
|
+ addl A,8*SIZE,A # 4m*2k
|
|
+ LDDE b0,0(B) # get 3rd 2b
|
|
+ LDDE b1,1*SIZE(B)
|
|
+
|
|
+ VLD a0,0(A) # get 3rd 4a
|
|
+ bne KC,$Panel_4x2x2
|
|
+
|
|
+
|
|
+$Rest_4x2x1:
|
|
+ LDDE ALPHA,192($sp) # get alpha
|
|
+#ifndef TRMMKERNEL
|
|
+ blbc KC1,$Write_4x2
|
|
+#else
|
|
+ blbc TEMP,$Write_4x2
|
|
+#endif
|
|
+
|
|
+ addl A,4*SIZE,A # 4m*1k
|
|
+ addl B,2*SIZE,B # 2n*1K
|
|
+
|
|
+ VMAD a0,b0,t00,t00
|
|
+ VMAD a0,b1,t04,t04
|
|
+
|
|
+
|
|
+$Write_4x2:
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ and CO, (VEC_LEN*SIZE-1),$6
|
|
+ bne $6,$UnAlign_CO_Access_4x2
|
|
+
|
|
+$Align_CO_Access_4x2:
|
|
+	VLD	c00,0(CO)		# get 1st column of C
|
|
+ VMAD t00,ALPHA,c00,t00
|
|
+ VST t00,0(CO)
|
|
+ jmp $Access_C1_4x2
|
|
+
|
|
+$UnAlign_CO_Access_4x2:
|
|
+ VLD_UL c00, 0*VEC_LEN*SIZE(CO)
|
|
+ VLD_UH c01, 1*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ vbisw c00,c01,c00
|
|
+
|
|
+ VMAD t00,ALPHA,c00,t00
|
|
+
|
|
+ VST_UL t00, 0*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t00, 1*VEC_LEN*SIZE(CO)
|
|
+
|
|
+
|
|
+$Access_C1_4x2:
|
|
+ and C1, (VEC_LEN*SIZE-1),$6
|
|
+ addl CO,4*SIZE,CO # 4c
|
|
+ nop
|
|
+ bne $6,$UnAlign_C1_Access_4x2
|
|
+
|
|
+$Align_C1_Access_4x2:
|
|
+ VLD c04,0(C1)
|
|
+ VMAD t04,ALPHA,c04,t04
|
|
+ VST t04,0(C1)
|
|
+ addl C1,4*SIZE,C1
|
|
+ jmp .L27
|
|
+
|
|
+$UnAlign_C1_Access_4x2:
|
|
+ VLD_UL c04, 0*VEC_LEN*SIZE(C1)
|
|
+ VLD_UH c05, 1*VEC_LEN*SIZE(C1)
|
|
+
|
|
+ vbisw c04,c05,c04
|
|
+
|
|
+ VMAD t04,ALPHA,c04,t04
|
|
+
|
|
+ VST_UL t04, 0*VEC_LEN*SIZE(C1)
|
|
+ VST_UH t04, 1*VEC_LEN*SIZE(C1)
|
|
+ addl C1,4*SIZE,C1
|
|
+
|
|
+#else
|
|
+
|
|
+ and CO, (VEC_LEN*SIZE-1),$6
|
|
+ bne $6,$UnAlign_CO_Access_4x2
|
|
+
|
|
+$Align_CO_Access_4x2:
|
|
+ VMUL t00,ALPHA,t00
|
|
+ VST t00,0(CO)
|
|
+ jmp $Access_C1_4x2
|
|
+
|
|
+$UnAlign_CO_Access_4x2:
|
|
+ VMUL t00,ALPHA,t00
|
|
+ VST_UL t00, 0*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t00, 1*VEC_LEN*SIZE(CO)
|
|
+
|
|
+
|
|
+$Access_C1_4x2:
|
|
+ and C1, (VEC_LEN*SIZE-1),$6
|
|
+ addl CO,4*SIZE,CO # 4c
|
|
+ nop
|
|
+ bne $6,$UnAlign_C1_Access_4x2
|
|
+
|
|
+$Align_C1_Access_4x2:
|
|
+ VMUL t04,ALPHA,t04
|
|
+ VST t04,0(C1)
|
|
+ addl C1,4*SIZE,C1
|
|
+ jmp $TRMMKERNEL_4x2
|
|
+
|
|
+$UnAlign_C1_Access_4x2:
|
|
+ VMUL t04,ALPHA,t04
|
|
+ VST_UL t04, 0*VEC_LEN*SIZE(C1)
|
|
+ VST_UH t04, 1*VEC_LEN*SIZE(C1)
|
|
+ addl C1,4*SIZE,C1
|
|
+
|
|
+$TRMMKERNEL_4x2:
|
|
+#if ( defined(LEFT) && defined(TRANSA)) || \
|
|
+ (!defined(LEFT) && !defined(TRANSA))
|
|
+ subl KC1, KK, TEMP
|
|
+#ifdef LEFT
|
|
+ subl TEMP, 4, TEMP
|
|
+#else
|
|
+ subl TEMP, 2, TEMP
|
|
+#endif
|
|
+
|
|
+ sll TEMP, 2 + BASE_SHIFT,KC
|
|
+ sll TEMP, 1 + BASE_SHIFT,TEMP
|
|
+
|
|
+ addl A, KC, A
|
|
+ addl B, TEMP, B
|
|
+#endif
|
|
+
|
|
+#ifdef LEFT
|
|
+ addl KK, 4, KK
|
|
+ nop
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+
|
|
+
|
|
+ .align 5
|
|
+
|
|
+.L27: # nr=2,mr=2--------------
|
|
+ and MC1,2,MC
|
|
+ beq MC,.L28
|
|
+
|
|
+#if defined(TRMMKERNEL)
|
|
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
|
+ mov B1, B
|
|
+#else
|
|
+ sll KK, 1 + BASE_SHIFT,KC # mr=nr=2
|
|
+ nop
|
|
+ addl A,KC,A
|
|
+ addl B1,KC,B
|
|
+#endif
|
|
+
|
|
+	fclr	t00			# clear 4 registers
|
|
+ fclr t01
|
|
+ fclr t04
|
|
+ fclr t05
|
|
+
|
|
+ LD b0,0(B) # get 2b
|
|
+ LD b1,1*SIZE(B)
|
|
+
|
|
+ LD a0,0(A) # get 2a
|
|
+ LD a4,1*SIZE(A)
|
|
+
|
|
+ fillcs 0(CO) # fetch C
|
|
+ fillcs 0(C1)
|
|
+
|
|
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
|
+ subl KC1, KK, TEMP
|
|
+#else
|
|
+ addl KK, 2, TEMP # mr=nr=2
|
|
+#endif
|
|
+ sra TEMP,1, KC
|
|
+ nop
|
|
+ nop
|
|
+ beq KC,$Rest_2x2x1
|
|
+
|
|
+#else
|
|
+
|
|
+ mov B1,B # Reset B
|
|
+ sra KC1,1,KC # Unroll KC 2, KC=KC1/2
|
|
+	fclr	t00			# clear 4 registers
|
|
+ fclr t01
|
|
+ fclr t04
|
|
+ fclr t05
|
|
+
|
|
+ LD b0,0(B) # get 2b
|
|
+ LD b1,1*SIZE(B)
|
|
+
|
|
+ LD a0,0(A) # get 2a
|
|
+ LD a4,1*SIZE(A)
|
|
+
|
|
+ fillcs 0(CO) # fetch C
|
|
+ fillcs 0(C1)
|
|
+ beq KC,$Rest_2x2x1
|
|
+
|
|
+#endif
|
|
+
|
|
+
|
|
+$Panel_2x2x2:
|
|
+ MAD a0,b0,t00,t00
|
|
+ MAD a0,b1,t04,t04
|
|
+
|
|
+ LD nb0,2*SIZE(B) # get next 2b
|
|
+ LD nb1,3*SIZE(B)
|
|
+
|
|
+ MAD a4,b0,t01,t01
|
|
+ MAD a4,b1,t05,t05
|
|
+
|
|
+ addl B,4*SIZE,B # 2(n)*2(k)
|
|
+ LD a8,2*SIZE(A) # get next 2a
|
|
+ LD a12,3*SIZE(A)
|
|
+
|
|
+ subl KC,1,KC
|
|
+ MAD a8,nb0,t00,t00
|
|
+ MAD a8,nb1,t04,t04
|
|
+
|
|
+ addl A,4*SIZE,A # 2m*2k
|
|
+ LD b0,0(B)
|
|
+ LD b1,1*SIZE(B)
|
|
+
|
|
+ MAD a12,nb0,t01,t01
|
|
+ MAD a12,nb1,t05,t05
|
|
+
|
|
+ LD a0,0(A)
|
|
+ LD a4,1*SIZE(A)
|
|
+ bne KC,$Panel_2x2x2
|
|
+
|
|
+
|
|
+$Rest_2x2x1:
|
|
+ LD ALPHA,192($sp) # Get ALPHA
|
|
+#ifndef TRMMKERNEL
|
|
+ blbc KC1,$Write_2x2
|
|
+#else
|
|
+ blbc TEMP,$Write_2x2
|
|
+#endif
|
|
+
|
|
+ addl A,2*SIZE,A # 2m*1k
|
|
+ addl B,2*SIZE,B # 2n*1K
|
|
+
|
|
+ MAD a0,b0,t00,t00
|
|
+ MAD a0,b1,t04,t04
|
|
+ MAD a4,b0,t01,t01
|
|
+ MAD a4,b1,t05,t05
|
|
+
|
|
+
|
|
+$Write_2x2:
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c00,0(CO)
|
|
+ LD c04,0(C1)
|
|
+ LD c01,1*SIZE(CO)
|
|
+ LD c05,1*SIZE(C1)
|
|
+
|
|
+ MAD t00,ALPHA,c00,t00
|
|
+ MAD t04,ALPHA,c04,t04
|
|
+ MAD t01,ALPHA,c01,t01
|
|
+ MAD t05,ALPHA,c05,t05
|
|
+
|
|
+ ST t00,0(CO)
|
|
+ ST t04,0(C1)
|
|
+ ST t01,1*SIZE(CO)
|
|
+ ST t05,1*SIZE(C1)
|
|
+
|
|
+ addl CO,2*SIZE,CO # 2c
|
|
+ addl C1,2*SIZE,C1
|
|
+
|
|
+#else
|
|
+
|
|
+ MUL t00,ALPHA,t00
|
|
+ MUL t04,ALPHA,t04
|
|
+ MUL t01,ALPHA,t01
|
|
+ MUL t05,ALPHA,t05
|
|
+
|
|
+ ST t00,0(CO)
|
|
+ ST t04,0(C1)
|
|
+ ST t01,1*SIZE(CO)
|
|
+ ST t05,1*SIZE(C1)
|
|
+
|
|
+ addl CO,2*SIZE,CO # 2c
|
|
+ addl C1,2*SIZE,C1
|
|
+
|
|
+#if ( defined(LEFT) && defined(TRANSA)) || \
|
|
+ (!defined(LEFT) && !defined(TRANSA))
|
|
+ subl KC1, KK, TEMP
|
|
+ subl TEMP, 2, TEMP
|
|
+
|
|
+ sll TEMP, 1 + BASE_SHIFT, KC
|
|
+ nop
|
|
+
|
|
+ addl A,KC, A
|
|
+ addl B,KC, B
|
|
+#endif
|
|
+
|
|
+#ifdef LEFT
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+
|
|
+
|
|
+ .align 5
|
|
+.L28:
|
|
+ and MC1,1,MC # nr=2,mr=1-------------------
|
|
+ beq MC,$End_NC_Unroll2
|
|
+
|
|
+#if defined(TRMMKERNEL)
|
|
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
|
+ mov B1, B
|
|
+#else
|
|
+ sll KK, BASE_SHIFT,KC # mr=1
|
|
+ sll KK, 1 + BASE_SHIFT,TEMP # nr=2
|
|
+
|
|
+ addl A,KC,A
|
|
+ addl B1,TEMP,B
|
|
+#endif
|
|
+
|
|
+ fclr t00 # clear 2 registers
|
|
+ fclr t04
|
|
+
|
|
+ LD b0,0(B) # 2b
|
|
+ LD b1,1*SIZE(B)
|
|
+
|
|
+ LD a0,0(A) # 1a
|
|
+
|
|
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
|
+ subl KC1, KK, TEMP
|
|
+#elif defined(LEFT)
|
|
+ addl KK, 1, TEMP
|
|
+#else
|
|
+ addl KK, 2, TEMP
|
|
+#endif
|
|
+ sra TEMP,1,KC
|
|
+ nop
|
|
+ beq KC,$Rest_1x2x1
|
|
+
|
|
+#else
|
|
+ mov B1,B # Reset B
|
|
+ sra KC1,1,KC # Unroll KC 2, KC=KC1/2
|
|
+ fclr t00 # clear 2 registers
|
|
+ fclr t04
|
|
+
|
|
+ LD b0,0(B) # 2b
|
|
+ LD b1,1*SIZE(B)
|
|
+
|
|
+ LD a0,0(A) # 1a
|
|
+ beq KC,$Rest_1x2x1
|
|
+#endif
|
|
+
|
|
+
|
|
+ .align 5
|
|
+
|
|
+$Panel_1x2x2:
|
|
+ MAD a0,b0,t00,t00
|
|
+ MAD a0,b1,t04,t04
|
|
+
|
|
+ LD nb0,2*SIZE(B) # get next 2b
|
|
+ LD nb1,3*SIZE(B)
|
|
+
|
|
+ addl B,4*SIZE,B # 2(n)*2(k)
|
|
+ LD a8,1*SIZE(A) # get next 1a
|
|
+
|
|
+ subl KC,1,KC
|
|
+ MAD a8,nb0,t00,t00
|
|
+ MAD a8,nb1,t04,t04
|
|
+
|
|
+ addl A,2*SIZE,A # 1m*2k
|
|
+ LD b0,0(B) # get 3rd 2b
|
|
+ LD b1,1*SIZE(B)
|
|
+
|
|
+ LD a0,0(A) # get 3rd 1a
|
|
+ bne KC,$Panel_1x2x2
|
|
+
|
|
+
|
|
+$Rest_1x2x1:
|
|
+ LD ALPHA,192($sp) # Get ALPHA
|
|
+#ifndef TRMMKERNEL
|
|
+ blbc KC1,$Write_1x2
|
|
+#else
|
|
+ blbc TEMP,$Write_1x2
|
|
+#endif
|
|
+
|
|
+ addl A,1*SIZE,A # 1m*1k
|
|
+ addl B,2*SIZE,B # 2n*1K
|
|
+
|
|
+ MAD a0,b0,t00,t00
|
|
+ MAD a0,b1,t04,t04
|
|
+
|
|
+
|
|
+$Write_1x2: # Write back 2 results
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c00,0(CO)
|
|
+ LD c04,0(C1)
|
|
+
|
|
+ MAD t00,ALPHA,c00,t00
|
|
+ MAD t04,ALPHA,c04,t04
|
|
+
|
|
+ ST t00,0(CO)
|
|
+ ST t04,0(C1)
|
|
+
|
|
+#else
|
|
+
|
|
+ MUL t00,ALPHA,t00
|
|
+ MUL t04,ALPHA,t04
|
|
+
|
|
+ ST t00,0(CO)
|
|
+ ST t04,0(C1)
|
|
+
|
|
+#if ( defined(LEFT) && defined(TRANSA)) || \
|
|
+ (!defined(LEFT) && !defined(TRANSA))
|
|
+ subl KC1, KK, TEMP
|
|
+#ifdef LEFT
|
|
+ subl TEMP, 1,TEMP
|
|
+#else
|
|
+ subl TEMP, 2,TEMP
|
|
+#endif
|
|
+
|
|
+ sll TEMP, BASE_SHIFT,KC
|
|
+ sll TEMP, 1 + BASE_SHIFT,TEMP
|
|
+
|
|
+ addl A,KC,A
|
|
+ addl B,TEMP,B
|
|
+#endif
|
|
+
|
|
+#ifdef LEFT
|
|
+ addl KK,1,KK
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+
|
|
+ .align 5
|
|
+
|
|
+$End_NC_Unroll2:
|
|
+#if defined(TRMMKERNEL) && !defined(LEFT)
|
|
+ addl KK, 2,KK
|
|
+#endif
|
|
+ mov B, B1
|
|
+
|
|
+
|
|
+ .align 5
|
|
+$Begin_NC_Unroll1: # Nr=1
|
|
+ and NC1,1,NC # NC=NC1&1
|
|
+ beq NC,$Kernel_End
|
|
+
|
|
+ mov A1,A # Reset A
|
|
+ mov C,CO # Reset C
|
|
+
|
|
+#if defined(TRMMKERNEL) && defined(LEFT)
|
|
+ mov OFFSET,KK # reset offset
|
|
+#endif
|
|
+
|
|
+ sll KC1,4+BASE_SHIFT,SPANA # SPANA=KC1*MC
|
|
+ subl PREA,16*SIZE,PREA
|
|
+
|
|
+ sra MC1,4,MC # MC=MC1/16
|
|
+ beq MC,.L35 # MC=0:MC1<16
|
|
+
|
|
+
|
|
+.L3: # nr=1,mr=16
|
|
+#if defined(TRMMKERNEL)
|
|
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
|
+ mov B1,B
|
|
+#else
|
|
+ sll KK, 4 + BASE_SHIFT, KC # mr=16
|
|
+ sll KK, BASE_SHIFT,TEMP # nr=1
|
|
+
|
|
+ addl A,KC,A
|
|
+ addl B1,TEMP,B
|
|
+#endif
|
|
+
|
|
+	vcpys	$f31,$f31,t00		# clear 4 vector registers (16 results)
|
|
+ vcpys $f31,$f31,t01
|
|
+ vcpys $f31,$f31,t02
|
|
+ vcpys $f31,$f31,t03
|
|
+
|
|
+ LDDE b0,0(B) # get 1b and 16a
|
|
+
|
|
+ VLD a0,0(A)
|
|
+ VLD a4,4*SIZE(A)
|
|
+ VLD a8,8*SIZE(A)
|
|
+ VLD a12,12*SIZE(A)
|
|
+
|
|
+ fillcs 0(CO) # fetch C
|
|
+ fillcs 4*SIZE(CO)
|
|
+ fillcs 8*SIZE(CO)
|
|
+ fillcs 12*SIZE(CO)
|
|
+
|
|
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
|
+ subl KC1, KK, TEMP
|
|
+#elif defined(LEFT)
|
|
+ addl KK, 16, TEMP
|
|
+#else
|
|
+ addl KK, 1, TEMP
|
|
+#endif
|
|
+ sra TEMP, 1, KC
|
|
+ beq KC,$Rest_16x1x1
|
|
+
|
|
+#else
|
|
+
|
|
+ mov B1,B # Set B
|
|
+ sra KC1,1,KC # Unroll KC 2, KC=KC1/2
|
|
+	vcpys	$f31,$f31,t00		# clear 4 vector registers (16 results)
|
|
+ vcpys $f31,$f31,t01
|
|
+ vcpys $f31,$f31,t02
|
|
+ vcpys $f31,$f31,t03
|
|
+
|
|
+ LDDE b0,0(B) # get 1b and 16a
|
|
+
|
|
+ VLD a0,0(A)
|
|
+ VLD a4,4*SIZE(A)
|
|
+ VLD a8,8*SIZE(A)
|
|
+ VLD a12,12*SIZE(A)
|
|
+
|
|
+ fillcs 0(CO) # fetch C
|
|
+ fillcs 4*SIZE(CO)
|
|
+ fillcs 8*SIZE(CO)
|
|
+ fillcs 12*SIZE(CO)
|
|
+
|
|
+ beq KC,$Rest_16x1x1
|
|
+
|
|
+#endif
|
|
+
|
|
+$Panel_16x1x2:
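+# K loop, unrolled by 2: 4 A vectors (16 elements) times a single B value per k step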
|
|
+ addl A,16*SIZE,A # 16(m)*1(k)
|
|
+ LDDE b1,1*SIZE(B) # get next 1b
|
|
+
|
|
+ VMAD a0,b0,t00,t00
|
|
+ VMAD a4,b0,t01,t01
|
|
+
|
|
+ addl B,2*SIZE,B # 1(n)*2(k)
|
|
+ VLD na0,0(A) # get next 16a
|
|
+ VLD na4,4*SIZE(A)
|
|
+ VLD na8,8*SIZE(A)
|
|
+ VLD na12,12*SIZE(A)
|
|
+
|
|
+ VMAD a8,b0,t02,t02
|
|
+ VMAD a12,b0,t03,t03
|
|
+
|
|
+ subl KC,1,KC
|
|
+ addl A,16*SIZE,A # 16m*1k
|
|
+ LDDE b0,0(B)
|
|
+
|
|
+ VMAD na0,b1,t00,t00
|
|
+ VMAD na4,b1,t01,t01
|
|
+
|
|
+ VLD a0,0(A)
|
|
+ VLD a4,4*SIZE(A)
|
|
+ VLD a8,8*SIZE(A)
|
|
+ VLD a12,12*SIZE(A)
|
|
+
|
|
+ VMAD na8,b1,t02,t02
|
|
+ VMAD na12,b1,t03,t03
|
|
+ bne KC,$Panel_16x1x2
|
|
+
|
|
+
|
|
+$Rest_16x1x1:
|
|
+ LDDE ALPHA,192($sp)
|
|
+#ifndef TRMMKERNEL
|
|
+ blbc KC1,$Write_16x1 # If(KC1[0]==0) goto $Write_16x1
|
|
+#else
|
|
+ blbc TEMP,$Write_16x1 # If(KC1[0]==0) goto $Write_16x1
|
|
+#endif
|
|
+
|
|
+ addl A,16*SIZE,A # 16a*1k
|
|
+ addl B,1*SIZE,B # 1b*1k
|
|
+
|
|
+ VMAD a0,b0,t00,t00
|
|
+ VMAD a4,b0,t01,t01
|
|
+ VMAD a8,b0,t02,t02
|
|
+ VMAD a12,b0,t03,t03
|
|
+
|
|
+
|
|
+$Write_16x1:
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ and CO, (VEC_LEN*SIZE-1),$6
|
|
+ bne $6,$UnAlign_CO_Access_16x1
|
|
+
|
|
+$Align_CO_Access_16x1:
|
|
+	VLD	c00,0(CO)		# get 1st column of C
|
|
+ VLD c01,4*SIZE(CO)
|
|
+ VLD c02,8*SIZE(CO)
|
|
+ VLD c03,12*SIZE(CO)
|
|
+
|
|
+ VMAD t00,ALPHA,c00,t00
|
|
+ VMAD t01,ALPHA,c01,t01
|
|
+ VMAD t02,ALPHA,c02,t02
|
|
+ VMAD t03,ALPHA,c03,t03
|
|
+
|
|
+ VST t00,0(CO)
|
|
+ VST t01,4*SIZE(CO)
|
|
+ VST t02,8*SIZE(CO)
|
|
+ VST t03,12*SIZE(CO)
|
|
+ jmp $Kernel_End
|
|
+
|
|
+$UnAlign_CO_Access_16x1:
|
|
+ VLD_UL c00, 0*VEC_LEN*SIZE(CO)
|
|
+ VLD_UH c04, 1*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ VLD_UL c01, 1*VEC_LEN*SIZE(CO)
|
|
+ VLD_UH c05, 2*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ VLD_UL c02, 2*VEC_LEN*SIZE(CO)
|
|
+ VLD_UH c06, 3*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ VLD_UL c03, 3*VEC_LEN*SIZE(CO)
|
|
+ VLD_UH c07, 4*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ vbisw c00,c04,c00
|
|
+ vbisw c01,c05,c01
|
|
+ vbisw c02,c06,c02
|
|
+ vbisw c03,c07,c03
|
|
+
|
|
+ VMAD t00,ALPHA,c00,t00
|
|
+ VMAD t01,ALPHA,c01,t01
|
|
+ VMAD t02,ALPHA,c02,t02
|
|
+ VMAD t03,ALPHA,c03,t03
|
|
+
|
|
+ VST_UL t00, 0*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t00, 1*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ VST_UL t01, 1*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t01, 2*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ VST_UL t02, 2*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t02, 3*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ VST_UL t03, 3*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t03, 4*VEC_LEN*SIZE(CO)
|
|
+ jmp $Kernel_End
|
|
+
|
|
+#else
|
|
+
|
|
+ and CO, (VEC_LEN*SIZE-1),$6
|
|
+ bne $6,$UnAlign_CO_Access_16x1
|
|
+
|
|
+$Align_CO_Access_16x1:
|
|
+ VMUL t00,ALPHA,t00
|
|
+ VMUL t01,ALPHA,t01
|
|
+ VMUL t02,ALPHA,t02
|
|
+ VMUL t03,ALPHA,t03
|
|
+
|
|
+ VST t00,0(CO)
|
|
+ VST t01,4*SIZE(CO)
|
|
+ VST t02,8*SIZE(CO)
|
|
+ VST t03,12*SIZE(CO)
|
|
+ jmp $TRMMKERNEL_16x1
|
|
+
|
|
+$UnAlign_CO_Access_16x1:
|
|
+ VMUL t00,ALPHA,t00
|
|
+ VMUL t01,ALPHA,t01
|
|
+ VMUL t02,ALPHA,t02
|
|
+ VMUL t03,ALPHA,t03
|
|
+
|
|
+ VST_UL t00, 0*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t00, 1*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ VST_UL t01, 1*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t01, 2*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ VST_UL t02, 2*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t02, 3*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ VST_UL t03, 3*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t03, 4*VEC_LEN*SIZE(CO)
|
|
+
|
|
+$TRMMKERNEL_16x1:
|
|
+#if ( defined(LEFT) && defined(TRANSA)) || \
|
|
+ (!defined(LEFT) && !defined(TRANSA))
|
|
+ subl KC1, KK, TEMP
|
|
+#ifdef LEFT
|
|
+ subl TEMP, 16, TEMP
|
|
+#else
|
|
+ subl TEMP, 1,TEMP
|
|
+#endif
|
|
+
|
|
+ sll TEMP, 4 + BASE_SHIFT,KC
|
|
+ sll TEMP, BASE_SHIFT, TEMP
|
|
+
|
|
+ addl A,KC,A
|
|
+ addl B,TEMP,B
|
|
+#endif
|
|
+
|
|
+#ifdef LEFT
|
|
+ addl KK, 16, KK
|
|
+ nop
|
|
+#endif
|
|
+
|
|
+ jmp $Kernel_End
|
|
+#endif
|
|
+
|
|
+
|
|
+
|
|
+ .align 5
|
|
+.L35: # nr=1,mr=8------------------
|
|
+ and MC1,8,MC
|
|
+ sll KC1,3+BASE_SHIFT,SPANA # spana=kc1*mc
|
|
+ nop
|
|
+ beq MC,.L36 # MC1<8
|
|
+
|
|
+ addl A1,SPANA,PREA
|
|
+ subl PREA,8*SIZE,PREA # PREA-=MC
|
|
+
|
|
+#if defined(TRMMKERNEL)
|
|
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
|
+ mov B1, B
|
|
+#else
|
|
+ sll KK, 3 + BASE_SHIFT,KC # mr=8
|
|
+ sll KK, BASE_SHIFT,TEMP # nr=1
|
|
+
|
|
+ addl A,KC, A
|
|
+ addl B1,TEMP,B
|
|
+#endif
|
|
+
|
|
+	vcpys	$f31,$f31,t00		# clear 2 vector registers (8 results)
|
|
+ vcpys $f31,$f31,t01
|
|
+
|
|
+ LDDE b0,0(B) # get 1b
|
|
+
|
|
+ VLD a0,0(A) # get 8a
|
|
+ VLD a4,4*SIZE(A)
|
|
+
|
|
+ fillcs 0(CO) # fetch C
|
|
+ fillcs 4*SIZE(CO)
|
|
+
|
|
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
|
+ subl KC1, KK,TEMP
|
|
+#elif defined(LEFT)
|
|
+ addl KK, 8,TEMP
|
|
+#else
|
|
+ addl KK, 1,TEMP
|
|
+#endif
|
|
+ sra TEMP,1,KC
|
|
+ nop
|
|
+ beq KC,$Rest_8x1x1
|
|
+
|
|
+#else
|
|
+
|
|
+ mov B1, B
|
|
+ sra KC1,1,KC
|
|
+	vcpys	$f31,$f31,t00		# clear 2 vector registers (8 results)
|
|
+ vcpys $f31,$f31,t01
|
|
+
|
|
+ LDDE b0,0(B) # get 1b
|
|
+
|
|
+ VLD a0,0(A) # get 8a
|
|
+ VLD a4,4*SIZE(A)
|
|
+
|
|
+ fillcs 0(CO) # fetch C
|
|
+ fillcs 4*SIZE(CO)
|
|
+ beq KC,$Rest_8x1x1
|
|
+
|
|
+#endif
|
|
+
|
|
+
|
|
+$Panel_8x1x2:
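+# K loop, unrolled by 2: 2 A vectors (8 elements) times a single B value per k step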
|
|
+ VMAD a0,b0,t00,t00
|
|
+ VMAD a4,b0,t01,t01
|
|
+
|
|
+ LDDE nb0,1*SIZE(B) # get next 1b
|
|
+
|
|
+ addl B,2*SIZE,B # 1(n)*2k
|
|
+ VLD na8,8*SIZE(A) # get next 8a
|
|
+ VLD na12,12*SIZE(A)
|
|
+
|
|
+ fillcs 0(PREA)
|
|
+ subl PREA,8*SIZE,PREA
|
|
+
|
|
+ subl KC,1,KC
|
|
+ VMAD na8,nb0,t00,t00
|
|
+ VMAD na12,nb0,t01,t01
|
|
+
|
|
+ addl A,16*SIZE,A # 8m*2k
|
|
+ LDDE b0,0(B) # get 3rd 1b
|
|
+
|
|
+ VLD a0,0(A) # get 3rd 8a
|
|
+ VLD a4,4*SIZE(A)
|
|
+
|
|
+ fillcs 0(PREA)
|
|
+ subl PREA,8*SIZE,PREA
|
|
+ bne KC,$Panel_8x1x2
|
|
+
|
|
+
|
|
+$Rest_8x1x1:
|
|
+ LDDE ALPHA,192($sp) # Get ALPHA
|
|
+#ifndef TRMMKERNEL
|
|
+ blbc KC1,$Write_8x1
|
|
+#else
|
|
+ blbc TEMP,$Write_8x1
|
|
+#endif
|
|
+
|
|
+ addl A,8*SIZE,A # 8m*1k
|
|
+ addl B,1*SIZE,B # 1n*1k
|
|
+
|
|
+ VMAD a0,b0,t00,t00
|
|
+ VMAD a4,b0,t01,t01
|
|
+
|
|
+
|
|
+$Write_8x1:
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ and CO, (VEC_LEN*SIZE-1),$6
|
|
+ bne $6,$UnAlign_CO_Access_8x1
|
|
+
|
|
+$Align_CO_Access_8x1:
|
|
+	VLD	c00,0(CO)		# get 1st column of C
|
|
+ VLD c01,4*SIZE(CO)
|
|
+
|
|
+ VMAD t00,ALPHA,c00,t00
|
|
+ VMAD t01,ALPHA,c01,t01
|
|
+
|
|
+ VST t00,0(CO)
|
|
+ VST t01,4*SIZE(CO)
|
|
+ addl CO,8*SIZE,CO # 8c
|
|
+ jmp .L36
|
|
+
|
|
+$UnAlign_CO_Access_8x1:
|
|
+ VLD_UL c00, 0*VEC_LEN*SIZE(CO)
|
|
+ VLD_UH c02, 1*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ VLD_UL c01, 1*VEC_LEN*SIZE(CO)
|
|
+ VLD_UH c03, 2*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ vbisw c00,c02,c00
|
|
+ vbisw c01,c03,c01
|
|
+
|
|
+ VMAD t00,ALPHA,c00,t00
|
|
+ VMAD t01,ALPHA,c01,t01
|
|
+
|
|
+ VST_UL t00, 0*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t00, 1*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ VST_UL t01, 1*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t01, 2*VEC_LEN*SIZE(CO)
|
|
+ addl CO,8*SIZE,CO # 8c
|
|
+
|
|
+#else
|
|
+
|
|
+ and CO, (VEC_LEN*SIZE-1),$6
|
|
+ bne $6,$UnAlign_CO_Access_8x1
|
|
+
|
|
+$Align_CO_Access_8x1:
|
|
+ VMUL t00,ALPHA,t00
|
|
+ VMUL t01,ALPHA,t01
|
|
+
|
|
+ VST t00,0(CO)
|
|
+ VST t01,4*SIZE(CO)
|
|
+ jmp $TRMMKERNEL_8x1
|
|
+
|
|
+$UnAlign_CO_Access_8x1:
|
|
+ VMUL t00,ALPHA,t00
|
|
+ VMUL t01,ALPHA,t01
|
|
+
|
|
+ VST_UL t00, 0*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t00, 1*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ VST_UL t01, 1*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t01, 2*VEC_LEN*SIZE(CO)
|
|
+
|
|
+$TRMMKERNEL_8x1:
|
|
+ addl CO,8*SIZE,CO # 8c
|
|
+#if ( defined(LEFT) && defined(TRANSA)) || \
|
|
+ (!defined(LEFT) && !defined(TRANSA))
|
|
+ subl KC1, KK, TEMP
|
|
+#ifdef LEFT
|
|
+ subl TEMP, 8, TEMP
|
|
+#else
|
|
+ subl TEMP, 1, TEMP
|
|
+#endif
|
|
+
|
|
+ sll TEMP, 3 + BASE_SHIFT, KC
|
|
+ sll TEMP, BASE_SHIFT,TEMP
|
|
+
|
|
+ addl A,KC, A
|
|
+ addl B,TEMP,B
|
|
+#endif
|
|
+
|
|
+#ifdef LEFT
|
|
+ addl KK,8, KK
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+
|
|
+
|
|
+ .align 5
|
|
+.L36: # nr=1,mr=4---------------
|
|
+ and MC1,4,MC # MC1&4
|
|
+ beq MC,.L37
|
|
+
|
|
+#if defined(TRMMKERNEL)
|
|
+#if (defined(LEFT) && defined(TRANSA))\
|
|
+ || (!defined(LEFT) && !defined(TRANSA))
|
|
+ mov B1, B
|
|
+#else
|
|
+ sll KK, 2 + BASE_SHIFT, KC # mr=4
|
|
+ sll KK, BASE_SHIFT, TEMP # nr=1
|
|
+
|
|
+ addl A,KC,A
|
|
+ addl B1,TEMP,B
|
|
+#endif
|
|
+
|
|
+	vcpys	$f31,$f31,t00		# clear 1 vector register (4 results)
|
|
+
|
|
+ LDDE b0,0(B)
|
|
+ VLD a0,0(A)
|
|
+
|
|
+ fillcs 0(CO) # fetch C
|
|
+
|
|
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
|
+ subl KC1, KK, TEMP
|
|
+#elif defined(LEFT)
|
|
+ addl KK, 4, TEMP # mr=4
|
|
+#else
|
|
+ addl KK, 1, TEMP # nr=1
|
|
+#endif
|
|
+ sra TEMP,1, KC
|
|
+ beq KC,$Rest_4x1x1
|
|
+
|
|
+#else
|
|
+
|
|
+ mov B1,B # Reset B
|
|
+ sra KC1,1,KC # Unroll KC 2, KC=KC1/2
|
|
+	vcpys	$f31,$f31,t00		# clear 1 vector register (4 results)
|
|
+
|
|
+ LDDE b0,0(B)
|
|
+ VLD a0,0(A)
|
|
+
|
|
+ fillcs 0(CO) # fetch C
|
|
+ beq KC,$Rest_4x1x1
|
|
+#endif
|
|
+
|
|
+
|
|
+$Panel_4x1x2:
|
|
+ VMAD a0,b0,t00,t00
|
|
+
|
|
+ LDDE nb0,1*SIZE(B)
|
|
+ VLD a4,4*SIZE(A)
|
|
+ addl B,2*SIZE,B # 1(n)*2(k)*8Byte
|
|
+
|
|
+ subl KC,1,KC
|
|
+ VMAD a4,nb0,t00,t00
|
|
+
|
|
+ addl A,8*SIZE,A # 4m*2k
|
|
+ LDDE b0,0(B)
|
|
+ VLD a0,0(A)
|
|
+
|
|
+ bne KC,$Panel_4x1x2
|
|
+
|
|
+
|
|
+$Rest_4x1x1:
|
|
+ LDDE ALPHA,192($sp) # Get ALPHA
|
|
+#ifndef TRMMKERNEL
|
|
+ blbc KC1,$Write_4x1
|
|
+#else
|
|
+ blbc TEMP,$Write_4x1
|
|
+#endif
|
|
+
|
|
+ addl A,4*SIZE,A # 4m*1k
|
|
+ addl B,1*SIZE,B # 1n*1K
|
|
+
|
|
+ VMAD a0,b0,t00,t00
|
|
+
|
|
+
|
|
+$Write_4x1: # Write back 4 results
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ and CO, (VEC_LEN*SIZE-1),$6
|
|
+ bne $6,$UnAlign_CO_Access_4x1
|
|
+
|
|
+$Align_CO_Access_4x1:
|
|
+	VLD	c00,0(CO)		# get 1st column of C
|
|
+ VMAD t00,ALPHA,c00,t00
|
|
+ VST t00,0(CO)
|
|
+ addl CO,4*SIZE,CO # 4c
|
|
+ jmp .L37
|
|
+
|
|
+$UnAlign_CO_Access_4x1:
|
|
+ VLD_UL c00, 0*VEC_LEN*SIZE(CO)
|
|
+ VLD_UH c01, 1*VEC_LEN*SIZE(CO)
|
|
+
|
|
+ vbisw c00,c01,c00
|
|
+
|
|
+ VMAD t00,ALPHA,c00,t00
|
|
+
|
|
+ VST_UL t00, 0*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t00, 1*VEC_LEN*SIZE(CO)
|
|
+ addl CO,4*SIZE,CO # 4c
|
|
+
|
|
+
|
|
+#else
|
|
+ and CO, (VEC_LEN*SIZE-1),$6
|
|
+ bne $6,$UnAlign_CO_Access_4x1
|
|
+
|
|
+$Align_CO_Access_4x1:
|
|
+ VMUL t00,ALPHA,t00 # careful: c00~c03 use the same register
|
|
+ VST t00,0(CO)
|
|
+ jmp $TRMMKERNEL_4x1
|
|
+
|
|
+$UnAlign_CO_Access_4x1:
|
|
+ VMUL t00,ALPHA,t00 # careful: c00~c03 use the same register
|
|
+
|
|
+ VST_UL t00, 0*VEC_LEN*SIZE(CO)
|
|
+ VST_UH t00, 1*VEC_LEN*SIZE(CO)
|
|
+
|
|
+$TRMMKERNEL_4x1:
|
|
+ addl CO,4*SIZE,CO # 4c
|
|
+#if ( defined(LEFT) && defined(TRANSA)) || \
|
|
+ (!defined(LEFT) && !defined(TRANSA))
|
|
+ subl KC1, KK, TEMP
|
|
+#ifdef LEFT
|
|
+ subl TEMP, 4, TEMP # mr=4
|
|
+#else
|
|
+ subl TEMP, 1, TEMP
|
|
+#endif
|
|
+
|
|
+ sll TEMP, 2 + BASE_SHIFT, KC
|
|
+ sll TEMP, BASE_SHIFT, TEMP
|
|
+
|
|
+ addl A, KC, A
|
|
+ addl B, TEMP,B
|
|
+#endif
|
|
+
|
|
+#ifdef LEFT
|
|
+ addl KK, 4, KK
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+ .align 5
|
|
+.L37: # nr=1,mr=2-------------------------
|
|
+ and MC1,2,MC
|
|
+ beq MC,.L38
|
|
+
|
|
+#if defined(TRMMKERNEL)
|
|
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
|
+ mov B1, B
|
|
+#else
|
|
+ sll KK, 1 + BASE_SHIFT,KC # mr=2
|
|
+ sll KK, BASE_SHIFT, TEMP # nr=1
|
|
+
|
|
+ addl A,KC, A
|
|
+ addl B1,TEMP,B
|
|
+#endif
|
|
+
|
|
+	fclr	t00			# clear 2 registers
|
|
+ fclr t01
|
|
+
|
|
+ LD b0,0(B)
|
|
+
|
|
+ LD a0,0(A)
|
|
+ LD a4,1*SIZE(A)
|
|
+
|
|
+ fillcs 0(CO) # fetch C
|
|
+
|
|
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
|
+ subl KC1, KK, TEMP
|
|
+#elif defined(LEFT)
|
|
+ addl KK, 2,TEMP
|
|
+#else
|
|
+ addl KK, 1,TEMP
|
|
+#endif
|
|
+ sra TEMP,1,KC
|
|
+ beq KC,.L373
|
|
+
|
|
+#else
|
|
+
|
|
+ mov B1,B # Reset B
|
|
+ sra KC1,1,KC # Unroll KC 2, KC=KC1/2
|
|
+	fclr	t00			# clear 2 registers
|
|
+ fclr t01
|
|
+
|
|
+ LD b0,0(B)
|
|
+
|
|
+ LD a0,0(A)
|
|
+ LD a4,1*SIZE(A)
|
|
+
|
|
+ fillcs 0(CO) # fetch C
|
|
+ beq KC,.L373
|
|
+
|
|
+#endif
|
|
+
|
|
+.L371:
|
|
+ MAD a0,b0,t00,t00
|
|
+ MAD a4,b0,t01,t01
|
|
+
|
|
+ LD nb0,1*SIZE(B)
|
|
+
|
|
+ addl B,2*SIZE,B # 1(n)*2(k)
|
|
+ LD a8,2*SIZE(A)
|
|
+ LD a12,3*SIZE(A)
|
|
+
|
|
+ subl KC,1,KC
|
|
+ MAD a8,nb0,t00,t00
|
|
+ MAD a12,nb0,t01,t01
|
|
+
|
|
+ addl A,4*SIZE,A # 2m*2k
|
|
+ LD b0,0(B)
|
|
+
|
|
+ LD a0,0(A)
|
|
+ LD a4,1*SIZE(A)
|
|
+ bne KC,.L371
|
|
+
|
|
+.L373:
|
|
+ LD ALPHA,192($sp) # Get ALPHA
|
|
+#ifndef TRMMKERNEL
|
|
+ blbc KC1,.L374
|
|
+#else
|
|
+ blbc TEMP,.L374
|
|
+#endif
|
|
+
|
|
+ addl A,2*SIZE,A # 2m*1k*8Byte
|
|
+ addl B,1*SIZE,B # 1n*1K*8Byte
|
|
+
|
|
+ MAD a0,b0,t00,t00
|
|
+ MAD a4,b0,t01,t01
|
|
+
|
|
+.L374: # Write back 2 results
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c00,0(CO)
|
|
+ LD c01,1*SIZE(CO)
|
|
+
|
|
+ MAD t00,ALPHA,c00,t00
|
|
+ MAD t01,ALPHA,c01,t01
|
|
+
|
|
+ ST t00,0(CO)
|
|
+ ST t01,1*SIZE(CO)
|
|
+ addl CO,2*SIZE,CO # 2c
|
|
+
|
|
+#else
|
|
+
|
|
+ MUL t00,ALPHA,t00
|
|
+ MUL t01,ALPHA,t01
|
|
+
|
|
+ ST t00,0(CO)
|
|
+ ST t01,1*SIZE(CO)
|
|
+
|
|
+ addl CO,2*SIZE,CO # 2c
|
|
+
|
|
+#if ( defined(LEFT) && defined(TRANSA)) || \
|
|
+ (!defined(LEFT) && !defined(TRANSA))
|
|
+ subl KC1, KK, TEMP
|
|
+#ifdef LEFT
|
|
+ subl TEMP, 2, TEMP
|
|
+#else
|
|
+ subl TEMP, 1, TEMP
|
|
+#endif
|
|
+
|
|
+ sll TEMP, 1 + BASE_SHIFT,KC
|
|
+ sll TEMP, BASE_SHIFT,TEMP
|
|
+
|
|
+ addl A,KC,A
|
|
+ addl B,TEMP,B
|
|
+#endif
|
|
+
|
|
+#ifdef LEFT
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+
|
|
+
|
|
+ .align 5
|
|
+.L38:
|
|
+ and MC1,1,MC
|
|
+ beq MC,$Kernel_End
|
|
+
|
|
+#if defined(TRMMKERNEL)
|
|
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
|
+ mov B1, B
|
|
+#else
|
|
+ sll KK, BASE_SHIFT,KC # mr=nr=1
|
|
+ nop
|
|
+
|
|
+ addl A,KC,A
|
|
+ addl B1,KC,B
|
|
+#endif
|
|
+
|
|
+ fclr t00 # CLEAR Results Register
|
|
+
|
|
+ LD b0,0(B)
|
|
+	LD a0,0(A)      # Get 1 A and 1 B
|
|
+
|
|
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
|
+ subl KC1, KK, TEMP
|
|
+#else
|
|
+ addl KK, 1, TEMP # mr=nr=1
|
|
+#endif
|
|
+ sra TEMP,1,KC
|
|
+ nop
|
|
+ beq KC,.L383
|
|
+
|
|
+#else
|
|
+
|
|
+ mov B1,B # Reset B
|
|
+ sra KC1,1,KC # Unroll KC 2, KC=KC1/2
|
|
+ fclr t00 # CLEAR Results Register
|
|
+
|
|
+ LD b0,0(B)
|
|
+	LD a0,0(A)      # Get 1 A and 1 B
|
|
+
|
|
+ beq KC,.L383
|
|
+#endif
|
|
+
|
|
+.L381:
|
|
+ MAD a0,b0,t00,t00
|
|
+ LD nb0,1*SIZE(B)
|
|
+
|
|
+ addl B,2*SIZE,B # 1n*2k
|
|
+ LD a8,1*SIZE(A)
|
|
+
|
|
+
|
|
+ subl KC,1,KC
|
|
+ MAD a8,nb0,t00,t00
|
|
+
|
|
+ addl A,2*SIZE,A # 1m*2k
|
|
+ LD b0,0(B)
|
|
+
|
|
+ LD a0,0(A)
|
|
+ bne KC,.L381
|
|
+
|
|
+
|
|
+.L383:
|
|
+ LD ALPHA,192($sp) # get alpha
|
|
+#ifndef TRMMKERNEL
|
|
+ blbc KC1,.L384
|
|
+#else
|
|
+ blbc TEMP,.L384
|
|
+#endif
|
|
+
|
|
+ addl A,1*SIZE,A # 1m*1k
|
|
+ addl B,1*SIZE,B # 1n*1K
|
|
+
|
|
+ MAD a0,b0,t00,t00
|
|
+
|
|
+
|
|
+.L384:          # Write back 1 result
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c00,0(CO)
|
|
+ MAD t00,ALPHA,c00,t00
|
|
+ ST t00,0(CO)
|
|
+
|
|
+#else
|
|
+ MUL t00,ALPHA,t00
|
|
+ ST t00,0(CO)
|
|
+#endif
|
|
+
|
|
+
|
|
+
|
|
+$Kernel_End:
|
|
+ ldl $9,328($sp) # Integer Saved Register
|
|
+ ldl $10,320($sp)
|
|
+ ldl $11,312($sp)
|
|
+ ldl $12,304($sp)
|
|
+ ldl $13,296($sp)
|
|
+	ldl $14,288($sp)
|
|
+	# Float Saved Register
|
|
+ LD $f2,280($sp)
|
|
+ LD $f3,272($sp)
|
|
+ LD $f4,264($sp)
|
|
+ LD $f5,256($sp)
|
|
+ LD $f6,248($sp)
|
|
+ LD $f7,240($sp)
|
|
+ LD $f8,232($sp)
|
|
+	LD $f9,224($sp)
|
|
+
|
|
+ ldi $sp,STACKSIZE($sp) #
|
|
+ ret $31,($26),1 #
|
|
+
|
|
+ EPILOGUE
|
|
+
|
|
+
|
|
diff --git a/kernel/sw_64/gemv_n.S b/kernel/sw_64/gemv_n.S
|
|
new file mode 100644
|
|
index 0000000..90284db
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/gemv_n.S
|
|
@@ -0,0 +1,1647 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define STACKSIZE 72
|
|
+#define PREFETCHSIZE 32
|
|
+
|
|
+#define M $16
|
|
+#define N $17
|
|
+#define A $20
|
|
+#define LDA $21
|
|
+
|
|
+#define X $18
|
|
+#define INCX $19
|
|
+#define Y $22
|
|
+#define INCY $23
|
|
+
|
|
+#define BUFFER $24
|
|
+
|
|
+#define I $25
|
|
+#define J $27
|
|
+
|
|
+#define Y1 $4
|
|
+
|
|
+#define A1 $5
|
|
+#define A2 $6
|
|
+#define A3 $7
|
|
+#define A4 $8
|
|
+
|
|
+#define alpha $f19
|
|
+
|
|
+#define alpha1 $f0
|
|
+#define alpha2 $f1
|
|
+#define alpha3 $f10
|
|
+#define alpha4 $f11
|
|
+
|
|
+#define y0 $f12
|
|
+#define y1 $f13
|
|
+#define y2 $f14
|
|
+#define y3 $f15
|
|
+
|
|
+#define y4 $f16
|
|
+#define y5 $f17
|
|
+#define y6 $f18
|
|
+#define y7 $f21
|
|
+
|
|
+#define a0 $f22
|
|
+#define a1 $f23
|
|
+#define a2 $f24
|
|
+#define a3 $f25
|
|
+#define a4 $f26
|
|
+#define a5 $f27
|
|
+#define a6 $f28
|
|
+#define a7 $f29
|
|
+
|
|
+#define a8 $f2
|
|
+#define a9 $f3
|
|
+#define a10 $f4
|
|
+#define a11 $f5
|
|
+#define a12 $f6
|
|
+#define a13 $f7
|
|
+#define a14 $f8
|
|
+#define a15 $f9
|
|
+
|
|
+#define tmp $f20
|
|
+ PROLOGUE
|
|
+
|
|
+ ldi $sp, -STACKSIZE($sp)
|
|
+ ldl X, 0 + STACKSIZE($sp)
|
|
+ ldl INCX, 8 + STACKSIZE($sp)
|
|
+ ldl Y, 16 + STACKSIZE($sp)
|
|
+ ldl INCY, 24 + STACKSIZE($sp)
|
|
+ ldl BUFFER, 32 + STACKSIZE($sp)
|
|
+
|
|
+ fstd $f2, 0($sp)
|
|
+ fstd $f3, 8($sp)
|
|
+ fstd $f4, 16($sp)
|
|
+ fstd $f5, 24($sp)
|
|
+ fstd $f6, 32($sp)
|
|
+ fstd $f7, 40($sp)
|
|
+ fstd $f8, 48($sp)
|
|
+ fstd $f9, 56($sp)
|
|
+ fstd tmp, 64($sp)
|
|
+ PROFCODE
|
|
+
|
|
+ cmple M, 0, $0
|
|
+ SXADDQ INCX, 0, INCX
|
|
+ cmple N, 0, $1
|
|
+ SXADDQ INCY, 0, INCY
|
|
+
|
|
+ or $0, $1, $0
|
|
+ bne $0, $L999
|
|
+
|
|
+ SXADDQ LDA, 0, LDA
|
|
+
|
|
+ cmpeq INCY, SIZE, $0
|
|
+ bne $0, $L10
|
|
+
|
|
+ mov BUFFER, Y1
|
|
+
|
|
+ mov Y, BUFFER
|
|
+ mov Y1, Y
|
|
+
|
|
+ sra M, 3, I
|
|
+ ble I, $L05
|
|
+ .align 4
|
|
+
|
|
+$L02:
|
|
+ ST $f31, 0 * SIZE(Y1)
|
|
+ ST $f31, 1 * SIZE(Y1)
|
|
+ ST $f31, 2 * SIZE(Y1)
|
|
+ ST $f31, 3 * SIZE(Y1)
|
|
+ ST $f31, 4 * SIZE(Y1)
|
|
+ ST $f31, 5 * SIZE(Y1)
|
|
+ ST $f31, 6 * SIZE(Y1)
|
|
+ ST $f31, 7 * SIZE(Y1)
|
|
+
|
|
+ ldi Y1, 8 * SIZE(Y1)
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L02
|
|
+ .align 4
|
|
+
|
|
+$L05:
|
|
+ and M, 7, I
|
|
+ ble I, $L10
|
|
+ .align 4
|
|
+
|
|
+$L06:
|
|
+ ST $f31, 0 * SIZE(Y1)
|
|
+ addl Y1, SIZE, Y1
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L06
|
|
+ .align 4
|
|
+
|
|
+$L10:
|
|
+ sra N, 2, J
|
|
+ ble J, $L20
|
|
+ .align 4
|
|
+
|
|
+$L11:
|
|
+ LD alpha1, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD alpha2, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD alpha3, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD alpha4, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ MUL alpha, alpha1, tmp
|
|
+ fmov tmp, alpha1
|
|
+ MUL alpha, alpha2, tmp
|
|
+ fmov tmp, alpha2
|
|
+ MUL alpha, alpha3, tmp
|
|
+ fmov tmp, alpha3
|
|
+ MUL alpha, alpha4, tmp
|
|
+ fmov tmp, alpha4
|
|
+
|
|
+ mov A, A1
|
|
+ addl A, LDA, A2
|
|
+ addl A2, LDA, A3
|
|
+ addl A3, LDA, A4
|
|
+ s4addl LDA, A, A
|
|
+
|
|
+ mov Y, Y1
|
|
+ ldw $31, 4 * SIZE(X)
|
|
+
|
|
+ sra M, 3, I
|
|
+ ble I, $L15
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+ LD a2, 2 * SIZE(A1)
|
|
+ LD a3, 3 * SIZE(A1)
|
|
+
|
|
+ LD a4, 0 * SIZE(A2)
|
|
+ LD a5, 1 * SIZE(A2)
|
|
+ LD a6, 2 * SIZE(A2)
|
|
+ LD a7, 3 * SIZE(A2)
|
|
+
|
|
+ LD y0, 0 * SIZE(Y1)
|
|
+ LD y1, 1 * SIZE(Y1)
|
|
+ LD y2, 2 * SIZE(Y1)
|
|
+ LD y3, 3 * SIZE(Y1)
|
|
+
|
|
+ LD a8, 0 * SIZE(A3)
|
|
+ LD a9, 1 * SIZE(A3)
|
|
+ LD a10, 2 * SIZE(A3)
|
|
+ LD a11, 3 * SIZE(A3)
|
|
+
|
|
+ LD y4, 4 * SIZE(Y1)
|
|
+ LD y5, 5 * SIZE(Y1)
|
|
+ LD y6, 6 * SIZE(Y1)
|
|
+ LD y7, 7 * SIZE(Y1)
|
|
+
|
|
+ MUL alpha1, a0, tmp
|
|
+ fmov tmp, a0
|
|
+ LD a12, 0 * SIZE(A4)
|
|
+ MUL alpha1, a1, tmp
|
|
+ fmov tmp, a1
|
|
+ LD a13, 1 * SIZE(A4)
|
|
+ MUL alpha1, a2, tmp
|
|
+ fmov tmp, a2
|
|
+ LD a14, 2 * SIZE(A4)
|
|
+ MUL alpha1, a3, tmp
|
|
+ fmov tmp, a3
|
|
+ LD a15, 3 * SIZE(A4)
|
|
+
|
|
+ ADD y0, a0, tmp
|
|
+ fmov tmp, y0
|
|
+ LD a0, 4 * SIZE(A1)
|
|
+ MUL alpha2, a4, tmp
|
|
+ fmov tmp, a4
|
|
+ unop
|
|
+
|
|
+ ADD y1, a1, tmp
|
|
+ fmov tmp, y1
|
|
+ LD a1, 5 * SIZE(A1)
|
|
+ MUL alpha2, a5, tmp
|
|
+ fmov tmp, a5
|
|
+ unop
|
|
+
|
|
+ ADD y2, a2, tmp
|
|
+ fmov tmp, y2
|
|
+ LD a2, 6 * SIZE(A1)
|
|
+ MUL alpha2, a6, tmp
|
|
+ fmov tmp, a6
|
|
+ unop
|
|
+
|
|
+ ADD y3, a3, tmp
|
|
+ fmov tmp, y3
|
|
+ LD a3, 7 * SIZE(A1)
|
|
+ MUL alpha2, a7, tmp
|
|
+ fmov tmp, a7
|
|
+ unop
|
|
+
|
|
+ ADD y0, a4, tmp
|
|
+ fmov tmp, y0
|
|
+ LD a4, 4 * SIZE(A2)
|
|
+ MUL alpha3, a8, tmp
|
|
+ fmov tmp, a8
|
|
+ unop
|
|
+
|
|
+ ADD y1, a5, tmp
|
|
+ fmov tmp, y1
|
|
+ LD a5, 5 * SIZE(A2)
|
|
+ MUL alpha3, a9, tmp
|
|
+ fmov tmp, a9
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ADD y2, a6, tmp
|
|
+ fmov tmp, y2
|
|
+ LD a6, 6 * SIZE(A2)
|
|
+ MUL alpha3, a10, tmp
|
|
+ fmov tmp, a10
|
|
+ unop
|
|
+
|
|
+ ADD y3, a7, tmp
|
|
+ fmov tmp, y3
|
|
+ LD a7, 7 * SIZE(A2)
|
|
+ MUL alpha3, a11, tmp
|
|
+ fmov tmp, a11
|
|
+ unop
|
|
+
|
|
+ ADD y0, a8, tmp
|
|
+ fmov tmp, y0
|
|
+ LD a8, 4 * SIZE(A3)
|
|
+ MUL alpha4, a12, tmp
|
|
+ fmov tmp, a12
|
|
+ ble I, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ ADD y1, a9, tmp
|
|
+ fmov tmp, y1
|
|
+ LD a9, 5 * SIZE(A3)
|
|
+ MUL alpha4, a13, tmp
|
|
+ fmov tmp, a13
|
|
+ ldw $31, (PREFETCHSIZE + 0) * SIZE(A1)
|
|
+
|
|
+ ADD y2, a10, tmp
|
|
+ fmov tmp, y2
|
|
+ LD a10, 6 * SIZE(A3)
|
|
+ MUL alpha4, a14, tmp
|
|
+ fmov tmp, a14
|
|
+ unop
|
|
+
|
|
+ ADD y3, a11, tmp
|
|
+ fmov tmp, y3
|
|
+ LD a11, 7 * SIZE(A3)
|
|
+ MUL alpha4, a15, tmp
|
|
+ fmov tmp, a15
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ADD y0, a12, tmp
|
|
+ fmov tmp, y0
|
|
+ LD a12, 4 * SIZE(A4)
|
|
+ MUL alpha1, a0, tmp
|
|
+ fmov tmp, a0
|
|
+ flds $f31, (PREFETCHSIZE + 0) * SIZE(Y1)
|
|
+
|
|
+ ADD y1, a13, tmp
|
|
+ fmov tmp, y1
|
|
+ LD a13, 5 * SIZE(A4)
|
|
+ MUL alpha1, a1, tmp
|
|
+ fmov tmp, a1
|
|
+ unop
|
|
+
|
|
+ ADD y2, a14, tmp
|
|
+ fmov tmp, y2
|
|
+ LD a14, 6 * SIZE(A4)
|
|
+ MUL alpha1, a2, tmp
|
|
+ fmov tmp, a2
|
|
+ unop
|
|
+
|
|
+ ADD y3, a15, tmp
|
|
+ fmov tmp, y3
|
|
+ LD a15, 7 * SIZE(A4)
|
|
+ MUL alpha1, a3, tmp
|
|
+ fmov tmp, a3
|
|
+ ldw $31, (PREFETCHSIZE + 0) * SIZE(A2)
|
|
+
|
|
+ ADD y4, a0, tmp
|
|
+ fmov tmp, y4
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ MUL alpha2, a4, tmp
|
|
+ fmov tmp, a4
|
|
+ LD a0, 8 * SIZE(A1)
|
|
+
|
|
+ ADD y5, a1, tmp
|
|
+ fmov tmp, y5
|
|
+ ST y1, 1 * SIZE(Y1)
|
|
+ MUL alpha2, a5, tmp
|
|
+ fmov tmp, a5
|
|
+ LD a1, 9 * SIZE(A1)
|
|
+
|
|
+ ADD y6, a2, tmp
|
|
+ fmov tmp, y6
|
|
+ ST y2, 2 * SIZE(Y1)
|
|
+ MUL alpha2, a6, tmp
|
|
+ fmov tmp, a6
|
|
+ LD a2, 10 * SIZE(A1)
|
|
+
|
|
+ ADD y7, a3, tmp
|
|
+ fmov tmp, y7
|
|
+ ST y3, 3 * SIZE(Y1)
|
|
+ MUL alpha2, a7, tmp
|
|
+ fmov tmp, a7
|
|
+ LD a3, 11 * SIZE(A1)
|
|
+
|
|
+ ADD y4, a4, tmp
|
|
+ fmov tmp, y4
|
|
+ LD a4, 8 * SIZE(A2)
|
|
+ MUL alpha3, a8, tmp
|
|
+ fmov tmp, a8
|
|
+ LD y0, 8 * SIZE(Y1)
|
|
+
|
|
+ ADD y5, a5, tmp
|
|
+ fmov tmp, y5
|
|
+ LD a5, 9 * SIZE(A2)
|
|
+ MUL alpha3, a9, tmp
|
|
+ fmov tmp, a9
|
|
+ LD y1, 9 * SIZE(Y1)
|
|
+
|
|
+ ADD y6, a6, tmp
|
|
+ fmov tmp, y6
|
|
+ LD a6, 10 * SIZE(A2)
|
|
+ MUL alpha3, a10, tmp
|
|
+ fmov tmp, a10
|
|
+ LD y2, 10 * SIZE(Y1)
|
|
+
|
|
+ ADD y7, a7, tmp
|
|
+ fmov tmp, y7
|
|
+ LD a7, 11 * SIZE(A2)
|
|
+ MUL alpha3, a11, tmp
|
|
+ fmov tmp, a11
|
|
+ LD y3, 11 * SIZE(Y1)
|
|
+
|
|
+ ADD y4, a8, tmp
|
|
+ fmov tmp, y4
|
|
+ LD a8, 8 * SIZE(A3)
|
|
+ MUL alpha4, a12, tmp
|
|
+ fmov tmp, a12
|
|
+ ldw $31, (PREFETCHSIZE + 0) * SIZE(A3)
|
|
+
|
|
+ ADD y5, a9, tmp
|
|
+ fmov tmp, y5
|
|
+ LD a9, 9 * SIZE(A3)
|
|
+ MUL alpha4, a13, tmp
|
|
+ fmov tmp, a13
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+
|
|
+ ADD y6, a10, tmp
|
|
+ fmov tmp, y6
|
|
+ LD a10, 10 * SIZE(A3)
|
|
+ MUL alpha4, a14, tmp
|
|
+ fmov tmp, a14
|
|
+ ldi A2, 8 * SIZE(A2)
|
|
+
|
|
+ ADD y7, a11, tmp
|
|
+ fmov tmp, y7
|
|
+ LD a11, 11 * SIZE(A3)
|
|
+ MUL alpha4, a15, tmp
|
|
+ fmov tmp, a15
|
|
+ ldi Y1, 8 * SIZE(Y1)
|
|
+
|
|
+ ADD y4, a12, tmp
|
|
+ fmov tmp, y4
|
|
+ LD a12, 8 * SIZE(A4)
|
|
+ MUL alpha1, a0, tmp
|
|
+ fmov tmp, a0
|
|
+ unop
|
|
+
|
|
+ ADD y5, a13, tmp
|
|
+ fmov tmp, y5
|
|
+ LD a13, 9 * SIZE(A4)
|
|
+ MUL alpha1, a1, tmp
|
|
+ fmov tmp, a1
|
|
+ ldi A3, 8 * SIZE(A3)
|
|
+
|
|
+ ADD y6, a14, tmp
|
|
+ fmov tmp, y6
|
|
+ LD a14, 10 * SIZE(A4)
|
|
+ MUL alpha1, a2, tmp
|
|
+ fmov tmp, a2
|
|
+ ldw $31, (PREFETCHSIZE + 0) * SIZE(A4)
|
|
+
|
|
+ ADD y7, a15, tmp
|
|
+ fmov tmp, y7
|
|
+ LD a15, 11 * SIZE(A4)
|
|
+ MUL alpha1, a3, tmp
|
|
+ fmov tmp, a3
|
|
+ ldi A4, 8 * SIZE(A4)
|
|
+
|
|
+ ADD y0, a0, tmp
|
|
+ fmov tmp, y0
|
|
+ LD a0, 4 * SIZE(A1)
|
|
+ MUL alpha2, a4, tmp
|
|
+ fmov tmp, a4
|
|
+ ST y4, -4 * SIZE(Y1)
|
|
+
|
|
+ ADD y1, a1, tmp
|
|
+ fmov tmp, y1
|
|
+ LD a1, 5 * SIZE(A1)
|
|
+ MUL alpha2, a5, tmp
|
|
+ fmov tmp, a5
|
|
+ ST y5, -3 * SIZE(Y1)
|
|
+
|
|
+ ADD y2, a2, tmp
|
|
+ fmov tmp, y2
|
|
+ LD a2, 6 * SIZE(A1)
|
|
+ MUL alpha2, a6, tmp
|
|
+ fmov tmp, a6
|
|
+ ST y6, -2 * SIZE(Y1)
|
|
+
|
|
+ ADD y3, a3, tmp
|
|
+ fmov tmp, y3
|
|
+ LD a3, 7 * SIZE(A1)
|
|
+ MUL alpha2, a7, tmp
|
|
+ fmov tmp, a7
|
|
+ ST y7, -1 * SIZE(Y1)
|
|
+
|
|
+ ADD y0, a4, tmp
|
|
+ fmov tmp, y0
|
|
+ LD a4, 4 * SIZE(A2)
|
|
+ MUL alpha3, a8, tmp
|
|
+ fmov tmp, a8
|
|
+ LD y4, 4 * SIZE(Y1)
|
|
+
|
|
+ ADD y1, a5, tmp
|
|
+ fmov tmp, y1
|
|
+ LD a5, 5 * SIZE(A2)
|
|
+ MUL alpha3, a9, tmp
|
|
+ fmov tmp, a9
|
|
+ LD y5, 5 * SIZE(Y1)
|
|
+
|
|
+ ADD y2, a6, tmp
|
|
+ fmov tmp, y2
|
|
+ LD a6, 6 * SIZE(A2)
|
|
+ MUL alpha3, a10, tmp
|
|
+ fmov tmp, a10
|
|
+ LD y6, 6 * SIZE(Y1)
|
|
+
|
|
+ ADD y3, a7, tmp
|
|
+ fmov tmp, y3
|
|
+ LD a7, 7 * SIZE(A2)
|
|
+ MUL alpha3, a11, tmp
|
|
+ fmov tmp, a11
|
|
+ LD y7, 7 * SIZE(Y1)
|
|
+
|
|
+ ADD y0, a8, tmp
|
|
+ fmov tmp, y0
|
|
+ LD a8, 4 * SIZE(A3)
|
|
+ MUL alpha4, a12, tmp
|
|
+ fmov tmp, a12
|
|
+ bgt I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ ADD y1, a9, tmp
|
|
+ fmov tmp, y1
|
|
+ LD a9, 5 * SIZE(A3)
|
|
+ MUL alpha4, a13, tmp
|
|
+ fmov tmp, a13
|
|
+ unop
|
|
+
|
|
+ ADD y2, a10, tmp
|
|
+ fmov tmp, y2
|
|
+ LD a10, 6 * SIZE(A3)
|
|
+ MUL alpha4, a14, tmp
|
|
+ fmov tmp, a14
|
|
+ unop
|
|
+
|
|
+ ADD y3, a11, tmp
|
|
+ fmov tmp, y3
|
|
+ LD a11, 7 * SIZE(A3)
|
|
+ MUL alpha4, a15, tmp
|
|
+ fmov tmp, a15
|
|
+ unop
|
|
+
|
|
+ ADD y0, a12, tmp
|
|
+ fmov tmp, y0
|
|
+ LD a12, 4 * SIZE(A4)
|
|
+ MUL alpha1, a0, tmp
|
|
+ fmov tmp, a0
|
|
+ unop
|
|
+
|
|
+ ADD y1, a13, tmp
|
|
+ fmov tmp, y1
|
|
+ LD a13, 5 * SIZE(A4)
|
|
+ MUL alpha1, a1, tmp
|
|
+ fmov tmp, a1
|
|
+ unop
|
|
+
|
|
+ ADD y2, a14, tmp
|
|
+ fmov tmp, y2
|
|
+ LD a14, 6 * SIZE(A4)
|
|
+ MUL alpha1, a2, tmp
|
|
+ fmov tmp, a2
|
|
+ unop
|
|
+
|
|
+ ADD y3, a15, tmp
|
|
+ fmov tmp, y3
|
|
+ LD a15, 7 * SIZE(A4)
|
|
+ MUL alpha1, a3, tmp
|
|
+ fmov tmp, a3
|
|
+ unop
|
|
+
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ ADD y4, a0, tmp
|
|
+ fmov tmp, y4
|
|
+ unop
|
|
+ MUL alpha2, a4, tmp
|
|
+ fmov tmp, a4
|
|
+
|
|
+ ST y1, 1 * SIZE(Y1)
|
|
+ ADD y5, a1, tmp
|
|
+ fmov tmp, y5
|
|
+ unop
|
|
+ MUL alpha2, a5, tmp
|
|
+ fmov tmp, a5
|
|
+
|
|
+ ST y2, 2 * SIZE(Y1)
|
|
+ ADD y6, a2, tmp
|
|
+ fmov tmp, y6
|
|
+ unop
|
|
+ MUL alpha2, a6, tmp
|
|
+ fmov tmp, a6
|
|
+
|
|
+ ST y3, 3 * SIZE(Y1)
|
|
+ ADD y7, a3, tmp
|
|
+ fmov tmp, y7
|
|
+ ldi Y1, 8 * SIZE(Y1)
|
|
+ MUL alpha2, a7, tmp
|
|
+ fmov tmp, a7
|
|
+
|
|
+ ADD y4, a4, tmp
|
|
+ fmov tmp, y4
|
|
+ MUL alpha3, a8, tmp
|
|
+ fmov tmp, a8
|
|
+ ADD y5, a5, tmp
|
|
+ fmov tmp, y5
|
|
+ MUL alpha3, a9, tmp
|
|
+ fmov tmp, a9
|
|
+ ADD y6, a6, tmp
|
|
+ fmov tmp, y6
|
|
+ MUL alpha3, a10, tmp
|
|
+ fmov tmp, a10
|
|
+ ADD y7, a7, tmp
|
|
+ fmov tmp, y7
|
|
+ MUL alpha3, a11, tmp
|
|
+ fmov tmp, a11
|
|
+
|
|
+ ADD y4, a8, tmp
|
|
+ fmov tmp, y4
|
|
+ MUL alpha4, a12, tmp
|
|
+ fmov tmp, a12
|
|
+ ADD y5, a9, tmp
|
|
+ fmov tmp, y5
|
|
+ MUL alpha4, a13, tmp
|
|
+ fmov tmp, a13
|
|
+ ADD y6, a10, tmp
|
|
+ fmov tmp, y6
|
|
+ MUL alpha4, a14, tmp
|
|
+ fmov tmp, a14
|
|
+ ADD y7, a11, tmp
|
|
+ fmov tmp, y7
|
|
+ MUL alpha4, a15, tmp
|
|
+ fmov tmp, a15
|
|
+
|
|
+ ADD y4, a12, tmp
|
|
+ fmov tmp, y4
|
|
+ ADD y5, a13, tmp
|
|
+ fmov tmp, y5
|
|
+ ADD y6, a14, tmp
|
|
+ fmov tmp, y6
|
|
+ ADD y7, a15, tmp
|
|
+ fmov tmp, y7
|
|
+
|
|
+ ST y4, -4 * SIZE(Y1)
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+ ST y5, -3 * SIZE(Y1)
|
|
+ ldi A2, 8 * SIZE(A2)
|
|
+ ST y6, -2 * SIZE(Y1)
|
|
+ ldi A3, 8 * SIZE(A3)
|
|
+ ST y7, -1 * SIZE(Y1)
|
|
+ ldi A4, 8 * SIZE(A4)
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and M, 4, I
|
|
+ ble I, $L16
|
|
+
|
|
+ LD y0, 0 * SIZE(Y1)
|
|
+ LD y1, 1 * SIZE(Y1)
|
|
+ LD y2, 2 * SIZE(Y1)
|
|
+ LD y3, 3 * SIZE(Y1)
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+ LD a2, 2 * SIZE(A1)
|
|
+ LD a3, 3 * SIZE(A1)
|
|
+
|
|
+ LD a4, 0 * SIZE(A2)
|
|
+ LD a5, 1 * SIZE(A2)
|
|
+ LD a6, 2 * SIZE(A2)
|
|
+ LD a7, 3 * SIZE(A2)
|
|
+
|
|
+ LD a8, 0 * SIZE(A3)
|
|
+ LD a9, 1 * SIZE(A3)
|
|
+ LD a10, 2 * SIZE(A3)
|
|
+ LD a11, 3 * SIZE(A3)
|
|
+
|
|
+ MUL alpha1, a0, tmp
|
|
+ fmov tmp, a0
|
|
+ LD a12, 0 * SIZE(A4)
|
|
+ MUL alpha1, a1, tmp
|
|
+ fmov tmp, a1
|
|
+ LD a13, 1 * SIZE(A4)
|
|
+ MUL alpha1, a2, tmp
|
|
+ fmov tmp, a2
|
|
+ LD a14, 2 * SIZE(A4)
|
|
+ MUL alpha1, a3, tmp
|
|
+ fmov tmp, a3
|
|
+ LD a15, 3 * SIZE(A4)
|
|
+
|
|
+ ADD y0, a0, tmp
|
|
+ fmov tmp, y0
|
|
+ MUL alpha2, a4, tmp
|
|
+ fmov tmp, a4
|
|
+ ADD y1, a1, tmp
|
|
+ fmov tmp, y1
|
|
+ MUL alpha2, a5, tmp
|
|
+ fmov tmp, a5
|
|
+ ADD y2, a2, tmp
|
|
+ fmov tmp, y2
|
|
+ MUL alpha2, a6, tmp
|
|
+ fmov tmp, a6
|
|
+ ADD y3, a3, tmp
|
|
+ fmov tmp, y3
|
|
+ MUL alpha2, a7, tmp
|
|
+ fmov tmp, a7
|
|
+
|
|
+ ADD y0, a4, tmp
|
|
+ fmov tmp, y0
|
|
+ MUL alpha3, a8, tmp
|
|
+ fmov tmp, a8
|
|
+ ADD y1, a5, tmp
|
|
+ fmov tmp, y1
|
|
+ MUL alpha3, a9, tmp
|
|
+ fmov tmp, a9
|
|
+ ADD y2, a6, tmp
|
|
+ fmov tmp, y2
|
|
+ MUL alpha3, a10, tmp
|
|
+ fmov tmp, a10
|
|
+ ADD y3, a7, tmp
|
|
+ fmov tmp, y3
|
|
+ MUL alpha3, a11, tmp
|
|
+ fmov tmp, a11
|
|
+
|
|
+ ADD y0, a8, tmp
|
|
+ fmov tmp, y0
|
|
+ MUL alpha4, a12, tmp
|
|
+ fmov tmp, a12
|
|
+ ADD y1, a9, tmp
|
|
+ fmov tmp, y1
|
|
+ MUL alpha4, a13, tmp
|
|
+ fmov tmp, a13
|
|
+ ADD y2, a10, tmp
|
|
+ fmov tmp, y2
|
|
+ MUL alpha4, a14, tmp
|
|
+ fmov tmp, a14
|
|
+ ADD y3, a11, tmp
|
|
+ fmov tmp, y3
|
|
+ MUL alpha4, a15, tmp
|
|
+ fmov tmp, a15
|
|
+
|
|
+ ADD y0, a12, tmp
|
|
+ fmov tmp, y0
|
|
+ ldi Y1, 4 * SIZE(Y1)
|
|
+ ADD y1, a13, tmp
|
|
+ fmov tmp, y1
|
|
+ unop
|
|
+
|
|
+ ADD y2, a14, tmp
|
|
+ fmov tmp, y2
|
|
+ unop
|
|
+ ADD y3, a15, tmp
|
|
+ fmov tmp, y3
|
|
+ unop
|
|
+
|
|
+ ST y0, -4 * SIZE(Y1)
|
|
+ ldi A1, 4 * SIZE(A1)
|
|
+ ST y1, -3 * SIZE(Y1)
|
|
+ ldi A2, 4 * SIZE(A2)
|
|
+ ST y2, -2 * SIZE(Y1)
|
|
+ ldi A3, 4 * SIZE(A3)
|
|
+ ST y3, -1 * SIZE(Y1)
|
|
+ ldi A4, 4 * SIZE(A4)
|
|
+ .align 4
|
|
+
|
|
+$L16:
|
|
+ and M, 2, I
|
|
+ ble I, $L17
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+ LD a2, 0 * SIZE(A2)
|
|
+ LD a3, 1 * SIZE(A2)
|
|
+
|
|
+ LD y0, 0 * SIZE(Y1)
|
|
+ LD y1, 1 * SIZE(Y1)
|
|
+
|
|
+ LD a4, 0 * SIZE(A3)
|
|
+ MUL alpha1, a0, tmp
|
|
+ fmov tmp, a0
|
|
+ LD a5, 1 * SIZE(A3)
|
|
+ MUL alpha1, a1, tmp
|
|
+ fmov tmp, a1
|
|
+ LD a6, 0 * SIZE(A4)
|
|
+ MUL alpha2, a2, tmp
|
|
+ fmov tmp, a2
|
|
+ LD a7, 1 * SIZE(A4)
|
|
+ MUL alpha2, a3, tmp
|
|
+ fmov tmp, a3
|
|
+
|
|
+ ADD y0, a0, tmp
|
|
+ fmov tmp, y0
|
|
+ MUL alpha3, a4, tmp
|
|
+ fmov tmp, a4
|
|
+ ADD y1, a1, tmp
|
|
+ fmov tmp, y1
|
|
+ MUL alpha3, a5, tmp
|
|
+ fmov tmp, a5
|
|
+ ADD y0, a2, tmp
|
|
+ fmov tmp, y0
|
|
+ MUL alpha4, a6, tmp
|
|
+ fmov tmp, a6
|
|
+ ADD y1, a3, tmp
|
|
+ fmov tmp, y1
|
|
+ MUL alpha4, a7, tmp
|
|
+ fmov tmp, a7
|
|
+
|
|
+ ADD y0, a4, tmp
|
|
+ fmov tmp, y0
|
|
+ ldi A1, 2 * SIZE(A1)
|
|
+ ADD y1, a5, tmp
|
|
+ fmov tmp, y1
|
|
+ ldi A2, 2 * SIZE(A2)
|
|
+ ADD y0, a6, tmp
|
|
+ fmov tmp, y0
|
|
+ ldi A3, 2 * SIZE(A3)
|
|
+ ADD y1, a7, tmp
|
|
+ fmov tmp, y1
|
|
+ ldi A4, 2 * SIZE(A4)
|
|
+
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ unop
|
|
+ ST y1, 1 * SIZE(Y1)
|
|
+ ldi Y1, 2 * SIZE(Y1)
|
|
+ .align 4
|
|
+
|
|
+$L17:
|
|
+ blbc M, $L18
|
|
+
|
|
+ LD y0, 0 * SIZE(Y1)
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 0 * SIZE(A2)
|
|
+ LD a2, 0 * SIZE(A3)
|
|
+ LD a3, 0 * SIZE(A4)
|
|
+
|
|
+ MUL alpha1, a0, tmp
|
|
+ fmov tmp, a0
|
|
+ MUL alpha2, a1, tmp
|
|
+ fmov tmp, a1
|
|
+ MUL alpha3, a2, tmp
|
|
+ fmov tmp, a2
|
|
+ MUL alpha4, a3, tmp
|
|
+ fmov tmp, a3
|
|
+
|
|
+ ADD y0, a0, tmp
|
|
+ fmov tmp, y0
|
|
+ ADD y0, a1, tmp
|
|
+ fmov tmp, y0
|
|
+ ADD y0, a2, tmp
|
|
+ fmov tmp, y0
|
|
+ ADD y0, a3, tmp
|
|
+ fmov tmp, y0
|
|
+
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ .align 4
|
|
+
|
|
+$L18:
|
|
+ ldi J, -1(J)
|
|
+ bgt J, $L11
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ and N, 2, J
|
|
+ ble J, $L30
|
|
+
|
|
+ LD alpha1, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD alpha2, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ mov A, A1
|
|
+ MUL alpha, alpha1, tmp
|
|
+ fmov tmp, alpha1
|
|
+ addl A, LDA, A2
|
|
+ MUL alpha, alpha2, tmp
|
|
+ fmov tmp, alpha2
|
|
+
|
|
+ addl A2, LDA, A
|
|
+ mov Y, Y1
|
|
+
|
|
+ sra M, 3, I
|
|
+ ble I, $L25
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+ LD a2, 2 * SIZE(A1)
|
|
+ LD a3, 3 * SIZE(A1)
|
|
+
|
|
+ LD a4, 0 * SIZE(A2)
|
|
+ LD a5, 1 * SIZE(A2)
|
|
+ LD a6, 2 * SIZE(A2)
|
|
+ LD a7, 3 * SIZE(A2)
|
|
+
|
|
+ LD y0, 0 * SIZE(Y1)
|
|
+ LD y1, 1 * SIZE(Y1)
|
|
+ LD y2, 2 * SIZE(Y1)
|
|
+ LD y3, 3 * SIZE(Y1)
|
|
+
|
|
+ MUL alpha1, a0, tmp
|
|
+ fmov tmp, a0
|
|
+ LD y4, 4 * SIZE(Y1)
|
|
+ MUL alpha1, a1, tmp
|
|
+ fmov tmp, a1
|
|
+ LD y5, 5 * SIZE(Y1)
|
|
+ MUL alpha1, a2, tmp
|
|
+ fmov tmp, a2
|
|
+ LD y6, 6 * SIZE(Y1)
|
|
+ MUL alpha1, a3, tmp
|
|
+ fmov tmp, a3
|
|
+ LD y7, 7 * SIZE(Y1)
|
|
+
|
|
+ ADD y0, a0, tmp
|
|
+ fmov tmp, y0
|
|
+ LD a0, 4 * SIZE(A1)
|
|
+ MUL alpha2, a4, tmp
|
|
+ fmov tmp, a4
|
|
+
|
|
+ ADD y1, a1, tmp
|
|
+ fmov tmp, y1
|
|
+ LD a1, 5 * SIZE(A1)
|
|
+ MUL alpha2, a5, tmp
|
|
+ fmov tmp, a5
|
|
+
|
|
+ ADD y2, a2, tmp
|
|
+ fmov tmp, y2
|
|
+ LD a2, 6 * SIZE(A1)
|
|
+ MUL alpha2, a6, tmp
|
|
+ fmov tmp, a6
|
|
+
|
|
+ ADD y3, a3, tmp
|
|
+ fmov tmp, y3
|
|
+ LD a3, 7 * SIZE(A1)
|
|
+ MUL alpha2, a7, tmp
|
|
+ fmov tmp, a7
|
|
+
|
|
+ ADD y0, a4, tmp
|
|
+ fmov tmp, y0
|
|
+ LD a4, 4 * SIZE(A2)
|
|
+ MUL alpha1, a0, tmp
|
|
+ fmov tmp, a0
|
|
+
|
|
+ ADD y1, a5, tmp
|
|
+ fmov tmp, y1
|
|
+ LD a5, 5 * SIZE(A2)
|
|
+ MUL alpha1, a1, tmp
|
|
+ fmov tmp, a1
|
|
+
|
|
+ ADD y2, a6, tmp
|
|
+ fmov tmp, y2
|
|
+ LD a6, 6 * SIZE(A2)
|
|
+ MUL alpha1, a2, tmp
|
|
+ fmov tmp, a2
|
|
+
|
|
+ ADD y3, a7, tmp
|
|
+ fmov tmp, y3
|
|
+ LD a7, 7 * SIZE(A2)
|
|
+ MUL alpha1, a3, tmp
|
|
+ fmov tmp, a3
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L23
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ ldw $31, (PREFETCHSIZE + 0) * SIZE(A1)
|
|
+ ldi I, -1(I)
|
|
+ ldw $31, (PREFETCHSIZE + 0) * SIZE(A2)
|
|
+ ldi A2, 8 * SIZE(A2)
|
|
+
|
|
+ ADD y4, a0, tmp
|
|
+ fmov tmp, y4
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ MUL alpha2, a4, tmp
|
|
+ fmov tmp, a4
|
|
+ LD a0, 8 * SIZE(A1)
|
|
+
|
|
+ ADD y5, a1, tmp
|
|
+ fmov tmp, y5
|
|
+ ST y1, 1 * SIZE(Y1)
|
|
+ MUL alpha2, a5, tmp
|
|
+ fmov tmp, a5
|
|
+ LD a1, 9 * SIZE(A1)
|
|
+
|
|
+ ADD y6, a2, tmp
|
|
+ fmov tmp, y6
|
|
+ ST y2, 2 * SIZE(Y1)
|
|
+ MUL alpha2, a6, tmp
|
|
+ fmov tmp, a6
|
|
+ LD a2, 10 * SIZE(A1)
|
|
+
|
|
+ ADD y7, a3, tmp
|
|
+ fmov tmp, y7
|
|
+ ST y3, 3 * SIZE(Y1)
|
|
+ MUL alpha2, a7, tmp
|
|
+ fmov tmp, a7
|
|
+ LD a3, 11 * SIZE(A1)
|
|
+
|
|
+ ADD y4, a4, tmp
|
|
+ fmov tmp, y4
|
|
+ LD a4, 0 * SIZE(A2)
|
|
+ MUL alpha1, a0, tmp
|
|
+ fmov tmp, a0
|
|
+ LD y0, 8 * SIZE(Y1)
|
|
+
|
|
+ ADD y5, a5, tmp
|
|
+ fmov tmp, y5
|
|
+ LD a5, 1 * SIZE(A2)
|
|
+ MUL alpha1, a1, tmp
|
|
+ fmov tmp, a1
|
|
+ LD y1, 9 * SIZE(Y1)
|
|
+
|
|
+ ADD y6, a6, tmp
|
|
+ fmov tmp, y6
|
|
+ LD a6, 2 * SIZE(A2)
|
|
+ MUL alpha1, a2, tmp
|
|
+ fmov tmp, a2
|
|
+ LD y2, 10 * SIZE(Y1)
|
|
+
|
|
+ ADD y7, a7, tmp
|
|
+ fmov tmp, y7
|
|
+ LD a7, 3 * SIZE(A2)
|
|
+ MUL alpha1, a3, tmp
|
|
+ fmov tmp, a3
|
|
+ LD y3, 11 * SIZE(Y1)
|
|
+
|
|
+ ADD y0, a0, tmp
|
|
+ fmov tmp, y0
|
|
+ ST y4, 4 * SIZE(Y1)
|
|
+ MUL alpha2, a4, tmp
|
|
+ fmov tmp, a4
|
|
+ LD a0, 12 * SIZE(A1)
|
|
+
|
|
+ ADD y1, a1, tmp
|
|
+ fmov tmp, y1
|
|
+ ST y5, 5 * SIZE(Y1)
|
|
+ MUL alpha2, a5, tmp
|
|
+ fmov tmp, a5
|
|
+ LD a1, 13 * SIZE(A1)
|
|
+
|
|
+ ADD y2, a2, tmp
|
|
+ fmov tmp, y2
|
|
+ ST y6, 6 * SIZE(Y1)
|
|
+ MUL alpha2, a6, tmp
|
|
+ fmov tmp, a6
|
|
+ LD a2, 14 * SIZE(A1)
|
|
+
|
|
+ ADD y3, a3, tmp
|
|
+ fmov tmp, y3
|
|
+ ST y7, 7 * SIZE(Y1)
|
|
+ MUL alpha2, a7, tmp
|
|
+ fmov tmp, a7
|
|
+ LD a3, 15 * SIZE(A1)
|
|
+
|
|
+ ADD y0, a4, tmp
|
|
+ fmov tmp, y0
|
|
+ LD a4, 4 * SIZE(A2)
|
|
+ MUL alpha1, a0, tmp
|
|
+ fmov tmp, a0
|
|
+ LD y4, 12 * SIZE(Y1)
|
|
+
|
|
+ ADD y1, a5, tmp
|
|
+ fmov tmp, y1
|
|
+ LD a5, 5 * SIZE(A2)
|
|
+ MUL alpha1, a1, tmp
|
|
+ fmov tmp, a1
|
|
+ LD y5, 13 * SIZE(Y1)
|
|
+
|
|
+ ADD y2, a6, tmp
|
|
+ fmov tmp, y2
|
|
+ LD a6, 6 * SIZE(A2)
|
|
+ MUL alpha1, a2, tmp
|
|
+ fmov tmp, a2
|
|
+ LD y6, 14 * SIZE(Y1)
|
|
+
|
|
+ ADD y3, a7, tmp
|
|
+ fmov tmp, y3
|
|
+ LD a7, 7 * SIZE(A2)
|
|
+ MUL alpha1, a3, tmp
|
|
+ fmov tmp, a3
|
|
+ LD y7, 15 * SIZE(Y1)
|
|
+
|
|
+ flds $f31, (PREFETCHSIZE + 0) * SIZE(Y1)
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+ ldi Y1, 8 * SIZE(Y1)
|
|
+ bgt I, $L22
|
|
+ .align 4
|
|
+
|
|
+$L23:
|
|
+ ADD y4, a0, tmp
|
|
+ fmov tmp, y4
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ MUL alpha2, a4, tmp
|
|
+ fmov tmp, a4
|
|
+ unop
|
|
+
|
|
+ ADD y5, a1, tmp
|
|
+ fmov tmp, y5
|
|
+ ST y1, 1 * SIZE(Y1)
|
|
+ MUL alpha2, a5, tmp
|
|
+ fmov tmp, a5
|
|
+ unop
|
|
+
|
|
+ ADD y6, a2, tmp
|
|
+ fmov tmp, y6
|
|
+ ST y2, 2 * SIZE(Y1)
|
|
+ MUL alpha2, a6, tmp
|
|
+ fmov tmp, a6
|
|
+ unop
|
|
+
|
|
+ ADD y7, a3, tmp
|
|
+ fmov tmp, y7
|
|
+ ST y3, 3 * SIZE(Y1)
|
|
+ MUL alpha2, a7, tmp
|
|
+ fmov tmp, a7
|
|
+ unop
|
|
+
|
|
+ ADD y4, a4, tmp
|
|
+ fmov tmp, y4
|
|
+ ADD y5, a5, tmp
|
|
+ fmov tmp, y5
|
|
+ ADD y6, a6, tmp
|
|
+ fmov tmp, y6
|
|
+ ADD y7, a7, tmp
|
|
+ fmov tmp, y7
|
|
+
|
|
+ ST y4, 4 * SIZE(Y1)
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+ ST y5, 5 * SIZE(Y1)
|
|
+ ldi A2, 8 * SIZE(A2)
|
|
+
|
|
+ ST y6, 6 * SIZE(Y1)
|
|
+ unop
|
|
+ ST y7, 7 * SIZE(Y1)
|
|
+ ldi Y1, 8 * SIZE(Y1)
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ and M, 4, I
|
|
+ ble I, $L26
|
|
+
|
|
+ LD y0, 0 * SIZE(Y1)
|
|
+ LD y1, 1 * SIZE(Y1)
|
|
+ LD y2, 2 * SIZE(Y1)
|
|
+ LD y3, 3 * SIZE(Y1)
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+ LD a2, 2 * SIZE(A1)
|
|
+ LD a3, 3 * SIZE(A1)
|
|
+
|
|
+ MUL alpha1, a0, tmp
|
|
+ fmov tmp, a0
|
|
+ LD a4, 0 * SIZE(A2)
|
|
+ MUL alpha1, a1, tmp
|
|
+ fmov tmp, a1
|
|
+ LD a5, 1 * SIZE(A2)
|
|
+ MUL alpha1, a2, tmp
|
|
+ fmov tmp, a2
|
|
+ LD a6, 2 * SIZE(A2)
|
|
+ MUL alpha1, a3, tmp
|
|
+ fmov tmp, a3
|
|
+ LD a7, 3 * SIZE(A2)
|
|
+
|
|
+ ADD y0, a0, tmp
|
|
+ fmov tmp, y0
|
|
+ MUL alpha2, a4, tmp
|
|
+ fmov tmp, a4
|
|
+ ADD y1, a1, tmp
|
|
+ fmov tmp, y1
|
|
+ MUL alpha2, a5, tmp
|
|
+ fmov tmp, a5
|
|
+ ADD y2, a2, tmp
|
|
+ fmov tmp, y2
|
|
+ MUL alpha2, a6, tmp
|
|
+ fmov tmp, a6
|
|
+ ADD y3, a3, tmp
|
|
+ fmov tmp, y3
|
|
+ MUL alpha2, a7, tmp
|
|
+ fmov tmp, a7
|
|
+
|
|
+ ADD y0, a4, tmp
|
|
+ fmov tmp, y0
|
|
+ ldi Y1, 4 * SIZE(Y1)
|
|
+ ADD y1, a5, tmp
|
|
+ fmov tmp, y1
|
|
+ unop
|
|
+ ADD y2, a6, tmp
|
|
+ fmov tmp, y2
|
|
+ unop
|
|
+ ADD y3, a7, tmp
|
|
+ fmov tmp, y3
|
|
+ unop
|
|
+
|
|
+ ST y0, -4 * SIZE(Y1)
|
|
+ ldi A1, 4 * SIZE(A1)
|
|
+ ST y1, -3 * SIZE(Y1)
|
|
+ ldi A2, 4 * SIZE(A2)
|
|
+ ST y2, -2 * SIZE(Y1)
|
|
+ ldi A3, 4 * SIZE(A3)
|
|
+ ST y3, -1 * SIZE(Y1)
|
|
+ ldi A4, 4 * SIZE(A4)
|
|
+ .align 4
|
|
+
|
|
+$L26:
|
|
+ and M, 2, I
|
|
+ ble I, $L27
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+ LD a2, 0 * SIZE(A2)
|
|
+ LD a3, 1 * SIZE(A2)
|
|
+
|
|
+ LD y0, 0 * SIZE(Y1)
|
|
+ LD y1, 1 * SIZE(Y1)
|
|
+
|
|
+ MUL alpha1, a0, tmp
|
|
+ fmov tmp, a0
|
|
+ MUL alpha1, a1, tmp
|
|
+ fmov tmp, a1
|
|
+ MUL alpha2, a2, tmp
|
|
+ fmov tmp, a2
|
|
+ MUL alpha2, a3, tmp
|
|
+ fmov tmp, a3
|
|
+
|
|
+ ADD y0, a0, tmp
|
|
+ fmov tmp, y0
|
|
+ ldi A1, 2 * SIZE(A1)
|
|
+ ADD y1, a1, tmp
|
|
+ fmov tmp, y1
|
|
+ ldi A2, 2 * SIZE(A2)
|
|
+ ADD y0, a2, tmp
|
|
+ fmov tmp, y0
|
|
+ unop
|
|
+ ADD y1, a3, tmp
|
|
+ fmov tmp, y1
|
|
+ unop
|
|
+
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ unop
|
|
+ ST y1, 1 * SIZE(Y1)
|
|
+ ldi Y1, 2 * SIZE(Y1)
|
|
+ .align 4
|
|
+
|
|
+$L27:
|
|
+ blbc M, $L30
|
|
+
|
|
+ LD y0, 0 * SIZE(Y1)
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 0 * SIZE(A2)
|
|
+
|
|
+ MUL alpha1, a0, tmp
|
|
+ fmov tmp, a0
|
|
+ MUL alpha2, a1, tmp
|
|
+ fmov tmp, a1
|
|
+
|
|
+ ADD y0, a0, tmp
|
|
+ fmov tmp, y0
|
|
+ ADD y0, a1, tmp
|
|
+ fmov tmp, y0
|
|
+
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ .align 4
|
|
+
|
|
+$L30:
|
|
+ blbc N, $L990
|
|
+
|
|
+ LD alpha1, 0 * SIZE(X)
|
|
+ mov A, A1
|
|
+ MUL alpha, alpha1, tmp
|
|
+ fmov tmp, alpha1
|
|
+ mov Y, Y1
|
|
+
|
|
+ sra M, 3, I
|
|
+ ble I, $L35
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+ LD a2, 2 * SIZE(A1)
|
|
+ LD a3, 3 * SIZE(A1)
|
|
+ LD a4, 4 * SIZE(A1)
|
|
+ LD a5, 5 * SIZE(A1)
|
|
+ LD a6, 6 * SIZE(A1)
|
|
+ LD a7, 7 * SIZE(A1)
|
|
+
|
|
+ LD y0, 0 * SIZE(Y1)
|
|
+ LD y1, 1 * SIZE(Y1)
|
|
+ LD y2, 2 * SIZE(Y1)
|
|
+ LD y3, 3 * SIZE(Y1)
|
|
+ LD y4, 4 * SIZE(Y1)
|
|
+ LD y5, 5 * SIZE(Y1)
|
|
+ LD y6, 6 * SIZE(Y1)
|
|
+ LD y7, 7 * SIZE(Y1)
|
|
+
|
|
+ MUL alpha1, a0, tmp
|
|
+ fmov tmp, a0
|
|
+ MUL alpha1, a1, tmp
|
|
+ fmov tmp, a1
|
|
+ MUL alpha1, a2, tmp
|
|
+ fmov tmp, a2
|
|
+ MUL alpha1, a3, tmp
|
|
+ fmov tmp, a3
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L33
|
|
+ .align 4
|
|
+
|
|
+$L32:
|
|
+ ADD y0, a0, tmp
|
|
+ fmov tmp, y0
|
|
+ LD y4, 4 * SIZE(Y1)
|
|
+ MUL alpha1, a4, tmp
|
|
+ fmov tmp, a4
|
|
+ LD a0, 8 * SIZE(A1)
|
|
+
|
|
+ ADD y1, a1, tmp
|
|
+ fmov tmp, y1
|
|
+ LD y5, 5 * SIZE(Y1)
|
|
+ MUL alpha1, a5, tmp
|
|
+ fmov tmp, a5
|
|
+ LD a1, 9 * SIZE(A1)
|
|
+
|
|
+ ADD y2, a2, tmp
|
|
+ fmov tmp, y2
|
|
+ LD y6, 6 * SIZE(Y1)
|
|
+ MUL alpha1, a6, tmp
|
|
+ fmov tmp, a6
|
|
+ LD a2, 10 * SIZE(A1)
|
|
+
|
|
+ ADD y3, a3, tmp
|
|
+ fmov tmp, y3
|
|
+ LD y7, 7 * SIZE(Y1)
|
|
+ MUL alpha1, a7, tmp
|
|
+ fmov tmp, a7
|
|
+ LD a3, 11 * SIZE(A1)
|
|
+
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ ST y1, 1 * SIZE(Y1)
|
|
+ ST y2, 2 * SIZE(Y1)
|
|
+ ST y3, 3 * SIZE(Y1)
|
|
+
|
|
+ ADD y4, a4, tmp
|
|
+ fmov tmp, y4
|
|
+ LD y0, 8 * SIZE(Y1)
|
|
+ MUL alpha1, a0, tmp
|
|
+ fmov tmp, a0
|
|
+ LD a4, 12 * SIZE(A1)
|
|
+
|
|
+ ADD y5, a5, tmp
|
|
+ fmov tmp, y5
|
|
+ LD y1, 9 * SIZE(Y1)
|
|
+ MUL alpha1, a1, tmp
|
|
+ fmov tmp, a1
|
|
+ LD a5, 13 * SIZE(A1)
|
|
+
|
|
+ ADD y6, a6, tmp
|
|
+ fmov tmp, y6
|
|
+ LD y2, 10 * SIZE(Y1)
|
|
+ MUL alpha1, a2, tmp
|
|
+ fmov tmp, a2
|
|
+ LD a6, 14 * SIZE(A1)
|
|
+
|
|
+ ADD y7, a7, tmp
|
|
+ fmov tmp, y7
|
|
+ LD y3, 11 * SIZE(Y1)
|
|
+ MUL alpha1, a3, tmp
|
|
+ fmov tmp, a3
|
|
+ LD a7, 15 * SIZE(A1)
|
|
+
|
|
+ ST y4, 4 * SIZE(Y1)
|
|
+ ldi I, -1(I)
|
|
+ ST y5, 5 * SIZE(Y1)
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+
|
|
+ ST y6, 6 * SIZE(Y1)
|
|
+ ldw $31, (PREFETCHSIZE + 0) * SIZE(A1)
|
|
+ ST y7, 7 * SIZE(Y1)
|
|
+ flds $f31, (PREFETCHSIZE + 0) * SIZE(Y1)
|
|
+
|
|
+ ldi Y1, 8 * SIZE(Y1)
|
|
+ bgt I, $L32
|
|
+ .align 4
|
|
+
|
|
+$L33:
|
|
+ ADD y0, a0, tmp
|
|
+ fmov tmp, y0
|
|
+ LD y4, 4 * SIZE(Y1)
|
|
+ MUL alpha1, a4, tmp
|
|
+ fmov tmp, a4
|
|
+ unop
|
|
+
|
|
+ ADD y1, a1, tmp
|
|
+ fmov tmp, y1
|
|
+ LD y5, 5 * SIZE(Y1)
|
|
+ MUL alpha1, a5, tmp
|
|
+ fmov tmp, a5
|
|
+ unop
|
|
+
|
|
+ ADD y2, a2, tmp
|
|
+ fmov tmp, y2
|
|
+ LD y6, 6 * SIZE(Y1)
|
|
+ MUL alpha1, a6, tmp
|
|
+ fmov tmp, a6
|
|
+ unop
|
|
+
|
|
+ ADD y3, a3, tmp
|
|
+ fmov tmp, y3
|
|
+ LD y7, 7 * SIZE(Y1)
|
|
+ MUL alpha1, a7, tmp
|
|
+ fmov tmp, a7
|
|
+ unop
|
|
+
|
|
+ ADD y4, a4, tmp
|
|
+ fmov tmp, y4
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ ADD y5, a5, tmp
|
|
+ fmov tmp, y5
|
|
+ ST y1, 1 * SIZE(Y1)
|
|
+ ADD y6, a6, tmp
|
|
+ fmov tmp, y6
|
|
+ ST y2, 2 * SIZE(Y1)
|
|
+ ADD y7, a7, tmp
|
|
+ fmov tmp, y7
|
|
+ ST y3, 3 * SIZE(Y1)
|
|
+
|
|
+ ST y4, 4 * SIZE(Y1)
|
|
+ unop
|
|
+ ST y5, 5 * SIZE(Y1)
|
|
+ unop
|
|
+
|
|
+ ST y6, 6 * SIZE(Y1)
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+ ST y7, 7 * SIZE(Y1)
|
|
+ ldi Y1, 8 * SIZE(Y1)
|
|
+ .align 4
|
|
+
|
|
+$L35:
|
|
+ and M, 4, I
|
|
+ ble I, $L36
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+ LD a2, 2 * SIZE(A1)
|
|
+ LD a3, 3 * SIZE(A1)
|
|
+
|
|
+ MUL alpha1, a0, tmp
|
|
+ fmov tmp, a0
|
|
+ LD y0, 0 * SIZE(Y1)
|
|
+ MUL alpha1, a1, tmp
|
|
+ fmov tmp, a1
|
|
+ LD y1, 1 * SIZE(Y1)
|
|
+ MUL alpha1, a2, tmp
|
|
+ fmov tmp, a2
|
|
+ LD y2, 2 * SIZE(Y1)
|
|
+ MUL alpha1, a3, tmp
|
|
+ fmov tmp, a3
|
|
+ LD y3, 3 * SIZE(Y1)
|
|
+
|
|
+ ADD y0, a0, tmp
|
|
+ fmov tmp, y0
|
|
+ ADD y1, a1, tmp
|
|
+ fmov tmp, y1
|
|
+ ADD y2, a2, tmp
|
|
+ fmov tmp, y2
|
|
+ ADD y3, a3, tmp
|
|
+ fmov tmp, y3
|
|
+
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ ldi A1, 4 * SIZE(A1)
|
|
+ ST y1, 1 * SIZE(Y1)
|
|
+ ldi A2, 4 * SIZE(A2)
|
|
+ ST y2, 2 * SIZE(Y1)
|
|
+ unop
|
|
+ ST y3, 3 * SIZE(Y1)
|
|
+ ldi Y1, 4 * SIZE(Y1)
|
|
+ .align 4
|
|
+
|
|
+$L36:
|
|
+ and M, 2, I
|
|
+ ble I, $L37
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+
|
|
+ LD y0, 0 * SIZE(Y1)
|
|
+ MUL alpha1, a0, tmp
|
|
+ fmov tmp, a0
|
|
+ LD y1, 1 * SIZE(Y1)
|
|
+ MUL alpha1, a1, tmp
|
|
+ fmov tmp, a1
|
|
+
|
|
+ ADD y0, a0, tmp
|
|
+ fmov tmp, y0
|
|
+ ADD y1, a1, tmp
|
|
+ fmov tmp, y1
|
|
+
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ ldi A1, 2 * SIZE(A1)
|
|
+ ST y1, 1 * SIZE(Y1)
|
|
+ ldi Y1, 2 * SIZE(Y1)
|
|
+ .align 4
|
|
+
|
|
+$L37:
|
|
+ blbc M, $L990
|
|
+
|
|
+ LD y0, 0 * SIZE(Y1)
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+
|
|
+ MUL alpha1, a0, tmp
|
|
+ fmov tmp, a0
|
|
+
|
|
+ ADD y0, a0, tmp
|
|
+ fmov tmp, y0
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ .align 4
|
|
+
|
|
+$L990:
|
|
+ cmpeq INCY, SIZE, $0
|
|
+ bne $0, $L999
|
|
+
|
|
+ mov BUFFER, Y1
|
|
+
|
|
+ sra M, 3, I
|
|
+ ble I, $L995
|
|
+ .align 4
|
|
+
|
|
+$L992:
|
|
+ LD a0, 0 * SIZE(BUFFER)
|
|
+ addl BUFFER, INCY, BUFFER
|
|
+ LD a1, 0 * SIZE(BUFFER)
|
|
+ addl BUFFER, INCY, BUFFER
|
|
+ LD a2, 0 * SIZE(BUFFER)
|
|
+ addl BUFFER, INCY, BUFFER
|
|
+ LD a3, 0 * SIZE(BUFFER)
|
|
+ addl BUFFER, INCY, BUFFER
|
|
+
|
|
+ LD y0, 0 * SIZE(Y)
|
|
+ LD y1, 1 * SIZE(Y)
|
|
+ LD y2, 2 * SIZE(Y)
|
|
+ LD y3, 3 * SIZE(Y)
|
|
+
|
|
+ LD a4, 0 * SIZE(BUFFER)
|
|
+ addl BUFFER, INCY, BUFFER
|
|
+ LD a5, 0 * SIZE(BUFFER)
|
|
+ addl BUFFER, INCY, BUFFER
|
|
+ LD a6, 0 * SIZE(BUFFER)
|
|
+ addl BUFFER, INCY, BUFFER
|
|
+ LD a7, 0 * SIZE(BUFFER)
|
|
+ addl BUFFER, INCY, BUFFER
|
|
+
|
|
+ LD y4, 4 * SIZE(Y)
|
|
+ LD y5, 5 * SIZE(Y)
|
|
+ LD y6, 6 * SIZE(Y)
|
|
+ LD y7, 7 * SIZE(Y)
|
|
+
|
|
+ ADD a0, y0, tmp
|
|
+ fmov tmp, a0
|
|
+ ADD a1, y1, tmp
|
|
+ fmov tmp, a1
|
|
+ ADD a2, y2, tmp
|
|
+ fmov tmp, a2
|
|
+ ADD a3, y3, tmp
|
|
+ fmov tmp, a3
|
|
+ ADD a4, y4, tmp
|
|
+ fmov tmp, a4
|
|
+ ADD a5, y5, tmp
|
|
+ fmov tmp, a5
|
|
+ ADD a6, y6, tmp
|
|
+ fmov tmp, a6
|
|
+ ADD a7, y7, tmp
|
|
+ fmov tmp, a7
|
|
+
|
|
+ ST a0, 0 * SIZE(Y1)
|
|
+ addl Y1, INCY, Y1
|
|
+ ST a1, 0 * SIZE(Y1)
|
|
+ addl Y1, INCY, Y1
|
|
+ ST a2, 0 * SIZE(Y1)
|
|
+ addl Y1, INCY, Y1
|
|
+ ST a3, 0 * SIZE(Y1)
|
|
+ addl Y1, INCY, Y1
|
|
+
|
|
+ ST a4, 0 * SIZE(Y1)
|
|
+ addl Y1, INCY, Y1
|
|
+ ST a5, 0 * SIZE(Y1)
|
|
+ addl Y1, INCY, Y1
|
|
+ ST a6, 0 * SIZE(Y1)
|
|
+ addl Y1, INCY, Y1
|
|
+ ST a7, 0 * SIZE(Y1)
|
|
+ addl Y1, INCY, Y1
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ldi Y, 8 * SIZE(Y)
|
|
+ bgt I, $L992
|
|
+ .align 4
|
|
+
|
|
+$L995:
|
|
+ and M, 7, I
|
|
+ ble I, $L999
|
|
+ .align 4
|
|
+
|
|
+$L996:
|
|
+ LD a0, 0 * SIZE(BUFFER)
|
|
+ addl BUFFER, INCY, BUFFER
|
|
+
|
|
+ LD y0, 0 * SIZE(Y)
|
|
+ ldi Y, 1 * SIZE(Y)
|
|
+
|
|
+ ADD a0, y0, tmp
|
|
+ fmov tmp, a0
|
|
+
|
|
+ ST a0, 0 * SIZE(Y1)
|
|
+ addl Y1, INCY, Y1
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L996
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 8($sp)
|
|
+ fldd $f4, 16($sp)
|
|
+ fldd $f5, 24($sp)
|
|
+ fldd $f6, 32($sp)
|
|
+ fldd $f7, 40($sp)
|
|
+ fldd $f8, 48($sp)
|
|
+ fldd $f9, 56($sp)
|
|
+ fldd $f20, 64($sp)
|
|
+
|
|
+ ldi $sp, STACKSIZE($sp)
|
|
+ ret
|
|
+ EPILOGUE
|
|
diff --git a/kernel/sw_64/gemv_n.S.bak b/kernel/sw_64/gemv_n.S.bak
|
|
new file mode 100644
|
|
index 0000000..f90abdf
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/gemv_n.S.bak
|
|
@@ -0,0 +1,1307 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define STACKSIZE 64
|
|
+#define PREFETCHSIZE 32
|
|
+
|
|
+#define M $16
|
|
+#define N $17
|
|
+#define A $20
|
|
+#define LDA $21
|
|
+
|
|
+#define X $18
|
|
+#define INCX $19
|
|
+#define Y $22
|
|
+#define INCY $23
|
|
+
|
|
+#define BUFFER $24
|
|
+
|
|
+#define I $25
|
|
+#define J $27
|
|
+
|
|
+#define Y1 $4
|
|
+
|
|
+#define A1 $5
|
|
+#define A2 $6
|
|
+#define A3 $7
|
|
+#define A4 $8
|
|
+
|
|
+#define alpha $f19
|
|
+
|
|
+#define alpha1 $f0
|
|
+#define alpha2 $f1
|
|
+#define alpha3 $f10
|
|
+#define alpha4 $f11
|
|
+
|
|
+#define y0 $f12
|
|
+#define y1 $f13
|
|
+#define y2 $f14
|
|
+#define y3 $f15
|
|
+
|
|
+#define y4 $f16
|
|
+#define y5 $f17
|
|
+#define y6 $f18
|
|
+#define y7 $f21
|
|
+
|
|
+#define a0 $f22
|
|
+#define a1 $f23
|
|
+#define a2 $f24
|
|
+#define a3 $f25
|
|
+#define a4 $f26
|
|
+#define a5 $f27
|
|
+#define a6 $f28
|
|
+#define a7 $f29
|
|
+
|
|
+#define a8 $f2
|
|
+#define a9 $f3
|
|
+#define a10 $f4
|
|
+#define a11 $f5
|
|
+#define a12 $f6
|
|
+#define a13 $f7
|
|
+#define a14 $f8
|
|
+#define a15 $f9
|
|
+
|
|
+ PROLOGUE
|
|
+
|
|
+ ldi $sp, -STACKSIZE($sp)
|
|
+ ldl X, 0 + STACKSIZE($sp)
|
|
+ ldl INCX, 8 + STACKSIZE($sp)
|
|
+ ldl Y, 16 + STACKSIZE($sp)
|
|
+ ldl INCY, 24 + STACKSIZE($sp)
|
|
+ ldl BUFFER, 32 + STACKSIZE($sp)
|
|
+
|
|
+ fstd $f2, 0($sp)
|
|
+ fstd $f3, 8($sp)
|
|
+ fstd $f4, 16($sp)
|
|
+ fstd $f5, 24($sp)
|
|
+ fstd $f6, 32($sp)
|
|
+ fstd $f7, 40($sp)
|
|
+ fstd $f8, 48($sp)
|
|
+ fstd $f9, 56($sp)
|
|
+
|
|
+ PROFCODE
|
|
+
|
|
+ cmple M, 0, $0
|
|
+ SXADDQ INCX, 0, INCX
|
|
+ cmple N, 0, $1
|
|
+ SXADDQ INCY, 0, INCY
|
|
+
|
|
+ or $0, $1, $0
|
|
+ bne $0, $L999
|
|
+
|
|
+ SXADDQ LDA, 0, LDA
|
|
+
|
|
+ cmpeq INCY, SIZE, $0
|
|
+ bne $0, $L10
|
|
+
|
|
+ mov BUFFER, Y1
|
|
+
|
|
+ mov Y, BUFFER
|
|
+ mov Y1, Y
|
|
+
|
|
+ sra M, 3, I
|
|
+ ble I, $L05
|
|
+ .align 4
|
|
+
|
|
+$L02:
|
|
+ ST $f31, 0 * SIZE(Y1)
|
|
+ ST $f31, 1 * SIZE(Y1)
|
|
+ ST $f31, 2 * SIZE(Y1)
|
|
+ ST $f31, 3 * SIZE(Y1)
|
|
+ ST $f31, 4 * SIZE(Y1)
|
|
+ ST $f31, 5 * SIZE(Y1)
|
|
+ ST $f31, 6 * SIZE(Y1)
|
|
+ ST $f31, 7 * SIZE(Y1)
|
|
+
|
|
+ ldi Y1, 8 * SIZE(Y1)
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L02
|
|
+ .align 4
|
|
+
|
|
+$L05:
|
|
+ and M, 7, I
|
|
+ ble I, $L10
|
|
+ .align 4
|
|
+
|
|
+$L06:
|
|
+ ST $f31, 0 * SIZE(Y1)
|
|
+ addl Y1, SIZE, Y1
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L06
|
|
+ .align 4
|
|
+
|
|
+$L10:
|
|
+ sra N, 2, J
|
|
+ ble J, $L20
|
|
+ .align 4
|
|
+
|
|
+$L11:
|
|
+ LD alpha1, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD alpha2, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD alpha3, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD alpha4, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ MUL alpha, alpha1, alpha1
|
|
+ MUL alpha, alpha2, alpha2
|
|
+ MUL alpha, alpha3, alpha3
|
|
+ MUL alpha, alpha4, alpha4
|
|
+
|
|
+ mov A, A1
|
|
+ addl A, LDA, A2
|
|
+ addl A2, LDA, A3
|
|
+ addl A3, LDA, A4
|
|
+ s4addl LDA, A, A
|
|
+
|
|
+ mov Y, Y1
|
|
+ fillcs 4 * SIZE(X)
|
|
+
|
|
+ sra M, 3, I
|
|
+ ble I, $L15
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+ LD a2, 2 * SIZE(A1)
|
|
+ LD a3, 3 * SIZE(A1)
|
|
+
|
|
+ LD a4, 0 * SIZE(A2)
|
|
+ LD a5, 1 * SIZE(A2)
|
|
+ LD a6, 2 * SIZE(A2)
|
|
+ LD a7, 3 * SIZE(A2)
|
|
+
|
|
+ LD y0, 0 * SIZE(Y1)
|
|
+ LD y1, 1 * SIZE(Y1)
|
|
+ LD y2, 2 * SIZE(Y1)
|
|
+ LD y3, 3 * SIZE(Y1)
|
|
+
|
|
+ LD a8, 0 * SIZE(A3)
|
|
+ LD a9, 1 * SIZE(A3)
|
|
+ LD a10, 2 * SIZE(A3)
|
|
+ LD a11, 3 * SIZE(A3)
|
|
+
|
|
+ LD y4, 4 * SIZE(Y1)
|
|
+ LD y5, 5 * SIZE(Y1)
|
|
+ LD y6, 6 * SIZE(Y1)
|
|
+ LD y7, 7 * SIZE(Y1)
|
|
+
|
|
+ MUL alpha1, a0, a0
|
|
+ LD a12, 0 * SIZE(A4)
|
|
+ MUL alpha1, a1, a1
|
|
+ LD a13, 1 * SIZE(A4)
|
|
+ MUL alpha1, a2, a2
|
|
+ LD a14, 2 * SIZE(A4)
|
|
+ MUL alpha1, a3, a3
|
|
+ LD a15, 3 * SIZE(A4)
|
|
+
|
|
+ ADD y0, a0, y0
|
|
+ LD a0, 4 * SIZE(A1)
|
|
+ MUL alpha2, a4, a4
|
|
+ unop
|
|
+
|
|
+ ADD y1, a1, y1
|
|
+ LD a1, 5 * SIZE(A1)
|
|
+ MUL alpha2, a5, a5
|
|
+ unop
|
|
+
|
|
+ ADD y2, a2, y2
|
|
+ LD a2, 6 * SIZE(A1)
|
|
+ MUL alpha2, a6, a6
|
|
+ unop
|
|
+
|
|
+ ADD y3, a3, y3
|
|
+ LD a3, 7 * SIZE(A1)
|
|
+ MUL alpha2, a7, a7
|
|
+ unop
|
|
+
|
|
+ ADD y0, a4, y0
|
|
+ LD a4, 4 * SIZE(A2)
|
|
+ MUL alpha3, a8, a8
|
|
+ unop
|
|
+
|
|
+ ADD y1, a5, y1
|
|
+ LD a5, 5 * SIZE(A2)
|
|
+ MUL alpha3, a9, a9
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ADD y2, a6, y2
|
|
+ LD a6, 6 * SIZE(A2)
|
|
+ MUL alpha3, a10, a10
|
|
+ unop
|
|
+
|
|
+ ADD y3, a7, y3
|
|
+ LD a7, 7 * SIZE(A2)
|
|
+ MUL alpha3, a11, a11
|
|
+ unop
|
|
+
|
|
+ ADD y0, a8, y0
|
|
+ LD a8, 4 * SIZE(A3)
|
|
+ MUL alpha4, a12, a12
|
|
+ ble I, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ ADD y1, a9, y1
|
|
+ LD a9, 5 * SIZE(A3)
|
|
+ MUL alpha4, a13, a13
|
|
+ fillcs (PREFETCHSIZE + 0) * SIZE(A1)
|
|
+
|
|
+ ADD y2, a10, y2
|
|
+ LD a10, 6 * SIZE(A3)
|
|
+ MUL alpha4, a14, a14
|
|
+ unop
|
|
+
|
|
+ ADD y3, a11, y3
|
|
+ LD a11, 7 * SIZE(A3)
|
|
+ MUL alpha4, a15, a15
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ADD y0, a12, y0
|
|
+ LD a12, 4 * SIZE(A4)
|
|
+ MUL alpha1, a0, a0
|
|
+ fillcs (PREFETCHSIZE + 0) * SIZE(Y1)
|
|
+
|
|
+ ADD y1, a13, y1
|
|
+ LD a13, 5 * SIZE(A4)
|
|
+ MUL alpha1, a1, a1
|
|
+ unop
|
|
+
|
|
+ ADD y2, a14, y2
|
|
+ LD a14, 6 * SIZE(A4)
|
|
+ MUL alpha1, a2, a2
|
|
+ unop
|
|
+
|
|
+ ADD y3, a15, y3
|
|
+ LD a15, 7 * SIZE(A4)
|
|
+ MUL alpha1, a3, a3
|
|
+ fillcs (PREFETCHSIZE + 0) * SIZE(A2)
|
|
+
|
|
+ ADD y4, a0, y4
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ MUL alpha2, a4, a4
|
|
+ LD a0, 8 * SIZE(A1)
|
|
+
|
|
+ ADD y5, a1, y5
|
|
+ ST y1, 1 * SIZE(Y1)
|
|
+ MUL alpha2, a5, a5
|
|
+ LD a1, 9 * SIZE(A1)
|
|
+
|
|
+ ADD y6, a2, y6
|
|
+ ST y2, 2 * SIZE(Y1)
|
|
+ MUL alpha2, a6, a6
|
|
+ LD a2, 10 * SIZE(A1)
|
|
+
|
|
+ ADD y7, a3, y7
|
|
+ ST y3, 3 * SIZE(Y1)
|
|
+ MUL alpha2, a7, a7
|
|
+ LD a3, 11 * SIZE(A1)
|
|
+
|
|
+ ADD y4, a4, y4
|
|
+ LD a4, 8 * SIZE(A2)
|
|
+ MUL alpha3, a8, a8
|
|
+ LD y0, 8 * SIZE(Y1)
|
|
+
|
|
+ ADD y5, a5, y5
|
|
+ LD a5, 9 * SIZE(A2)
|
|
+ MUL alpha3, a9, a9
|
|
+ LD y1, 9 * SIZE(Y1)
|
|
+
|
|
+ ADD y6, a6, y6
|
|
+ LD a6, 10 * SIZE(A2)
|
|
+ MUL alpha3, a10, a10
|
|
+ LD y2, 10 * SIZE(Y1)
|
|
+
|
|
+ ADD y7, a7, y7
|
|
+ LD a7, 11 * SIZE(A2)
|
|
+ MUL alpha3, a11, a11
|
|
+ LD y3, 11 * SIZE(Y1)
|
|
+
|
|
+ ADD y4, a8, y4
|
|
+ LD a8, 8 * SIZE(A3)
|
|
+ MUL alpha4, a12, a12
|
|
+ fillcs (PREFETCHSIZE + 0) * SIZE(A3)
|
|
+
|
|
+ ADD y5, a9, y5
|
|
+ LD a9, 9 * SIZE(A3)
|
|
+ MUL alpha4, a13, a13
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+
|
|
+ ADD y6, a10, y6
|
|
+ LD a10, 10 * SIZE(A3)
|
|
+ MUL alpha4, a14, a14
|
|
+ ldi A2, 8 * SIZE(A2)
|
|
+
|
|
+ ADD y7, a11, y7
|
|
+ LD a11, 11 * SIZE(A3)
|
|
+ MUL alpha4, a15, a15
|
|
+ ldi Y1, 8 * SIZE(Y1)
|
|
+
|
|
+ ADD y4, a12, y4
|
|
+ LD a12, 8 * SIZE(A4)
|
|
+ MUL alpha1, a0, a0
|
|
+ unop
|
|
+
|
|
+ ADD y5, a13, y5
|
|
+ LD a13, 9 * SIZE(A4)
|
|
+ MUL alpha1, a1, a1
|
|
+ ldi A3, 8 * SIZE(A3)
|
|
+
|
|
+ ADD y6, a14, y6
|
|
+ LD a14, 10 * SIZE(A4)
|
|
+ MUL alpha1, a2, a2
|
|
+ fillcs (PREFETCHSIZE + 0) * SIZE(A4)
|
|
+
|
|
+ ADD y7, a15, y7
|
|
+ LD a15, 11 * SIZE(A4)
|
|
+ MUL alpha1, a3, a3
|
|
+ ldi A4, 8 * SIZE(A4)
|
|
+
|
|
+ ADD y0, a0, y0
|
|
+ LD a0, 4 * SIZE(A1)
|
|
+ MUL alpha2, a4, a4
|
|
+ ST y4, -4 * SIZE(Y1)
|
|
+
|
|
+ ADD y1, a1, y1
|
|
+ LD a1, 5 * SIZE(A1)
|
|
+ MUL alpha2, a5, a5
|
|
+ ST y5, -3 * SIZE(Y1)
|
|
+
|
|
+ ADD y2, a2, y2
|
|
+ LD a2, 6 * SIZE(A1)
|
|
+ MUL alpha2, a6, a6
|
|
+ ST y6, -2 * SIZE(Y1)
|
|
+
|
|
+ ADD y3, a3, y3
|
|
+ LD a3, 7 * SIZE(A1)
|
|
+ MUL alpha2, a7, a7
|
|
+ ST y7, -1 * SIZE(Y1)
|
|
+
|
|
+ ADD y0, a4, y0
|
|
+ LD a4, 4 * SIZE(A2)
|
|
+ MUL alpha3, a8, a8
|
|
+ LD y4, 4 * SIZE(Y1)
|
|
+
|
|
+ ADD y1, a5, y1
|
|
+ LD a5, 5 * SIZE(A2)
|
|
+ MUL alpha3, a9, a9
|
|
+ LD y5, 5 * SIZE(Y1)
|
|
+
|
|
+ ADD y2, a6, y2
|
|
+ LD a6, 6 * SIZE(A2)
|
|
+ MUL alpha3, a10, a10
|
|
+ LD y6, 6 * SIZE(Y1)
|
|
+
|
|
+ ADD y3, a7, y3
|
|
+ LD a7, 7 * SIZE(A2)
|
|
+ MUL alpha3, a11, a11
|
|
+ LD y7, 7 * SIZE(Y1)
|
|
+
|
|
+ ADD y0, a8, y0
|
|
+ LD a8, 4 * SIZE(A3)
|
|
+ MUL alpha4, a12, a12
|
|
+ bgt I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ ADD y1, a9, y1
|
|
+ LD a9, 5 * SIZE(A3)
|
|
+ MUL alpha4, a13, a13
|
|
+ unop
|
|
+
|
|
+ ADD y2, a10, y2
|
|
+ LD a10, 6 * SIZE(A3)
|
|
+ MUL alpha4, a14, a14
|
|
+ unop
|
|
+
|
|
+ ADD y3, a11, y3
|
|
+ LD a11, 7 * SIZE(A3)
|
|
+ MUL alpha4, a15, a15
|
|
+ unop
|
|
+
|
|
+ ADD y0, a12, y0
|
|
+ LD a12, 4 * SIZE(A4)
|
|
+ MUL alpha1, a0, a0
|
|
+ unop
|
|
+
|
|
+ ADD y1, a13, y1
|
|
+ LD a13, 5 * SIZE(A4)
|
|
+ MUL alpha1, a1, a1
|
|
+ unop
|
|
+
|
|
+ ADD y2, a14, y2
|
|
+ LD a14, 6 * SIZE(A4)
|
|
+ MUL alpha1, a2, a2
|
|
+ unop
|
|
+
|
|
+ ADD y3, a15, y3
|
|
+ LD a15, 7 * SIZE(A4)
|
|
+ MUL alpha1, a3, a3
|
|
+ unop
|
|
+
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ ADD y4, a0, y4
|
|
+ unop
|
|
+ MUL alpha2, a4, a4
|
|
+
|
|
+ ST y1, 1 * SIZE(Y1)
|
|
+ ADD y5, a1, y5
|
|
+ unop
|
|
+ MUL alpha2, a5, a5
|
|
+
|
|
+ ST y2, 2 * SIZE(Y1)
|
|
+ ADD y6, a2, y6
|
|
+ unop
|
|
+ MUL alpha2, a6, a6
|
|
+
|
|
+ ST y3, 3 * SIZE(Y1)
|
|
+ ADD y7, a3, y7
|
|
+ ldi Y1, 8 * SIZE(Y1)
|
|
+ MUL alpha2, a7, a7
|
|
+
|
|
+ ADD y4, a4, y4
|
|
+ MUL alpha3, a8, a8
|
|
+ ADD y5, a5, y5
|
|
+ MUL alpha3, a9, a9
|
|
+ ADD y6, a6, y6
|
|
+ MUL alpha3, a10, a10
|
|
+ ADD y7, a7, y7
|
|
+ MUL alpha3, a11, a11
|
|
+
|
|
+ ADD y4, a8, y4
|
|
+ MUL alpha4, a12, a12
|
|
+ ADD y5, a9, y5
|
|
+ MUL alpha4, a13, a13
|
|
+ ADD y6, a10, y6
|
|
+ MUL alpha4, a14, a14
|
|
+ ADD y7, a11, y7
|
|
+ MUL alpha4, a15, a15
|
|
+
|
|
+ ADD y4, a12, y4
|
|
+ ADD y5, a13, y5
|
|
+ ADD y6, a14, y6
|
|
+ ADD y7, a15, y7
|
|
+
|
|
+ ST y4, -4 * SIZE(Y1)
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+ ST y5, -3 * SIZE(Y1)
|
|
+ ldi A2, 8 * SIZE(A2)
|
|
+ ST y6, -2 * SIZE(Y1)
|
|
+ ldi A3, 8 * SIZE(A3)
|
|
+ ST y7, -1 * SIZE(Y1)
|
|
+ ldi A4, 8 * SIZE(A4)
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and M, 4, I
|
|
+ ble I, $L16
|
|
+
|
|
+ LD y0, 0 * SIZE(Y1)
|
|
+ LD y1, 1 * SIZE(Y1)
|
|
+ LD y2, 2 * SIZE(Y1)
|
|
+ LD y3, 3 * SIZE(Y1)
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+ LD a2, 2 * SIZE(A1)
|
|
+ LD a3, 3 * SIZE(A1)
|
|
+
|
|
+ LD a4, 0 * SIZE(A2)
|
|
+ LD a5, 1 * SIZE(A2)
|
|
+ LD a6, 2 * SIZE(A2)
|
|
+ LD a7, 3 * SIZE(A2)
|
|
+
|
|
+ LD a8, 0 * SIZE(A3)
|
|
+ LD a9, 1 * SIZE(A3)
|
|
+ LD a10, 2 * SIZE(A3)
|
|
+ LD a11, 3 * SIZE(A3)
|
|
+
|
|
+ MUL alpha1, a0, a0
|
|
+ LD a12, 0 * SIZE(A4)
|
|
+ MUL alpha1, a1, a1
|
|
+ LD a13, 1 * SIZE(A4)
|
|
+ MUL alpha1, a2, a2
|
|
+ LD a14, 2 * SIZE(A4)
|
|
+ MUL alpha1, a3, a3
|
|
+ LD a15, 3 * SIZE(A4)
|
|
+
|
|
+ ADD y0, a0, y0
|
|
+ MUL alpha2, a4, a4
|
|
+ ADD y1, a1, y1
|
|
+ MUL alpha2, a5, a5
|
|
+ ADD y2, a2, y2
|
|
+ MUL alpha2, a6, a6
|
|
+ ADD y3, a3, y3
|
|
+ MUL alpha2, a7, a7
|
|
+
|
|
+ ADD y0, a4, y0
|
|
+ MUL alpha3, a8, a8
|
|
+ ADD y1, a5, y1
|
|
+ MUL alpha3, a9, a9
|
|
+ ADD y2, a6, y2
|
|
+ MUL alpha3, a10, a10
|
|
+ ADD y3, a7, y3
|
|
+ MUL alpha3, a11, a11
|
|
+
|
|
+ ADD y0, a8, y0
|
|
+ MUL alpha4, a12, a12
|
|
+ ADD y1, a9, y1
|
|
+ MUL alpha4, a13, a13
|
|
+ ADD y2, a10, y2
|
|
+ MUL alpha4, a14, a14
|
|
+ ADD y3, a11, y3
|
|
+ MUL alpha4, a15, a15
|
|
+
|
|
+ ADD y0, a12, y0
|
|
+ ldi Y1, 4 * SIZE(Y1)
|
|
+ ADD y1, a13, y1
|
|
+ unop
|
|
+
|
|
+ ADD y2, a14, y2
|
|
+ unop
|
|
+ ADD y3, a15, y3
|
|
+ unop
|
|
+
|
|
+ ST y0, -4 * SIZE(Y1)
|
|
+ ldi A1, 4 * SIZE(A1)
|
|
+ ST y1, -3 * SIZE(Y1)
|
|
+ ldi A2, 4 * SIZE(A2)
|
|
+ ST y2, -2 * SIZE(Y1)
|
|
+ ldi A3, 4 * SIZE(A3)
|
|
+ ST y3, -1 * SIZE(Y1)
|
|
+ ldi A4, 4 * SIZE(A4)
|
|
+ .align 4
|
|
+
|
|
+$L16:
|
|
+ and M, 2, I
|
|
+ ble I, $L17
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+ LD a2, 0 * SIZE(A2)
|
|
+ LD a3, 1 * SIZE(A2)
|
|
+
|
|
+ LD y0, 0 * SIZE(Y1)
|
|
+ LD y1, 1 * SIZE(Y1)
|
|
+
|
|
+ LD a4, 0 * SIZE(A3)
|
|
+ MUL alpha1, a0, a0
|
|
+ LD a5, 1 * SIZE(A3)
|
|
+ MUL alpha1, a1, a1
|
|
+ LD a6, 0 * SIZE(A4)
|
|
+ MUL alpha2, a2, a2
|
|
+ LD a7, 1 * SIZE(A4)
|
|
+ MUL alpha2, a3, a3
|
|
+
|
|
+ ADD y0, a0, y0
|
|
+ MUL alpha3, a4, a4
|
|
+ ADD y1, a1, y1
|
|
+ MUL alpha3, a5, a5
|
|
+ ADD y0, a2, y0
|
|
+ MUL alpha4, a6, a6
|
|
+ ADD y1, a3, y1
|
|
+ MUL alpha4, a7, a7
|
|
+
|
|
+ ADD y0, a4, y0
|
|
+ ldi A1, 2 * SIZE(A1)
|
|
+ ADD y1, a5, y1
|
|
+ ldi A2, 2 * SIZE(A2)
|
|
+ ADD y0, a6, y0
|
|
+ ldi A3, 2 * SIZE(A3)
|
|
+ ADD y1, a7, y1
|
|
+ ldi A4, 2 * SIZE(A4)
|
|
+
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ unop
|
|
+ ST y1, 1 * SIZE(Y1)
|
|
+ ldi Y1, 2 * SIZE(Y1)
|
|
+ .align 4
|
|
+
|
|
+$L17:
|
|
+ blbc M, $L18
|
|
+
|
|
+ LD y0, 0 * SIZE(Y1)
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 0 * SIZE(A2)
|
|
+ LD a2, 0 * SIZE(A3)
|
|
+ LD a3, 0 * SIZE(A4)
|
|
+
|
|
+ MUL alpha1, a0, a0
|
|
+ MUL alpha2, a1, a1
|
|
+ MUL alpha3, a2, a2
|
|
+ MUL alpha4, a3, a3
|
|
+
|
|
+ ADD y0, a0, y0
|
|
+ ADD y0, a1, y0
|
|
+ ADD y0, a2, y0
|
|
+ ADD y0, a3, y0
|
|
+
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ .align 4
|
|
+
|
|
+$L18:
|
|
+ ldi J, -1(J)
|
|
+ bgt J, $L11
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ and N, 2, J
|
|
+ ble J, $L30
|
|
+
|
|
+ LD alpha1, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD alpha2, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ mov A, A1
|
|
+ MUL alpha, alpha1, alpha1
|
|
+ addl A, LDA, A2
|
|
+ MUL alpha, alpha2, alpha2
|
|
+
|
|
+ addl A2, LDA, A
|
|
+ mov Y, Y1
|
|
+
|
|
+ sra M, 3, I
|
|
+ ble I, $L25
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+ LD a2, 2 * SIZE(A1)
|
|
+ LD a3, 3 * SIZE(A1)
|
|
+
|
|
+ LD a4, 0 * SIZE(A2)
|
|
+ LD a5, 1 * SIZE(A2)
|
|
+ LD a6, 2 * SIZE(A2)
|
|
+ LD a7, 3 * SIZE(A2)
|
|
+
|
|
+ LD y0, 0 * SIZE(Y1)
|
|
+ LD y1, 1 * SIZE(Y1)
|
|
+ LD y2, 2 * SIZE(Y1)
|
|
+ LD y3, 3 * SIZE(Y1)
|
|
+
|
|
+ MUL alpha1, a0, a0
|
|
+ LD y4, 4 * SIZE(Y1)
|
|
+ MUL alpha1, a1, a1
|
|
+ LD y5, 5 * SIZE(Y1)
|
|
+ MUL alpha1, a2, a2
|
|
+ LD y6, 6 * SIZE(Y1)
|
|
+ MUL alpha1, a3, a3
|
|
+ LD y7, 7 * SIZE(Y1)
|
|
+
|
|
+ ADD y0, a0, y0
|
|
+ LD a0, 4 * SIZE(A1)
|
|
+ MUL alpha2, a4, a4
|
|
+
|
|
+ ADD y1, a1, y1
|
|
+ LD a1, 5 * SIZE(A1)
|
|
+ MUL alpha2, a5, a5
|
|
+
|
|
+ ADD y2, a2, y2
|
|
+ LD a2, 6 * SIZE(A1)
|
|
+ MUL alpha2, a6, a6
|
|
+
|
|
+ ADD y3, a3, y3
|
|
+ LD a3, 7 * SIZE(A1)
|
|
+ MUL alpha2, a7, a7
|
|
+
|
|
+ ADD y0, a4, y0
|
|
+ LD a4, 4 * SIZE(A2)
|
|
+ MUL alpha1, a0, a0
|
|
+
|
|
+ ADD y1, a5, y1
|
|
+ LD a5, 5 * SIZE(A2)
|
|
+ MUL alpha1, a1, a1
|
|
+
|
|
+ ADD y2, a6, y2
|
|
+ LD a6, 6 * SIZE(A2)
|
|
+ MUL alpha1, a2, a2
|
|
+
|
|
+ ADD y3, a7, y3
|
|
+ LD a7, 7 * SIZE(A2)
|
|
+ MUL alpha1, a3, a3
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L23
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ fillcs (PREFETCHSIZE + 0) * SIZE(A1)
|
|
+ ldi I, -1(I)
|
|
+ fillcs (PREFETCHSIZE + 0) * SIZE(A2)
|
|
+ ldi A2, 8 * SIZE(A2)
|
|
+
|
|
+ ADD y4, a0, y4
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ MUL alpha2, a4, a4
|
|
+ LD a0, 8 * SIZE(A1)
|
|
+
|
|
+ ADD y5, a1, y5
|
|
+ ST y1, 1 * SIZE(Y1)
|
|
+ MUL alpha2, a5, a5
|
|
+ LD a1, 9 * SIZE(A1)
|
|
+
|
|
+ ADD y6, a2, y6
|
|
+ ST y2, 2 * SIZE(Y1)
|
|
+ MUL alpha2, a6, a6
|
|
+ LD a2, 10 * SIZE(A1)
|
|
+
|
|
+ ADD y7, a3, y7
|
|
+ ST y3, 3 * SIZE(Y1)
|
|
+ MUL alpha2, a7, a7
|
|
+ LD a3, 11 * SIZE(A1)
|
|
+
|
|
+ ADD y4, a4, y4
|
|
+ LD a4, 0 * SIZE(A2)
|
|
+ MUL alpha1, a0, a0
|
|
+ LD y0, 8 * SIZE(Y1)
|
|
+
|
|
+ ADD y5, a5, y5
|
|
+ LD a5, 1 * SIZE(A2)
|
|
+ MUL alpha1, a1, a1
|
|
+ LD y1, 9 * SIZE(Y1)
|
|
+
|
|
+ ADD y6, a6, y6
|
|
+ LD a6, 2 * SIZE(A2)
|
|
+ MUL alpha1, a2, a2
|
|
+ LD y2, 10 * SIZE(Y1)
|
|
+
|
|
+ ADD y7, a7, y7
|
|
+ LD a7, 3 * SIZE(A2)
|
|
+ MUL alpha1, a3, a3
|
|
+ LD y3, 11 * SIZE(Y1)
|
|
+
|
|
+ ADD y0, a0, y0
|
|
+ ST y4, 4 * SIZE(Y1)
|
|
+ MUL alpha2, a4, a4
|
|
+ LD a0, 12 * SIZE(A1)
|
|
+
|
|
+ ADD y1, a1, y1
|
|
+ ST y5, 5 * SIZE(Y1)
|
|
+ MUL alpha2, a5, a5
|
|
+ LD a1, 13 * SIZE(A1)
|
|
+
|
|
+ ADD y2, a2, y2
|
|
+ ST y6, 6 * SIZE(Y1)
|
|
+ MUL alpha2, a6, a6
|
|
+ LD a2, 14 * SIZE(A1)
|
|
+
|
|
+ ADD y3, a3, y3
|
|
+ ST y7, 7 * SIZE(Y1)
|
|
+ MUL alpha2, a7, a7
|
|
+ LD a3, 15 * SIZE(A1)
|
|
+
|
|
+ ADD y0, a4, y0
|
|
+ LD a4, 4 * SIZE(A2)
|
|
+ MUL alpha1, a0, a0
|
|
+ LD y4, 12 * SIZE(Y1)
|
|
+
|
|
+ ADD y1, a5, y1
|
|
+ LD a5, 5 * SIZE(A2)
|
|
+ MUL alpha1, a1, a1
|
|
+ LD y5, 13 * SIZE(Y1)
|
|
+
|
|
+ ADD y2, a6, y2
|
|
+ LD a6, 6 * SIZE(A2)
|
|
+ MUL alpha1, a2, a2
|
|
+ LD y6, 14 * SIZE(Y1)
|
|
+
|
|
+ ADD y3, a7, y3
|
|
+ LD a7, 7 * SIZE(A2)
|
|
+ MUL alpha1, a3, a3
|
|
+ LD y7, 15 * SIZE(Y1)
|
|
+
|
|
+ fillcs (PREFETCHSIZE + 0) * SIZE(Y1)
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+ ldi Y1, 8 * SIZE(Y1)
|
|
+ bgt I, $L22
|
|
+ .align 4
|
|
+
|
|
+$L23:
|
|
+ ADD y4, a0, y4
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ MUL alpha2, a4, a4
|
|
+ unop
|
|
+
|
|
+ ADD y5, a1, y5
|
|
+ ST y1, 1 * SIZE(Y1)
|
|
+ MUL alpha2, a5, a5
|
|
+ unop
|
|
+
|
|
+ ADD y6, a2, y6
|
|
+ ST y2, 2 * SIZE(Y1)
|
|
+ MUL alpha2, a6, a6
|
|
+ unop
|
|
+
|
|
+ ADD y7, a3, y7
|
|
+ ST y3, 3 * SIZE(Y1)
|
|
+ MUL alpha2, a7, a7
|
|
+ unop
|
|
+
|
|
+ ADD y4, a4, y4
|
|
+ ADD y5, a5, y5
|
|
+ ADD y6, a6, y6
|
|
+ ADD y7, a7, y7
|
|
+
|
|
+ ST y4, 4 * SIZE(Y1)
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+ ST y5, 5 * SIZE(Y1)
|
|
+ ldi A2, 8 * SIZE(A2)
|
|
+
|
|
+ ST y6, 6 * SIZE(Y1)
|
|
+ unop
|
|
+ ST y7, 7 * SIZE(Y1)
|
|
+ ldi Y1, 8 * SIZE(Y1)
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ and M, 4, I
|
|
+ ble I, $L26
|
|
+
|
|
+ LD y0, 0 * SIZE(Y1)
|
|
+ LD y1, 1 * SIZE(Y1)
|
|
+ LD y2, 2 * SIZE(Y1)
|
|
+ LD y3, 3 * SIZE(Y1)
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+ LD a2, 2 * SIZE(A1)
|
|
+ LD a3, 3 * SIZE(A1)
|
|
+
|
|
+ MUL alpha1, a0, a0
|
|
+ LD a4, 0 * SIZE(A2)
|
|
+ MUL alpha1, a1, a1
|
|
+ LD a5, 1 * SIZE(A2)
|
|
+ MUL alpha1, a2, a2
|
|
+ LD a6, 2 * SIZE(A2)
|
|
+ MUL alpha1, a3, a3
|
|
+ LD a7, 3 * SIZE(A2)
|
|
+
|
|
+ ADD y0, a0, y0
|
|
+ MUL alpha2, a4, a4
|
|
+ ADD y1, a1, y1
|
|
+ MUL alpha2, a5, a5
|
|
+ ADD y2, a2, y2
|
|
+ MUL alpha2, a6, a6
|
|
+ ADD y3, a3, y3
|
|
+ MUL alpha2, a7, a7
|
|
+
|
|
+ ADD y0, a4, y0
|
|
+ ldi Y1, 4 * SIZE(Y1)
|
|
+ ADD y1, a5, y1
|
|
+ unop
|
|
+ ADD y2, a6, y2
|
|
+ unop
|
|
+ ADD y3, a7, y3
|
|
+ unop
|
|
+
|
|
+ ST y0, -4 * SIZE(Y1)
|
|
+ ldi A1, 4 * SIZE(A1)
|
|
+ ST y1, -3 * SIZE(Y1)
|
|
+ ldi A2, 4 * SIZE(A2)
|
|
+ ST y2, -2 * SIZE(Y1)
|
|
+ ldi A3, 4 * SIZE(A3)
|
|
+ ST y3, -1 * SIZE(Y1)
|
|
+ ldi A4, 4 * SIZE(A4)
|
|
+ .align 4
|
|
+
|
|
+$L26:
|
|
+ and M, 2, I
|
|
+ ble I, $L27
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+ LD a2, 0 * SIZE(A2)
|
|
+ LD a3, 1 * SIZE(A2)
|
|
+
|
|
+ LD y0, 0 * SIZE(Y1)
|
|
+ LD y1, 1 * SIZE(Y1)
|
|
+
|
|
+ MUL alpha1, a0, a0
|
|
+ MUL alpha1, a1, a1
|
|
+ MUL alpha2, a2, a2
|
|
+ MUL alpha2, a3, a3
|
|
+
|
|
+ ADD y0, a0, y0
|
|
+ ldi A1, 2 * SIZE(A1)
|
|
+ ADD y1, a1, y1
|
|
+ ldi A2, 2 * SIZE(A2)
|
|
+ ADD y0, a2, y0
|
|
+ unop
|
|
+ ADD y1, a3, y1
|
|
+ unop
|
|
+
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ unop
|
|
+ ST y1, 1 * SIZE(Y1)
|
|
+ ldi Y1, 2 * SIZE(Y1)
|
|
+ .align 4
|
|
+
|
|
+$L27:
|
|
+ blbc M, $L30
|
|
+
|
|
+ LD y0, 0 * SIZE(Y1)
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 0 * SIZE(A2)
|
|
+
|
|
+ MUL alpha1, a0, a0
|
|
+ MUL alpha2, a1, a1
|
|
+
|
|
+ ADD y0, a0, y0
|
|
+ ADD y0, a1, y0
|
|
+
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ .align 4
|
|
+
|
|
+$L30:
|
|
+ blbc N, $L990
|
|
+
|
|
+ LD alpha1, 0 * SIZE(X)
|
|
+ mov A, A1
|
|
+ MUL alpha, alpha1, alpha1
|
|
+ mov Y, Y1
|
|
+
|
|
+ sra M, 3, I
|
|
+ ble I, $L35
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+ LD a2, 2 * SIZE(A1)
|
|
+ LD a3, 3 * SIZE(A1)
|
|
+ LD a4, 4 * SIZE(A1)
|
|
+ LD a5, 5 * SIZE(A1)
|
|
+ LD a6, 6 * SIZE(A1)
|
|
+ LD a7, 7 * SIZE(A1)
|
|
+
|
|
+ LD y0, 0 * SIZE(Y1)
|
|
+ LD y1, 1 * SIZE(Y1)
|
|
+ LD y2, 2 * SIZE(Y1)
|
|
+ LD y3, 3 * SIZE(Y1)
|
|
+ LD y4, 4 * SIZE(Y1)
|
|
+ LD y5, 5 * SIZE(Y1)
|
|
+ LD y6, 6 * SIZE(Y1)
|
|
+ LD y7, 7 * SIZE(Y1)
|
|
+
|
|
+ MUL alpha1, a0, a0
|
|
+ MUL alpha1, a1, a1
|
|
+ MUL alpha1, a2, a2
|
|
+ MUL alpha1, a3, a3
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L33
|
|
+ .align 4
|
|
+
|
|
+$L32:
|
|
+ ADD y0, a0, y0
|
|
+ LD y4, 4 * SIZE(Y1)
|
|
+ MUL alpha1, a4, a4
|
|
+ LD a0, 8 * SIZE(A1)
|
|
+
|
|
+ ADD y1, a1, y1
|
|
+ LD y5, 5 * SIZE(Y1)
|
|
+ MUL alpha1, a5, a5
|
|
+ LD a1, 9 * SIZE(A1)
|
|
+
|
|
+ ADD y2, a2, y2
|
|
+ LD y6, 6 * SIZE(Y1)
|
|
+ MUL alpha1, a6, a6
|
|
+ LD a2, 10 * SIZE(A1)
|
|
+
|
|
+ ADD y3, a3, y3
|
|
+ LD y7, 7 * SIZE(Y1)
|
|
+ MUL alpha1, a7, a7
|
|
+ LD a3, 11 * SIZE(A1)
|
|
+
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ ST y1, 1 * SIZE(Y1)
|
|
+ ST y2, 2 * SIZE(Y1)
|
|
+ ST y3, 3 * SIZE(Y1)
|
|
+
|
|
+ ADD y4, a4, y4
|
|
+ LD y0, 8 * SIZE(Y1)
|
|
+ MUL alpha1, a0, a0
|
|
+ LD a4, 12 * SIZE(A1)
|
|
+
|
|
+ ADD y5, a5, y5
|
|
+ LD y1, 9 * SIZE(Y1)
|
|
+ MUL alpha1, a1, a1
|
|
+ LD a5, 13 * SIZE(A1)
|
|
+
|
|
+ ADD y6, a6, y6
|
|
+ LD y2, 10 * SIZE(Y1)
|
|
+ MUL alpha1, a2, a2
|
|
+ LD a6, 14 * SIZE(A1)
|
|
+
|
|
+ ADD y7, a7, y7
|
|
+ LD y3, 11 * SIZE(Y1)
|
|
+ MUL alpha1, a3, a3
|
|
+ LD a7, 15 * SIZE(A1)
|
|
+
|
|
+ ST y4, 4 * SIZE(Y1)
|
|
+ ldi I, -1(I)
|
|
+ ST y5, 5 * SIZE(Y1)
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+
|
|
+ ST y6, 6 * SIZE(Y1)
|
|
+ fillcs (PREFETCHSIZE + 0) * SIZE(A1)
|
|
+ ST y7, 7 * SIZE(Y1)
|
|
+ fillcs (PREFETCHSIZE + 0) * SIZE(Y1)
|
|
+
|
|
+ ldi Y1, 8 * SIZE(Y1)
|
|
+ bgt I, $L32
|
|
+ .align 4
|
|
+
|
|
+$L33:
|
|
+ ADD y0, a0, y0
|
|
+ LD y4, 4 * SIZE(Y1)
|
|
+ MUL alpha1, a4, a4
|
|
+ unop
|
|
+
|
|
+ ADD y1, a1, y1
|
|
+ LD y5, 5 * SIZE(Y1)
|
|
+ MUL alpha1, a5, a5
|
|
+ unop
|
|
+
|
|
+ ADD y2, a2, y2
|
|
+ LD y6, 6 * SIZE(Y1)
|
|
+ MUL alpha1, a6, a6
|
|
+ unop
|
|
+
|
|
+ ADD y3, a3, y3
|
|
+ LD y7, 7 * SIZE(Y1)
|
|
+ MUL alpha1, a7, a7
|
|
+ unop
|
|
+
|
|
+ ADD y4, a4, y4
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ ADD y5, a5, y5
|
|
+ ST y1, 1 * SIZE(Y1)
|
|
+ ADD y6, a6, y6
|
|
+ ST y2, 2 * SIZE(Y1)
|
|
+ ADD y7, a7, y7
|
|
+ ST y3, 3 * SIZE(Y1)
|
|
+
|
|
+ ST y4, 4 * SIZE(Y1)
|
|
+ unop
|
|
+ ST y5, 5 * SIZE(Y1)
|
|
+ unop
|
|
+
|
|
+ ST y6, 6 * SIZE(Y1)
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+ ST y7, 7 * SIZE(Y1)
|
|
+ ldi Y1, 8 * SIZE(Y1)
|
|
+ .align 4
|
|
+
|
|
+$L35:
|
|
+ and M, 4, I
|
|
+ ble I, $L36
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+ LD a2, 2 * SIZE(A1)
|
|
+ LD a3, 3 * SIZE(A1)
|
|
+
|
|
+ MUL alpha1, a0, a0
|
|
+ LD y0, 0 * SIZE(Y1)
|
|
+ MUL alpha1, a1, a1
|
|
+ LD y1, 1 * SIZE(Y1)
|
|
+ MUL alpha1, a2, a2
|
|
+ LD y2, 2 * SIZE(Y1)
|
|
+ MUL alpha1, a3, a3
|
|
+ LD y3, 3 * SIZE(Y1)
|
|
+
|
|
+ ADD y0, a0, y0
|
|
+ ADD y1, a1, y1
|
|
+ ADD y2, a2, y2
|
|
+ ADD y3, a3, y3
|
|
+
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ ldi A1, 4 * SIZE(A1)
|
|
+ ST y1, 1 * SIZE(Y1)
|
|
+ ldi A2, 4 * SIZE(A2)
|
|
+ ST y2, 2 * SIZE(Y1)
|
|
+ unop
|
|
+ ST y3, 3 * SIZE(Y1)
|
|
+ ldi Y1, 4 * SIZE(Y1)
|
|
+ .align 4
|
|
+
|
|
+$L36:
|
|
+ and M, 2, I
|
|
+ ble I, $L37
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+
|
|
+ LD y0, 0 * SIZE(Y1)
|
|
+ MUL alpha1, a0, a0
|
|
+ LD y1, 1 * SIZE(Y1)
|
|
+ MUL alpha1, a1, a1
|
|
+
|
|
+ ADD y0, a0, y0
|
|
+ ADD y1, a1, y1
|
|
+
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ ldi A1, 2 * SIZE(A1)
|
|
+ ST y1, 1 * SIZE(Y1)
|
|
+ ldi Y1, 2 * SIZE(Y1)
|
|
+ .align 4
|
|
+
|
|
+$L37:
|
|
+ blbc M, $L990
|
|
+
|
|
+ LD y0, 0 * SIZE(Y1)
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+
|
|
+ MUL alpha1, a0, a0
|
|
+
|
|
+ ADD y0, a0, y0
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ .align 4
|
|
+
|
|
+$L990:
|
|
+ cmpeq INCY, SIZE, $0
|
|
+ bne $0, $L999
|
|
+
|
|
+ mov BUFFER, Y1
|
|
+
|
|
+ sra M, 3, I
|
|
+ ble I, $L995
|
|
+ .align 4
|
|
+
|
|
+$L992:
|
|
+ LD a0, 0 * SIZE(BUFFER)
|
|
+ addl BUFFER, INCY, BUFFER
|
|
+ LD a1, 0 * SIZE(BUFFER)
|
|
+ addl BUFFER, INCY, BUFFER
|
|
+ LD a2, 0 * SIZE(BUFFER)
|
|
+ addl BUFFER, INCY, BUFFER
|
|
+ LD a3, 0 * SIZE(BUFFER)
|
|
+ addl BUFFER, INCY, BUFFER
|
|
+
|
|
+ LD y0, 0 * SIZE(Y)
|
|
+ LD y1, 1 * SIZE(Y)
|
|
+ LD y2, 2 * SIZE(Y)
|
|
+ LD y3, 3 * SIZE(Y)
|
|
+
|
|
+ LD a4, 0 * SIZE(BUFFER)
|
|
+ addl BUFFER, INCY, BUFFER
|
|
+ LD a5, 0 * SIZE(BUFFER)
|
|
+ addl BUFFER, INCY, BUFFER
|
|
+ LD a6, 0 * SIZE(BUFFER)
|
|
+ addl BUFFER, INCY, BUFFER
|
|
+ LD a7, 0 * SIZE(BUFFER)
|
|
+ addl BUFFER, INCY, BUFFER
|
|
+
|
|
+ LD y4, 4 * SIZE(Y)
|
|
+ LD y5, 5 * SIZE(Y)
|
|
+ LD y6, 6 * SIZE(Y)
|
|
+ LD y7, 7 * SIZE(Y)
|
|
+
|
|
+ ADD a0, y0, a0
|
|
+ ADD a1, y1, a1
|
|
+ ADD a2, y2, a2
|
|
+ ADD a3, y3, a3
|
|
+ ADD a4, y4, a4
|
|
+ ADD a5, y5, a5
|
|
+ ADD a6, y6, a6
|
|
+ ADD a7, y7, a7
|
|
+
|
|
+ ST a0, 0 * SIZE(Y1)
|
|
+ addl Y1, INCY, Y1
|
|
+ ST a1, 0 * SIZE(Y1)
|
|
+ addl Y1, INCY, Y1
|
|
+ ST a2, 0 * SIZE(Y1)
|
|
+ addl Y1, INCY, Y1
|
|
+ ST a3, 0 * SIZE(Y1)
|
|
+ addl Y1, INCY, Y1
|
|
+
|
|
+ ST a4, 0 * SIZE(Y1)
|
|
+ addl Y1, INCY, Y1
|
|
+ ST a5, 0 * SIZE(Y1)
|
|
+ addl Y1, INCY, Y1
|
|
+ ST a6, 0 * SIZE(Y1)
|
|
+ addl Y1, INCY, Y1
|
|
+ ST a7, 0 * SIZE(Y1)
|
|
+ addl Y1, INCY, Y1
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ldi Y, 8 * SIZE(Y)
|
|
+ bgt I, $L992
|
|
+ .align 4
|
|
+
|
|
+$L995:
|
|
+ and M, 7, I
|
|
+ ble I, $L999
|
|
+ .align 4
|
|
+
|
|
+$L996:
|
|
+ LD a0, 0 * SIZE(BUFFER)
|
|
+ addl BUFFER, INCY, BUFFER
|
|
+
|
|
+ LD y0, 0 * SIZE(Y)
|
|
+ ldi Y, 1 * SIZE(Y)
|
|
+
|
|
+ ADD a0, y0, a0
|
|
+
|
|
+ ST a0, 0 * SIZE(Y1)
|
|
+ addl Y1, INCY, Y1
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L996
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 8($sp)
|
|
+ fldd $f4, 16($sp)
|
|
+ fldd $f5, 24($sp)
|
|
+ fldd $f6, 32($sp)
|
|
+ fldd $f7, 40($sp)
|
|
+ fldd $f8, 48($sp)
|
|
+ fldd $f9, 56($sp)
|
|
+
|
|
+ ldi $sp, STACKSIZE($sp)
|
|
+ ret
|
|
+ EPILOGUE
|
|
diff --git a/kernel/sw_64/gemv_t.S b/kernel/sw_64/gemv_t.S
|
|
new file mode 100644
|
|
index 0000000..4d8f130
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/gemv_t.S
|
|
@@ -0,0 +1,1222 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define STACKSIZE 72
|
|
+#define PREFETCHSIZE 32
|
|
+
|
|
+#define M $16
|
|
+#define N $17
|
|
+#define A $20
|
|
+#define LDA $21
|
|
+
|
|
+#define X $18
|
|
+#define INCX $19
|
|
+#define Y $22
|
|
+#define INCY $23
|
|
+
|
|
+#define BUFFER $24
|
|
+
|
|
+#define I $25
|
|
+#define J $27
|
|
+
|
|
+#define X1 $3
|
|
+#define Y1 $4
|
|
+
|
|
+#define A1 $5
|
|
+#define A2 $6
|
|
+#define A3 $7
|
|
+#define A4 $8
|
|
+
|
|
+#define alpha $f19
|
|
+#define f20 $f20
|
|
+
|
|
+#define s0 $f0
|
|
+#define s1 $f1
|
|
+#define s2 $f10
|
|
+#define s3 $f11
|
|
+
|
|
+#define t0 $f12
|
|
+#define t1 $f13
|
|
+#define t2 $f14
|
|
+#define t3 $f15
|
|
+
|
|
+#define x0 $f16
|
|
+#define x1 $f17
|
|
+#define x2 $f18
|
|
+#define x3 $f21
|
|
+
|
|
+#define a0 $f22
|
|
+#define a1 $f23
|
|
+#define a2 $f24
|
|
+#define a3 $f25
|
|
+#define a4 $f26
|
|
+#define a5 $f27
|
|
+#define a6 $f28
|
|
+#define a7 $f29
|
|
+
|
|
+#define a8 $f2
|
|
+#define a9 $f3
|
|
+#define a10 $f4
|
|
+#define a11 $f5
|
|
+#define a12 $f6
|
|
+#define a13 $f7
|
|
+#define a14 $f8
|
|
+#define a15 $f9
|
|
+
|
|
+ PROLOGUE
|
|
+
|
|
+ ldi $sp, -STACKSIZE($sp)
|
|
+ ldl X, 0 + STACKSIZE($sp)
|
|
+ ldl INCX, 8 + STACKSIZE($sp)
|
|
+ ldl Y, 16 + STACKSIZE($sp)
|
|
+ ldl INCY, 24 + STACKSIZE($sp)
|
|
+ ldl BUFFER, 32 + STACKSIZE($sp)
|
|
+
|
|
+ fstd $f2, 0($sp)
|
|
+ fstd $f3, 8($sp)
|
|
+ fstd $f4, 16($sp)
|
|
+ fstd $f5, 24($sp)
|
|
+ fstd $f6, 32($sp)
|
|
+ fstd $f7, 40($sp)
|
|
+ fstd $f8, 48($sp)
|
|
+ fstd $f9, 56($sp)
|
|
+ fstd f20, 64($sp)
|
|
+
|
|
+ PROFCODE
|
|
+
|
|
+ cmple M, 0, $0
|
|
+ SXADDQ INCX, 0, INCX
|
|
+ cmple N, 0, $1
|
|
+ SXADDQ INCY, 0, INCY
|
|
+
|
|
+ or $0, $1, $0
|
|
+ bne $0, $L999
|
|
+
|
|
+ cmpeq INCX, SIZE, $0
|
|
+ mov X, X1
|
|
+ SXADDQ LDA, 0, LDA
|
|
+ bne $0, $L10
|
|
+
|
|
+ sra M, 3, I
|
|
+ mov BUFFER, Y1
|
|
+ mov BUFFER, X
|
|
+ ble I, $L05
|
|
+ .align 4
|
|
+
|
|
+$L02:
|
|
+ ldw $31, (PREFETCHSIZE + 0) * SIZE(X1)
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ LD a0, 0 * SIZE(X1)
|
|
+ addl X1, INCX, X1
|
|
+ LD a1, 0 * SIZE(X1)
|
|
+ addl X1, INCX, X1
|
|
+ LD a2, 0 * SIZE(X1)
|
|
+ addl X1, INCX, X1
|
|
+ LD a3, 0 * SIZE(X1)
|
|
+ addl X1, INCX, X1
|
|
+
|
|
+ ST a0, 0 * SIZE(Y1)
|
|
+ ST a1, 1 * SIZE(Y1)
|
|
+ ST a2, 2 * SIZE(Y1)
|
|
+ ST a3, 3 * SIZE(Y1)
|
|
+
|
|
+ LD a4, 0 * SIZE(X1)
|
|
+ addl X1, INCX, X1
|
|
+ LD a5, 0 * SIZE(X1)
|
|
+ addl X1, INCX, X1
|
|
+ LD a6, 0 * SIZE(X1)
|
|
+ addl X1, INCX, X1
|
|
+ LD a7, 0 * SIZE(X1)
|
|
+ addl X1, INCX, X1
|
|
+
|
|
+ ST a4, 4 * SIZE(Y1)
|
|
+ ST a5, 5 * SIZE(Y1)
|
|
+ ST a6, 6 * SIZE(Y1)
|
|
+ ST a7, 7 * SIZE(Y1)
|
|
+
|
|
+ ldi Y1, 8 * SIZE(Y1)
|
|
+ bgt I, $L02
|
|
+ .align 4
|
|
+
|
|
+$L05:
|
|
+ and M, 7, I
|
|
+ ble I, $L10
|
|
+ .align 4
|
|
+
|
|
+$L06:
|
|
+ LD a0, 0 * SIZE(X1)
|
|
+ addl X1, INCX, X1
|
|
+ ST a0, 0 * SIZE(Y1)
|
|
+ addl Y1, SIZE, Y1
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L06
|
|
+ .align 4
|
|
+
|
|
+$L10:
|
|
+ mov Y, Y1
|
|
+ fclr t0
|
|
+ unop
|
|
+ fclr t1
|
|
+
|
|
+ sra N, 2, J
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ ble J, $L20
|
|
+ .align 4
|
|
+
|
|
+$L11:
|
|
+ mov A, A1
|
|
+ fclr s0
|
|
+ addl A, LDA, A2
|
|
+ fclr s1
|
|
+
|
|
+ addl A2, LDA, A3
|
|
+ fclr s2
|
|
+ addl A3, LDA, A4
|
|
+ fclr s3
|
|
+
|
|
+ s4addl LDA, A, A
|
|
+ unop
|
|
+ mov X, X1
|
|
+ flds $f31, 3 * SIZE(Y)
|
|
+
|
|
+ sra M, 3, I
|
|
+ ble I, $L15
|
|
+
|
|
+ LD x0, 0 * SIZE(X1)
|
|
+ LD x1, 1 * SIZE(X1)
|
|
+ LD x2, 2 * SIZE(X1)
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 0 * SIZE(A2)
|
|
+ LD a2, 0 * SIZE(A3)
|
|
+ LD a3, 0 * SIZE(A4)
|
|
+ LD a4, 1 * SIZE(A1)
|
|
+ LD a5, 1 * SIZE(A2)
|
|
+ LD a6, 1 * SIZE(A3)
|
|
+ LD a7, 1 * SIZE(A4)
|
|
+ LD a8, 2 * SIZE(A1)
|
|
+ LD a9, 2 * SIZE(A2)
|
|
+ LD a10, 2 * SIZE(A3)
|
|
+ LD a11, 2 * SIZE(A4)
|
|
+ LD a12, 3 * SIZE(A1)
|
|
+ LD a13, 3 * SIZE(A2)
|
|
+ LD a14, 3 * SIZE(A3)
|
|
+ LD a15, 3 * SIZE(A4)
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ ADD s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ LD x3, 3 * SIZE(X1)
|
|
+ MUL x0, a0, t0
|
|
+
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ ldw $31, (PREFETCHSIZE + 0) * SIZE(A1)
|
|
+ MUL x0, a1, t1
|
|
+ LD a1, 4 * SIZE(A2)
|
|
+
|
|
+ ADD s2, t2, f20
|
|
+ fmov f20,s2
|
|
+ unop
|
|
+ MUL x0, a2, t2
|
|
+ LD a2, 4 * SIZE(A3)
|
|
+
|
|
+ ADD s3, t3, f20
|
|
+ fmov f20, s3
|
|
+ LD a0, 4 * SIZE(A1)
|
|
+ unop
|
|
+ MUL x0, a3, t3
|
|
+ LD a3, 4 * SIZE(A4)
|
|
+
|
|
+ ADD s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ LD x0, 4 * SIZE(X1)
|
|
+ MUL x1, a4, t0
|
|
+
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20, s1
|
|
+ LD a4, 5 * SIZE(A1)
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+ MUL x1, a5, t1
|
|
+ LD a5, 5 * SIZE(A2)
|
|
+
|
|
+ ADD s2, t2, f20
|
|
+ fmov f20,s2
|
|
+ #unop
|
|
+ MUL x1, a6, t2
|
|
+ LD a6, 5 * SIZE(A3)
|
|
+
|
|
+ ADD s3, t3, f20
|
|
+ fmov f20,s3
|
|
+ #unop
|
|
+ MUL x1, a7, t3
|
|
+ LD a7, 5 * SIZE(A4)
|
|
+
|
|
+ ADD s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ LD x1, 5 * SIZE(X1)
|
|
+ MUL x2, a8, t0
|
|
+ LD a8, -2 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ ldw $31, (PREFETCHSIZE + 0) * SIZE(A2)
|
|
+ MUL x2, a9, t1
|
|
+ LD a9, 6 * SIZE(A2)
|
|
+
|
|
+ ADD s2, t2, f20
|
|
+ fmov f20,s2
|
|
+ ldi A2, 8 * SIZE(A2)
|
|
+ MUL x2, a10, t2
|
|
+ LD a10, 6 * SIZE(A3)
|
|
+
|
|
+ ADD s3, t3, f20
|
|
+ fmov f20,s3
|
|
+ ldi A3, 8 * SIZE(A3)
|
|
+ MUL x2, a11, t3
|
|
+ LD a11, 6 * SIZE(A4)
|
|
+
|
|
+ ADD s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ LD x2, 6 * SIZE(X1)
|
|
+ MUL x3, a12, t0
|
|
+ LD a12, -1 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ ldi A4, 8 * SIZE(A4)
|
|
+ MUL x3, a13, t1
|
|
+ LD a13, -1 * SIZE(A2)
|
|
+
|
|
+ ADD s2, t2, f20
|
|
+ fmov f20,s2
|
|
+ unop
|
|
+ MUL x3, a14, t2
|
|
+ LD a14, -1 * SIZE(A3)
|
|
+
|
|
+ ADD s3, t3, f20
|
|
+ fmov f20,s3
|
|
+ unop
|
|
+ MUL x3, a15, t3
|
|
+ LD a15, -1 * SIZE(A4)
|
|
+
|
|
+ ADD s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ LD x3, 7 * SIZE(X1)
|
|
+ MUL x0, a0, t0
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ ldw $31, (PREFETCHSIZE - 8) * SIZE(A3)
|
|
+ MUL x0, a1, t1
|
|
+ LD a1, 0 * SIZE(A2)
|
|
+
|
|
+ ADD s2, t2, f20
|
|
+ fmov f20,s2
|
|
+ unop
|
|
+ MUL x0, a2, t2
|
|
+ LD a2, 0 * SIZE(A3)
|
|
+
|
|
+ ADD s3, t3, f20
|
|
+ fmov f20,s3
|
|
+ unop
|
|
+ MUL x0, a3, t3
|
|
+ LD a3, 0 * SIZE(A4)
|
|
+
|
|
+ ADD s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ LD x0, 8 * SIZE(X1)
|
|
+ MUL x1, a4, t0
|
|
+ LD a4, 1 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ unop
|
|
+ MUL x1, a5, t1
|
|
+ LD a5, 1 * SIZE(A2)
|
|
+
|
|
+ ADD s2, t2, f20
|
|
+ fmov f20,s2
|
|
+ unop
|
|
+ MUL x1, a6, t2
|
|
+ LD a6, 1 * SIZE(A3)
|
|
+
|
|
+ ADD s3, t3, f20
|
|
+ fmov f20,s3
|
|
+ unop
|
|
+ MUL x1, a7, t3
|
|
+ LD a7, 1 * SIZE(A4)
|
|
+
|
|
+ ADD s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ LD x1, 9 * SIZE(X1)
|
|
+ MUL x2, a8, t0
|
|
+ LD a8, 2 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ ldw $31, (PREFETCHSIZE - 8) * SIZE(A4)
|
|
+ MUL x2, a9, t1
|
|
+ LD a9, 2 * SIZE(A2)
|
|
+
|
|
+ ADD s2, t2, f20
|
|
+ fmov f20,s2
|
|
+ ldi X1, 8 * SIZE(X1)
|
|
+ MUL x2, a10, t2
|
|
+ LD a10, 2 * SIZE(A3)
|
|
+
|
|
+ ADD s3, t3, f20
|
|
+ fmov f20,s3
|
|
+ ldi I, -1(I)
|
|
+ MUL x2, a11, t3
|
|
+ LD a11, 2 * SIZE(A4)
|
|
+
|
|
+ ADD s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ LD x2, 2 * SIZE(X1)
|
|
+ MUL x3, a12, t0
|
|
+ LD a12, 3 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ ldw $31, (PREFETCHSIZE - 8) * SIZE(X1)
|
|
+ MUL x3, a13, t1
|
|
+ LD a13, 3 * SIZE(A2)
|
|
+
|
|
+ ADD s2, t2, f20
|
|
+ fmov f20,s2
|
|
+ unop
|
|
+ MUL x3, a14, t2
|
|
+ LD a14, 3 * SIZE(A3)
|
|
+
|
|
+ ADD s3, t3, f20
|
|
+ fmov f20,s3
|
|
+ MUL x3, a15, t3
|
|
+ LD a15, 3 * SIZE(A4)
|
|
+ bgt I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ ADD s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ LD x3, 3 * SIZE(X1)
|
|
+ MUL x0, a0, t0
|
|
+ LD a0, 4 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ #unop
|
|
+ MUL x0, a1, t1
|
|
+ LD a1, 4 * SIZE(A2)
|
|
+
|
|
+ ADD s2, t2, f20
|
|
+ fmov f20,s2
|
|
+ #unop
|
|
+ MUL x0, a2, t2
|
|
+ LD a2, 4 * SIZE(A3)
|
|
+
|
|
+ ADD s3, t3, f20
|
|
+ fmov f20,s3
|
|
+ #unop
|
|
+ MUL x0, a3, t3
|
|
+ LD a3, 4 * SIZE(A4)
|
|
+
|
|
+ ADD s0, t0, x0
|
|
+ fmov x0,s0
|
|
+ LD x0, 4 * SIZE(X1)
|
|
+ MUL x1, a4, t0
|
|
+ LD a4, 5 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ #unop
|
|
+ MUL x1, a5, t1
|
|
+ LD a5, 5 * SIZE(A2)
|
|
+
|
|
+ ADD s2, t2, f20
|
|
+ fmov f20,s2
|
|
+ #unop
|
|
+ MUL x1, a6, t2
|
|
+ LD a6, 5 * SIZE(A3)
|
|
+
|
|
+ ADD s3, t3, f20
|
|
+ fmov f20,s3
|
|
+ #unop
|
|
+ MUL x1, a7, t3
|
|
+ LD a7, 5 * SIZE(A4)
|
|
+
|
|
+ ADD s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ LD x1, 5 * SIZE(X1)
|
|
+ MUL x2, a8, t0
|
|
+ LD a8, 6 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ #unop
|
|
+ MUL x2, a9, t1
|
|
+ LD a9, 6 * SIZE(A2)
|
|
+
|
|
+ ADD s2, t2, f20
|
|
+ fmov f20,s2
|
|
+ #unop
|
|
+ MUL x2, a10, t2
|
|
+ LD a10, 6 * SIZE(A3)
|
|
+
|
|
+ ADD s3, t3, f20
|
|
+ fmov f20,s3
|
|
+ #unop
|
|
+ MUL x2, a11, t3
|
|
+ LD a11, 6 * SIZE(A4)
|
|
+
|
|
+ ADD s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ LD x2, 6 * SIZE(X1)
|
|
+ MUL x3, a12, t0
|
|
+ LD a12, 7 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+ MUL x3, a13, t1
|
|
+ LD a13, 7 * SIZE(A2)
|
|
+
|
|
+ ADD s2, t2, f20
|
|
+ fmov f20,s2
|
|
+ ldi A2, 8 * SIZE(A2)
|
|
+ MUL x3, a14, t2
|
|
+ LD a14, 7 * SIZE(A3)
|
|
+
|
|
+ ADD s3, t3, f20
|
|
+ fmov f20,s3
|
|
+ ldi A3, 8 * SIZE(A3)
|
|
+ MUL x3, a15, t3
|
|
+ LD a15, 7 * SIZE(A4)
|
|
+
|
|
+ ADD s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ LD x3, 7 * SIZE(X1)
|
|
+ MUL x0, a0, t0
|
|
+ unop
|
|
+
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ ldi X1, 8 * SIZE(X1)
|
|
+ MUL x0, a1, t1
|
|
+ ldi A4, 8 * SIZE(A4)
|
|
+
|
|
+ ADD s2, t2, f20
|
|
+ fmov f20,s2
|
|
+ MUL x0, a2, t2
|
|
+ ADD s3, t3, f20
|
|
+ fmov f20,s3
|
|
+ MUL x0, a3, t3
|
|
+
|
|
+ ADD s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ MUL x1, a4, t0
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ MUL x1, a5, t1
|
|
+
|
|
+ ADD s2, t2, f20
|
|
+ fmov f20,s2
|
|
+ MUL x1, a6, t2
|
|
+ ADD s3, t3, f20
|
|
+ fmov f20,s3
|
|
+ MUL x1, a7, t3
|
|
+
|
|
+ ADD s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ MUL x2, a8, t0
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ MUL x2, a9, t1
|
|
+
|
|
+ ADD s2, t2, f20
|
|
+ fmov f20,s2
|
|
+ MUL x2, a10, t2
|
|
+ ADD s3, t3, f20
|
|
+ fmov f20,s3
|
|
+ MUL x2, a11, t3
|
|
+
|
|
+ ADD s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ MUL x3, a12, t0
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ MUL x3, a13, t1
|
|
+
|
|
+ ADD s2, t2, f20
|
|
+ fmov f20,s2
|
|
+ MUL x3, a14, t2
|
|
+ ADD s3, t3, f20
|
|
+ fmov f20,s3
|
|
+ MUL x3, a15, t3
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and M, 7, I
|
|
+ ble I, $L18
|
|
+
|
|
+ LD x0, 0 * SIZE(X1)
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 0 * SIZE(A2)
|
|
+ LD a2, 0 * SIZE(A3)
|
|
+ LD a3, 0 * SIZE(A4)
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L17
|
|
+ .align 4
|
|
+
|
|
+$L16:
|
|
+	ADD	s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ ldi A4, 1 * SIZE(A4)
|
|
+ MUL x0, a0, t0
|
|
+ LD a0, 1 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ ldi A1, 1 * SIZE(A1)
|
|
+ MUL x0, a1, t1
|
|
+ LD a1, 1 * SIZE(A2)
|
|
+
|
|
+ ADD s2, t2, f20
|
|
+ fmov f20,s2
|
|
+ ldi A2, 1 * SIZE(A2)
|
|
+ MUL x0, a2, t2
|
|
+ LD a2, 1 * SIZE(A3)
|
|
+
|
|
+ ADD s3, t3, f20
|
|
+ fmov f20,s3
|
|
+ ldi A3, 1 * SIZE(A3)
|
|
+ MUL x0, a3, t3
|
|
+ LD a3, 0 * SIZE(A4)
|
|
+
|
|
+ LD x0, 1 * SIZE(X1)
|
|
+ ldi X1, 1 * SIZE(X1)
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L16
|
|
+ .align 4
|
|
+
|
|
+$L17:
|
|
+	ADD	s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ MUL x0, a0, t0
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ MUL x0, a1, t1
|
|
+
|
|
+ ADD s2, t2, f20
|
|
+ fmov f20,s2
|
|
+ MUL x0, a2, t2
|
|
+ ADD s3, t3, f20
|
|
+ fmov f20,s3
|
|
+ MUL x0, a3, t3
|
|
+ .align 4
|
|
+
|
|
+$L18:
|
|
+ LD a0, 0 * SIZE(Y)
|
|
+ addl Y, INCY, Y
|
|
+ LD a1, 0 * SIZE(Y)
|
|
+ addl Y, INCY, Y
|
|
+ LD a2, 0 * SIZE(Y)
|
|
+ addl Y, INCY, Y
|
|
+ LD a3, 0 * SIZE(Y)
|
|
+ addl Y, INCY, Y
|
|
+
|
|
+	ADD	s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ ADD s2, t2, f20
|
|
+ fmov f20,s2
|
|
+ ADD s3, t3, f20
|
|
+ fmov f20,s3
|
|
+
|
|
+	MUL	alpha, s0, f20
|
|
+ fmov f20,s0
|
|
+ MUL alpha, s1, f20
|
|
+ fmov f20,s1
|
|
+ MUL alpha, s2, f20
|
|
+ fmov f20,s2
|
|
+ MUL alpha, s3, f20
|
|
+ fmov f20,s3
|
|
+
|
|
+	ADD	a0, s0, f20
|
|
+ fmov f20,a0
|
|
+ fclr t0
|
|
+ ADD a1, s1, f20
|
|
+ fmov f20,a1
|
|
+ fclr t1
|
|
+ ADD a2, s2, f20
|
|
+ fmov f20,a2
|
|
+ fclr t2
|
|
+ ADD a3, s3, f20
|
|
+ fmov f20,a3
|
|
+ fclr t3
|
|
+
|
|
+ ST a0, 0 * SIZE(Y1)
|
|
+ addl Y1, INCY, Y1
|
|
+ ST a1, 0 * SIZE(Y1)
|
|
+ addl Y1, INCY, Y1
|
|
+ ST a2, 0 * SIZE(Y1)
|
|
+ addl Y1, INCY, Y1
|
|
+ ST a3, 0 * SIZE(Y1)
|
|
+ addl Y1, INCY, Y1
|
|
+
|
|
+ ldi J, -1(J)
|
|
+ bgt J, $L11
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ and N, 2, J
|
|
+ ble J, $L30
|
|
+ mov A, A1
|
|
+ addl A, LDA, A2
|
|
+
|
|
+ addl A2, LDA, A
|
|
+ fclr s0
|
|
+ mov X, X1
|
|
+ fclr s1
|
|
+
|
|
+ sra M, 3, I
|
|
+ fclr s2
|
|
+ fclr s3
|
|
+ ble I, $L25
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 0 * SIZE(A2)
|
|
+ LD a2, 1 * SIZE(A1)
|
|
+ LD a3, 1 * SIZE(A2)
|
|
+ LD a4, 2 * SIZE(A1)
|
|
+ LD a5, 2 * SIZE(A2)
|
|
+ LD a6, 3 * SIZE(A1)
|
|
+ LD a7, 3 * SIZE(A2)
|
|
+
|
|
+ LD a8, 4 * SIZE(A1)
|
|
+ LD a9, 4 * SIZE(A2)
|
|
+ LD a10, 5 * SIZE(A1)
|
|
+ LD a11, 5 * SIZE(A2)
|
|
+ LD a12, 6 * SIZE(A1)
|
|
+ LD a13, 6 * SIZE(A2)
|
|
+ LD a14, 7 * SIZE(A1)
|
|
+ LD a15, 7 * SIZE(A2)
|
|
+
|
|
+ LD x0, 0 * SIZE(X1)
|
|
+ LD x1, 1 * SIZE(X1)
|
|
+ LD x2, 2 * SIZE(X1)
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L23
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ ADD s0, t0, x3
|
|
+ fmov x3,s0
|
|
+ LD x3, 3 * SIZE(X1)
|
|
+ MUL x0, a0, t0
|
|
+ LD a0, 8 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ ldw $31, (PREFETCHSIZE + 0) * SIZE(A1)
|
|
+ MUL x0, a1, t1
|
|
+ LD a1, 8 * SIZE(A2)
|
|
+
|
|
+ ADD s0, t2, x0
|
|
+ fmov x0,s0
|
|
+ LD x0, 4 * SIZE(X1)
|
|
+ MUL x1, a2, t2
|
|
+ LD a2, 9 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t3, f20
|
|
+ fmov f20,s1
|
|
+ #unop
|
|
+ MUL x1, a3, t3
|
|
+ LD a3, 9 * SIZE(A2)
|
|
+
|
|
+ ADD s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ LD x1, 5 * SIZE(X1)
|
|
+ MUL x2, a4, t0
|
|
+ LD a4, 10 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ ldi I, -1(I)
|
|
+ MUL x2, a5, t1
|
|
+ LD a5, 10 * SIZE(A2)
|
|
+
|
|
+ ADD s0, t2, f20
|
|
+ fmov f20,s0
|
|
+ LD x2, 6 * SIZE(X1)
|
|
+ MUL x3, a6, t2
|
|
+ LD a6, 11 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t3, f20
|
|
+ fmov f20,s1
|
|
+ ldi X1, 8 * SIZE(X1)
|
|
+ MUL x3, a7, t3
|
|
+ LD a7, 11 * SIZE(A2)
|
|
+
|
|
+ ADD s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ LD x3, -1 * SIZE(X1)
|
|
+ MUL x0, a8, t0
|
|
+ LD a8, 12 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ ldw $31, (PREFETCHSIZE + 0) * SIZE(A2)
|
|
+ MUL x0, a9, t1
|
|
+ LD a9, 12 * SIZE(A2)
|
|
+
|
|
+ ADD s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ LD x0, 0 * SIZE(X1)
|
|
+ MUL x1, a10, t0
|
|
+ LD a10, 13 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+ MUL x1, a11, t1
|
|
+ LD a11, 13 * SIZE(A2)
|
|
+
|
|
+ ADD s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ LD x1, 1 * SIZE(X1)
|
|
+ MUL x2, a12, t0
|
|
+ LD a12, 6 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ MUL x2, a13, t1
|
|
+ LD a13, 14 * SIZE(A2)
|
|
+ ldi A2, 8 * SIZE(A2)
|
|
+
|
|
+ ADD s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ LD x2, 2 * SIZE(X1)
|
|
+ MUL x3, a14, t0
|
|
+ LD a14, 7 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ MUL x3, a15, t1
|
|
+ LD a15, 7 * SIZE(A2)
|
|
+ bgt I, $L22
|
|
+ .align 4
|
|
+
|
|
+$L23:
|
|
+ ADD s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ LD x3, 3 * SIZE(X1)
|
|
+ MUL x0, a0, t0
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ unop
|
|
+ MUL x0, a1, t1
|
|
+ unop
|
|
+
|
|
+ ADD s0, t2, f20
|
|
+ fmov f20,s0
|
|
+ LD x0, 4 * SIZE(X1)
|
|
+ MUL x1, a2, t2
|
|
+ ldi A2, 8 * SIZE(A2)
|
|
+
|
|
+ ADD s1, t3, f20
|
|
+ fmov f20,s1
|
|
+ unop
|
|
+ MUL x1, a3, t3
|
|
+ unop
|
|
+
|
|
+ ADD s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ LD x1, 5 * SIZE(X1)
|
|
+ MUL x2, a4, t0
|
|
+ unop
|
|
+
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ unop
|
|
+ MUL x2, a5, t1
|
|
+ unop
|
|
+
|
|
+ ADD s0, t2, f20
|
|
+ fmov f20,s0
|
|
+ LD x2, 6 * SIZE(X1)
|
|
+ MUL x3, a6, t2
|
|
+ unop
|
|
+
|
|
+ ADD s1, t3, f20
|
|
+ fmov f20,s1
|
|
+ unop
|
|
+ MUL x3, a7, t3
|
|
+ unop
|
|
+
|
|
+ ADD s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ LD x3, 7 * SIZE(X1)
|
|
+ MUL x0, a8, t0
|
|
+ ldi X1, 8 * SIZE(X1)
|
|
+
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ unop
|
|
+ MUL x0, a9, t1
|
|
+ unop
|
|
+
|
|
+ ADD s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ MUL x1, a10, t0
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ MUL x1, a11, t1
|
|
+
|
|
+ ADD s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ MUL x2, a12, t0
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ MUL x2, a13, t1
|
|
+
|
|
+ ADD s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ MUL x3, a14, t0
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ MUL x3, a15, t1
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ and M, 7, I
|
|
+ ble I, $L28
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 0 * SIZE(A2)
|
|
+ LD x0, 0 * SIZE(X1)
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L27
|
|
+ .align 4
|
|
+
|
|
+$L26:
|
|
+	ADD	s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ ldi A2, 1 * SIZE(A2)
|
|
+ MUL x0, a0, t0
|
|
+ LD a0, 1 * SIZE(A1)
|
|
+
|
|
+	ADD	s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ ldi A1, 1 * SIZE(A1)
|
|
+ MUL x0, a1, t1
|
|
+ LD a1, 0 * SIZE(A2)
|
|
+
|
|
+ LD x0, 1 * SIZE(X1)
|
|
+ ldi X1, 1 * SIZE(X1)
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L26
|
|
+ .align 4
|
|
+
|
|
+$L27:
|
|
+ ADD s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ MUL x0, a0, t0
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ MUL x0, a1, t1
|
|
+ .align 4
|
|
+
|
|
+$L28:
|
|
+ LD a0, 0 * SIZE(Y)
|
|
+ addl Y, INCY, Y
|
|
+ LD a1, 0 * SIZE(Y)
|
|
+ addl Y, INCY, Y
|
|
+
|
|
+ ADD s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ ADD s2, t2, f20
|
|
+ fmov f20,s2
|
|
+ ADD s3, t3, f20
|
|
+ fmov f20,s3
|
|
+
|
|
+ ADD s0, s2, f20
|
|
+ fmov f20,s0
|
|
+ ADD s1, s3, f20
|
|
+ fmov f20,s1
|
|
+
|
|
+ MUL alpha, s0, f20
|
|
+ fmov f20,s0
|
|
+	MUL	alpha, s1, f20
|
|
+ fmov f20,s1
|
|
+
|
|
+ ADD a0, s0, f20
|
|
+ fmov f20,a0
|
|
+ ADD a1, s1, f20
|
|
+ fmov f20,a1
|
|
+
|
|
+ ST a0, 0 * SIZE(Y1)
|
|
+ fclr t0
|
|
+ addl Y1, INCY, Y1
|
|
+ fclr t1
|
|
+
|
|
+ ST a1, 0 * SIZE(Y1)
|
|
+ fclr t2
|
|
+ addl Y1, INCY, Y1
|
|
+ fclr t3
|
|
+ .align 4
|
|
+
|
|
+$L30:
|
|
+ blbc N, $L999
|
|
+
|
|
+ mov A, A1
|
|
+ fclr s0
|
|
+ mov X, X1
|
|
+ fclr s1
|
|
+
|
|
+ sra M, 3, I
|
|
+ fclr s2
|
|
+ fclr s3
|
|
+ ble I, $L35
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+ LD a8, 0 * SIZE(X1)
|
|
+ LD a9, 1 * SIZE(X1)
|
|
+
|
|
+ LD a2, 2 * SIZE(A1)
|
|
+ LD a3, 3 * SIZE(A1)
|
|
+ LD a10, 2 * SIZE(X1)
|
|
+ LD a11, 3 * SIZE(X1)
|
|
+
|
|
+ LD a4, 4 * SIZE(A1)
|
|
+ LD a5, 5 * SIZE(A1)
|
|
+ LD a12, 4 * SIZE(X1)
|
|
+ LD a13, 5 * SIZE(X1)
|
|
+
|
|
+ LD a6, 6 * SIZE(A1)
|
|
+ LD a7, 7 * SIZE(A1)
|
|
+ LD a14, 6 * SIZE(X1)
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L33
|
|
+ .align 4
|
|
+
|
|
+$L32:
|
|
+ ADD s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ LD a15, 7 * SIZE(X1)
|
|
+ MUL a0, a8, f20
|
|
+ fmov f20,t0
|
|
+ LD a0, 8 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ LD a8, 8 * SIZE(X1)
|
|
+ MUL a1, a9, t1
|
|
+ LD a1, 9 * SIZE(A1)
|
|
+
|
|
+ ADD s2, t2, f20
|
|
+ fmov f20,s2
|
|
+ LD a9, 9 * SIZE(X1)
|
|
+ MUL a2, a10, t2
|
|
+ LD a2, 10 * SIZE(A1)
|
|
+
|
|
+ ADD s3, t3, f20
|
|
+ fmov f20,s3
|
|
+ LD a10, 10 * SIZE(X1)
|
|
+ MUL a3, a11, t3
|
|
+ LD a3, 11 * SIZE(A1)
|
|
+
|
|
+ ADD s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ LD a11, 11 * SIZE(X1)
|
|
+ MUL a4, a12, t0
|
|
+ LD a4, 12 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ LD a12, 12 * SIZE(X1)
|
|
+ MUL a5, a13, t1
|
|
+ LD a5, 13 * SIZE(A1)
|
|
+
|
|
+ ADD s2, t2, f20
|
|
+ fmov f20,s2
|
|
+ LD a13, 13 * SIZE(X1)
|
|
+ MUL a6, a14, t2
|
|
+ LD a6, 14 * SIZE(A1)
|
|
+
|
|
+ ADD s3, t3, f20
|
|
+ fmov f20,s3
|
|
+ LD a14, 14 * SIZE(X1)
|
|
+ MUL a7, a15, t3
|
|
+ LD a7, 15 * SIZE(A1)
|
|
+
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+ ldi I, -1(I)
|
|
+ ldi X1, 8 * SIZE(X1)
|
|
+ bgt I, $L32
|
|
+ .align 4
|
|
+
|
|
+$L33:
|
|
+ ADD s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ LD a15, 7 * SIZE(X1)
|
|
+ MUL a0, a8, t0
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ unop
|
|
+ MUL a1, a9, t1
|
|
+ ldi X1, 8 * SIZE(X1)
|
|
+
|
|
+ ADD s2, t2, f20
|
|
+ fmov f20,s2
|
|
+ MUL a2, a10, t2
|
|
+ ADD s3, t3, f20
|
|
+ fmov f20,s3
|
|
+ MUL a3, a11, t3
|
|
+
|
|
+ ADD s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ MUL a4, a12, t0
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ MUL a5, a13, t1
|
|
+
|
|
+ ADD s2, t2, f20
|
|
+ fmov f20,s2
|
|
+ MUL a6, a14, t2
|
|
+ ADD s3, t3, f20
|
|
+ fmov f20,s3
|
|
+ MUL a7, a15, t3
|
|
+ .align 4
|
|
+
|
|
+$L35:
|
|
+ and M, 7, I
|
|
+ ble I, $L38
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD x0, 0 * SIZE(X1)
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L37
|
|
+ .align 4
|
|
+
|
|
+$L36:
|
|
+	ADD	s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ MUL x0, a0, t0
|
|
+ LD a0, 1 * SIZE(A1)
|
|
+ LD x0, 1 * SIZE(X1)
|
|
+
|
|
+ ldi A1, 1 * SIZE(A1)
|
|
+ ldi X1, 1 * SIZE(X1)
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L36
|
|
+ .align 4
|
|
+
|
|
+$L37:
|
|
+	ADD	s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ MUL x0, a0, t0
|
|
+ .align 4
|
|
+
|
|
+$L38:
|
|
+ LD a0, 0 * SIZE(Y)
|
|
+
|
|
+	ADD	s0, t0, f20
|
|
+ fmov f20,s0
|
|
+ ADD s1, t1, f20
|
|
+ fmov f20,s1
|
|
+ ADD s2, t2, f20
|
|
+ fmov f20,s2
|
|
+ ADD s3, t3, f20
|
|
+ fmov f20,s3
|
|
+
|
|
+ ADD s0, s2, f20
|
|
+ fmov f20,s0
|
|
+ ADD s1, s3, f20
|
|
+ fmov f20,s1
|
|
+ ADD s0, s1, f20
|
|
+ fmov f20,s0
|
|
+
|
|
+ MUL alpha, s0, f20
|
|
+ fmov f20,s0
|
|
+ ADD a0, s0, f20
|
|
+ fmov f20,a0
|
|
+
|
|
+ ST a0, 0 * SIZE(Y1)
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 8($sp)
|
|
+ fldd $f4, 16($sp)
|
|
+ fldd $f5, 24($sp)
|
|
+ fldd $f6, 32($sp)
|
|
+ fldd $f7, 40($sp)
|
|
+ fldd $f8, 48($sp)
|
|
+ fldd $f9, 56($sp)
|
|
+ fldd f20, 64($sp)
|
|
+
|
|
+ ldi $sp, STACKSIZE($sp)
|
|
+ ret
|
|
+ EPILOGUE
|
|
diff --git a/kernel/sw_64/gemv_t.S.bak b/kernel/sw_64/gemv_t.S.bak
|
|
new file mode 100644
|
|
index 0000000..068e463
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/gemv_t.S.bak
|
|
@@ -0,0 +1,1061 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define STACKSIZE 64
|
|
+#define PREFETCHSIZE 32
|
|
+
|
|
+#define M $16
|
|
+#define N $17
|
|
+#define A $20
|
|
+#define LDA $21
|
|
+
|
|
+#define X $18
|
|
+#define INCX $19
|
|
+#define Y $22
|
|
+#define INCY $23
|
|
+
|
|
+#define BUFFER $24
|
|
+
|
|
+#define I $25
|
|
+#define J $27
|
|
+
|
|
+#define X1 $3
|
|
+#define Y1 $4
|
|
+
|
|
+#define A1 $5
|
|
+#define A2 $6
|
|
+#define A3 $7
|
|
+#define A4 $8
|
|
+
|
|
+#define alpha $f19
|
|
+
|
|
+#define s0 $f0
|
|
+#define s1 $f1
|
|
+#define s2 $f10
|
|
+#define s3 $f11
|
|
+
|
|
+#define t0 $f12
|
|
+#define t1 $f13
|
|
+#define t2 $f14
|
|
+#define t3 $f15
|
|
+
|
|
+#define x0 $f16
|
|
+#define x1 $f17
|
|
+#define x2 $f18
|
|
+#define x3 $f21
|
|
+
|
|
+#define a0 $f22
|
|
+#define a1 $f23
|
|
+#define a2 $f24
|
|
+#define a3 $f25
|
|
+#define a4 $f26
|
|
+#define a5 $f27
|
|
+#define a6 $f28
|
|
+#define a7 $f29
|
|
+
|
|
+#define a8 $f2
|
|
+#define a9 $f3
|
|
+#define a10 $f4
|
|
+#define a11 $f5
|
|
+#define a12 $f6
|
|
+#define a13 $f7
|
|
+#define a14 $f8
|
|
+#define a15 $f9
|
|
+
|
|
+ PROLOGUE
|
|
+
|
|
+ ldi $sp, -STACKSIZE($sp)
|
|
+ ldl X, 0 + STACKSIZE($sp)
|
|
+ ldl INCX, 8 + STACKSIZE($sp)
|
|
+ ldl Y, 16 + STACKSIZE($sp)
|
|
+ ldl INCY, 24 + STACKSIZE($sp)
|
|
+ ldl BUFFER, 32 + STACKSIZE($sp)
|
|
+
|
|
+ fstd $f2, 0($sp)
|
|
+ fstd $f3, 8($sp)
|
|
+ fstd $f4, 16($sp)
|
|
+ fstd $f5, 24($sp)
|
|
+ fstd $f6, 32($sp)
|
|
+ fstd $f7, 40($sp)
|
|
+ fstd $f8, 48($sp)
|
|
+ fstd $f9, 56($sp)
|
|
+
|
|
+ PROFCODE
|
|
+
|
|
+ cmple M, 0, $0
|
|
+ SXADDQ INCX, 0, INCX
|
|
+ cmple N, 0, $1
|
|
+ SXADDQ INCY, 0, INCY
|
|
+
|
|
+ or $0, $1, $0
|
|
+ bne $0, $L999
|
|
+
|
|
+ cmpeq INCX, SIZE, $0
|
|
+ mov X, X1
|
|
+ SXADDQ LDA, 0, LDA
|
|
+ bne $0, $L10
|
|
+
|
|
+ sra M, 3, I
|
|
+ mov BUFFER, Y1
|
|
+ mov BUFFER, X
|
|
+ ble I, $L05
|
|
+ .align 4
|
|
+
|
|
+$L02:
|
|
+ fillcs (PREFETCHSIZE + 0) * SIZE(X1)
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ LD a0, 0 * SIZE(X1)
|
|
+ addl X1, INCX, X1
|
|
+ LD a1, 0 * SIZE(X1)
|
|
+ addl X1, INCX, X1
|
|
+ LD a2, 0 * SIZE(X1)
|
|
+ addl X1, INCX, X1
|
|
+ LD a3, 0 * SIZE(X1)
|
|
+ addl X1, INCX, X1
|
|
+
|
|
+ ST a0, 0 * SIZE(Y1)
|
|
+ ST a1, 1 * SIZE(Y1)
|
|
+ ST a2, 2 * SIZE(Y1)
|
|
+ ST a3, 3 * SIZE(Y1)
|
|
+
|
|
+ LD a4, 0 * SIZE(X1)
|
|
+ addl X1, INCX, X1
|
|
+ LD a5, 0 * SIZE(X1)
|
|
+ addl X1, INCX, X1
|
|
+ LD a6, 0 * SIZE(X1)
|
|
+ addl X1, INCX, X1
|
|
+ LD a7, 0 * SIZE(X1)
|
|
+ addl X1, INCX, X1
|
|
+
|
|
+ ST a4, 4 * SIZE(Y1)
|
|
+ ST a5, 5 * SIZE(Y1)
|
|
+ ST a6, 6 * SIZE(Y1)
|
|
+ ST a7, 7 * SIZE(Y1)
|
|
+
|
|
+ ldi Y1, 8 * SIZE(Y1)
|
|
+ bgt I, $L02
|
|
+ .align 4
|
|
+
|
|
+$L05:
|
|
+ and M, 7, I
|
|
+ ble I, $L10
|
|
+ .align 4
|
|
+
|
|
+$L06:
|
|
+ LD a0, 0 * SIZE(X1)
|
|
+ addl X1, INCX, X1
|
|
+ ST a0, 0 * SIZE(Y1)
|
|
+ addl Y1, SIZE, Y1
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L06
|
|
+ .align 4
|
|
+
|
|
+$L10:
|
|
+ mov Y, Y1
|
|
+ fclr t0
|
|
+ unop
|
|
+ fclr t1
|
|
+
|
|
+ sra N, 2, J
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ ble J, $L20
|
|
+ .align 4
|
|
+
|
|
+$L11:
|
|
+ mov A, A1
|
|
+ fclr s0
|
|
+ addl A, LDA, A2
|
|
+ fclr s1
|
|
+
|
|
+ addl A2, LDA, A3
|
|
+ fclr s2
|
|
+ addl A3, LDA, A4
|
|
+ fclr s3
|
|
+
|
|
+ s4addl LDA, A, A
|
|
+ unop
|
|
+ mov X, X1
|
|
+ fillcs 3 * SIZE(Y)
|
|
+
|
|
+ sra M, 3, I
|
|
+ ble I, $L15
|
|
+
|
|
+ LD x0, 0 * SIZE(X1)
|
|
+ LD x1, 1 * SIZE(X1)
|
|
+ LD x2, 2 * SIZE(X1)
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 0 * SIZE(A2)
|
|
+ LD a2, 0 * SIZE(A3)
|
|
+ LD a3, 0 * SIZE(A4)
|
|
+ LD a4, 1 * SIZE(A1)
|
|
+ LD a5, 1 * SIZE(A2)
|
|
+ LD a6, 1 * SIZE(A3)
|
|
+ LD a7, 1 * SIZE(A4)
|
|
+ LD a8, 2 * SIZE(A1)
|
|
+ LD a9, 2 * SIZE(A2)
|
|
+ LD a10, 2 * SIZE(A3)
|
|
+ LD a11, 2 * SIZE(A4)
|
|
+ LD a12, 3 * SIZE(A1)
|
|
+ LD a13, 3 * SIZE(A2)
|
|
+ LD a14, 3 * SIZE(A3)
|
|
+ LD a15, 3 * SIZE(A4)
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ ADD s0, t0, s0
|
|
+ LD x3, 3 * SIZE(X1)
|
|
+ MUL x0, a0, t0
|
|
+ LD a0, 4 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ fillcs (PREFETCHSIZE + 0) * SIZE(A1)
|
|
+ MUL x0, a1, t1
|
|
+ LD a1, 4 * SIZE(A2)
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ unop
|
|
+ MUL x0, a2, t2
|
|
+ LD a2, 4 * SIZE(A3)
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ unop
|
|
+ MUL x0, a3, t3
|
|
+ LD a3, 4 * SIZE(A4)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD x0, 4 * SIZE(X1)
|
|
+ MUL x1, a4, t0
|
|
+ LD a4, 5 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+ MUL x1, a5, t1
|
|
+ LD a5, 5 * SIZE(A2)
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ unop
|
|
+ MUL x1, a6, t2
|
|
+ LD a6, 5 * SIZE(A3)
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ unop
|
|
+ MUL x1, a7, t3
|
|
+ LD a7, 5 * SIZE(A4)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD x1, 5 * SIZE(X1)
|
|
+ MUL x2, a8, t0
|
|
+ LD a8, -2 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ fillcs (PREFETCHSIZE + 0) * SIZE(A2)
|
|
+ MUL x2, a9, t1
|
|
+ LD a9, 6 * SIZE(A2)
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ ldi A2, 8 * SIZE(A2)
|
|
+ MUL x2, a10, t2
|
|
+ LD a10, 6 * SIZE(A3)
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ ldi A3, 8 * SIZE(A3)
|
|
+ MUL x2, a11, t3
|
|
+ LD a11, 6 * SIZE(A4)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD x2, 6 * SIZE(X1)
|
|
+ MUL x3, a12, t0
|
|
+ LD a12, -1 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ ldi A4, 8 * SIZE(A4)
|
|
+ MUL x3, a13, t1
|
|
+ LD a13, -1 * SIZE(A2)
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ unop
|
|
+ MUL x3, a14, t2
|
|
+ LD a14, -1 * SIZE(A3)
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ unop
|
|
+ MUL x3, a15, t3
|
|
+ LD a15, -1 * SIZE(A4)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD x3, 7 * SIZE(X1)
|
|
+ MUL x0, a0, t0
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ fillcs (PREFETCHSIZE - 8) * SIZE(A3)
|
|
+ MUL x0, a1, t1
|
|
+ LD a1, 0 * SIZE(A2)
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ unop
|
|
+ MUL x0, a2, t2
|
|
+ LD a2, 0 * SIZE(A3)
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ unop
|
|
+ MUL x0, a3, t3
|
|
+ LD a3, 0 * SIZE(A4)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD x0, 8 * SIZE(X1)
|
|
+ MUL x1, a4, t0
|
|
+ LD a4, 1 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ unop
|
|
+ MUL x1, a5, t1
|
|
+ LD a5, 1 * SIZE(A2)
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ unop
|
|
+ MUL x1, a6, t2
|
|
+ LD a6, 1 * SIZE(A3)
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ unop
|
|
+ MUL x1, a7, t3
|
|
+ LD a7, 1 * SIZE(A4)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD x1, 9 * SIZE(X1)
|
|
+ MUL x2, a8, t0
|
|
+ LD a8, 2 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ fillcs (PREFETCHSIZE - 8) * SIZE(A4)
|
|
+ MUL x2, a9, t1
|
|
+ LD a9, 2 * SIZE(A2)
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ ldi X1, 8 * SIZE(X1)
|
|
+ MUL x2, a10, t2
|
|
+ LD a10, 2 * SIZE(A3)
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ ldi I, -1(I)
|
|
+ MUL x2, a11, t3
|
|
+ LD a11, 2 * SIZE(A4)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD x2, 2 * SIZE(X1)
|
|
+ MUL x3, a12, t0
|
|
+ LD a12, 3 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ fillcs (PREFETCHSIZE - 8) * SIZE(X1)
|
|
+ MUL x3, a13, t1
|
|
+ LD a13, 3 * SIZE(A2)
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ unop
|
|
+ MUL x3, a14, t2
|
|
+ LD a14, 3 * SIZE(A3)
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ MUL x3, a15, t3
|
|
+ LD a15, 3 * SIZE(A4)
|
|
+ bgt I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ ADD s0, t0, s0
|
|
+ LD x3, 3 * SIZE(X1)
|
|
+ MUL x0, a0, t0
|
|
+ LD a0, 4 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ unop
|
|
+ MUL x0, a1, t1
|
|
+ LD a1, 4 * SIZE(A2)
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ unop
|
|
+ MUL x0, a2, t2
|
|
+ LD a2, 4 * SIZE(A3)
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ unop
|
|
+ MUL x0, a3, t3
|
|
+ LD a3, 4 * SIZE(A4)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD x0, 4 * SIZE(X1)
|
|
+ MUL x1, a4, t0
|
|
+ LD a4, 5 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ unop
|
|
+ MUL x1, a5, t1
|
|
+ LD a5, 5 * SIZE(A2)
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ unop
|
|
+ MUL x1, a6, t2
|
|
+ LD a6, 5 * SIZE(A3)
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ unop
|
|
+ MUL x1, a7, t3
|
|
+ LD a7, 5 * SIZE(A4)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD x1, 5 * SIZE(X1)
|
|
+ MUL x2, a8, t0
|
|
+ LD a8, 6 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ unop
|
|
+ MUL x2, a9, t1
|
|
+ LD a9, 6 * SIZE(A2)
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ unop
|
|
+ MUL x2, a10, t2
|
|
+ LD a10, 6 * SIZE(A3)
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ unop
|
|
+ MUL x2, a11, t3
|
|
+ LD a11, 6 * SIZE(A4)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD x2, 6 * SIZE(X1)
|
|
+ MUL x3, a12, t0
|
|
+ LD a12, 7 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+ MUL x3, a13, t1
|
|
+ LD a13, 7 * SIZE(A2)
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ ldi A2, 8 * SIZE(A2)
|
|
+ MUL x3, a14, t2
|
|
+ LD a14, 7 * SIZE(A3)
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ ldi A3, 8 * SIZE(A3)
|
|
+ MUL x3, a15, t3
|
|
+ LD a15, 7 * SIZE(A4)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD x3, 7 * SIZE(X1)
|
|
+ MUL x0, a0, t0
|
|
+ unop
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ ldi X1, 8 * SIZE(X1)
|
|
+ MUL x0, a1, t1
|
|
+ ldi A4, 8 * SIZE(A4)
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ MUL x0, a2, t2
|
|
+ ADD s3, t3, s3
|
|
+ MUL x0, a3, t3
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ MUL x1, a4, t0
|
|
+ ADD s1, t1, s1
|
|
+ MUL x1, a5, t1
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ MUL x1, a6, t2
|
|
+ ADD s3, t3, s3
|
|
+ MUL x1, a7, t3
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ MUL x2, a8, t0
|
|
+ ADD s1, t1, s1
|
|
+ MUL x2, a9, t1
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ MUL x2, a10, t2
|
|
+ ADD s3, t3, s3
|
|
+ MUL x2, a11, t3
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ MUL x3, a12, t0
|
|
+ ADD s1, t1, s1
|
|
+ MUL x3, a13, t1
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ MUL x3, a14, t2
|
|
+ ADD s3, t3, s3
|
|
+ MUL x3, a15, t3
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and M, 7, I
|
|
+ ble I, $L18
|
|
+
|
|
+ LD x0, 0 * SIZE(X1)
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 0 * SIZE(A2)
|
|
+ LD a2, 0 * SIZE(A3)
|
|
+ LD a3, 0 * SIZE(A4)
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L17
|
|
+ .align 4
|
|
+
|
|
+$L16:
|
|
+ ADD s0, t0, s0
|
|
+ ldi A4, 1 * SIZE(A4)
|
|
+ MUL x0, a0, t0
|
|
+ LD a0, 1 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ ldi A1, 1 * SIZE(A1)
|
|
+ MUL x0, a1, t1
|
|
+ LD a1, 1 * SIZE(A2)
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ ldi A2, 1 * SIZE(A2)
|
|
+ MUL x0, a2, t2
|
|
+ LD a2, 1 * SIZE(A3)
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ ldi A3, 1 * SIZE(A3)
|
|
+ MUL x0, a3, t3
|
|
+ LD a3, 0 * SIZE(A4)
|
|
+
|
|
+ LD x0, 1 * SIZE(X1)
|
|
+ ldi X1, 1 * SIZE(X1)
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L16
|
|
+ .align 4
|
|
+
|
|
+$L17:
|
|
+ ADD s0, t0, s0
|
|
+ MUL x0, a0, t0
|
|
+ ADD s1, t1, s1
|
|
+ MUL x0, a1, t1
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ MUL x0, a2, t2
|
|
+ ADD s3, t3, s3
|
|
+ MUL x0, a3, t3
|
|
+ .align 4
|
|
+
|
|
+$L18:
|
|
+ LD a0, 0 * SIZE(Y)
|
|
+ addl Y, INCY, Y
|
|
+ LD a1, 0 * SIZE(Y)
|
|
+ addl Y, INCY, Y
|
|
+ LD a2, 0 * SIZE(Y)
|
|
+ addl Y, INCY, Y
|
|
+ LD a3, 0 * SIZE(Y)
|
|
+ addl Y, INCY, Y
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ ADD s1, t1, s1
|
|
+ ADD s2, t2, s2
|
|
+ ADD s3, t3, s3
|
|
+
|
|
+ MUL alpha, s0, s0
|
|
+ MUL alpha, s1, s1
|
|
+ MUL alpha, s2, s2
|
|
+ MUL alpha, s3, s3
|
|
+
|
|
+ ADD a0, s0, a0
|
|
+ fclr t0
|
|
+ ADD a1, s1, a1
|
|
+ fclr t1
|
|
+ ADD a2, s2, a2
|
|
+ fclr t2
|
|
+ ADD a3, s3, a3
|
|
+ fclr t3
|
|
+
|
|
+ ST a0, 0 * SIZE(Y1)
|
|
+ addl Y1, INCY, Y1
|
|
+ ST a1, 0 * SIZE(Y1)
|
|
+ addl Y1, INCY, Y1
|
|
+ ST a2, 0 * SIZE(Y1)
|
|
+ addl Y1, INCY, Y1
|
|
+ ST a3, 0 * SIZE(Y1)
|
|
+ addl Y1, INCY, Y1
|
|
+
|
|
+ ldi J, -1(J)
|
|
+ bgt J, $L11
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ and N, 2, J
|
|
+ ble J, $L30
|
|
+ mov A, A1
|
|
+ addl A, LDA, A2
|
|
+
|
|
+ addl A2, LDA, A
|
|
+ fclr s0
|
|
+ mov X, X1
|
|
+ fclr s1
|
|
+
|
|
+ sra M, 3, I
|
|
+ fclr s2
|
|
+ fclr s3
|
|
+ ble I, $L25
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 0 * SIZE(A2)
|
|
+ LD a2, 1 * SIZE(A1)
|
|
+ LD a3, 1 * SIZE(A2)
|
|
+ LD a4, 2 * SIZE(A1)
|
|
+ LD a5, 2 * SIZE(A2)
|
|
+ LD a6, 3 * SIZE(A1)
|
|
+ LD a7, 3 * SIZE(A2)
|
|
+
|
|
+ LD a8, 4 * SIZE(A1)
|
|
+ LD a9, 4 * SIZE(A2)
|
|
+ LD a10, 5 * SIZE(A1)
|
|
+ LD a11, 5 * SIZE(A2)
|
|
+ LD a12, 6 * SIZE(A1)
|
|
+ LD a13, 6 * SIZE(A2)
|
|
+ LD a14, 7 * SIZE(A1)
|
|
+ LD a15, 7 * SIZE(A2)
|
|
+
|
|
+ LD x0, 0 * SIZE(X1)
|
|
+ LD x1, 1 * SIZE(X1)
|
|
+ LD x2, 2 * SIZE(X1)
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L23
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ ADD s0, t0, s0
|
|
+ LD x3, 3 * SIZE(X1)
|
|
+ MUL x0, a0, t0
|
|
+ LD a0, 8 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ fillcs (PREFETCHSIZE + 0) * SIZE(A1)
|
|
+ MUL x0, a1, t1
|
|
+ LD a1, 8 * SIZE(A2)
|
|
+
|
|
+ ADD s0, t2, s0
|
|
+ LD x0, 4 * SIZE(X1)
|
|
+ MUL x1, a2, t2
|
|
+ LD a2, 9 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t3, s1
|
|
+ unop
|
|
+ MUL x1, a3, t3
|
|
+ LD a3, 9 * SIZE(A2)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD x1, 5 * SIZE(X1)
|
|
+ MUL x2, a4, t0
|
|
+ LD a4, 10 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ ldi I, -1(I)
|
|
+ MUL x2, a5, t1
|
|
+ LD a5, 10 * SIZE(A2)
|
|
+
|
|
+ ADD s0, t2, s0
|
|
+ LD x2, 6 * SIZE(X1)
|
|
+ MUL x3, a6, t2
|
|
+ LD a6, 11 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t3, s1
|
|
+ ldi X1, 8 * SIZE(X1)
|
|
+ MUL x3, a7, t3
|
|
+ LD a7, 11 * SIZE(A2)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD x3, -1 * SIZE(X1)
|
|
+ MUL x0, a8, t0
|
|
+ LD a8, 12 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ fillcs (PREFETCHSIZE + 0) * SIZE(A2)
|
|
+ MUL x0, a9, t1
|
|
+ LD a9, 12 * SIZE(A2)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD x0, 0 * SIZE(X1)
|
|
+ MUL x1, a10, t0
|
|
+ LD a10, 13 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+ MUL x1, a11, t1
|
|
+ LD a11, 13 * SIZE(A2)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD x1, 1 * SIZE(X1)
|
|
+ MUL x2, a12, t0
|
|
+ LD a12, 6 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ MUL x2, a13, t1
|
|
+ LD a13, 14 * SIZE(A2)
|
|
+ ldi A2, 8 * SIZE(A2)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD x2, 2 * SIZE(X1)
|
|
+ MUL x3, a14, t0
|
|
+ LD a14, 7 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ MUL x3, a15, t1
|
|
+ LD a15, 7 * SIZE(A2)
|
|
+ bgt I, $L22
|
|
+ .align 4
|
|
+
|
|
+$L23:
|
|
+ ADD s0, t0, s0
|
|
+ LD x3, 3 * SIZE(X1)
|
|
+ MUL x0, a0, t0
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ unop
|
|
+ MUL x0, a1, t1
|
|
+ unop
|
|
+
|
|
+ ADD s0, t2, s0
|
|
+ LD x0, 4 * SIZE(X1)
|
|
+ MUL x1, a2, t2
|
|
+ ldi A2, 8 * SIZE(A2)
|
|
+
|
|
+ ADD s1, t3, s1
|
|
+ unop
|
|
+ MUL x1, a3, t3
|
|
+ unop
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD x1, 5 * SIZE(X1)
|
|
+ MUL x2, a4, t0
|
|
+ unop
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ unop
|
|
+ MUL x2, a5, t1
|
|
+ unop
|
|
+
|
|
+ ADD s0, t2, s0
|
|
+ LD x2, 6 * SIZE(X1)
|
|
+ MUL x3, a6, t2
|
|
+ unop
|
|
+
|
|
+ ADD s1, t3, s1
|
|
+ unop
|
|
+ MUL x3, a7, t3
|
|
+ unop
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD x3, 7 * SIZE(X1)
|
|
+ MUL x0, a8, t0
|
|
+ ldi X1, 8 * SIZE(X1)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ unop
|
|
+ MUL x0, a9, t1
|
|
+ unop
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ MUL x1, a10, t0
|
|
+ ADD s1, t1, s1
|
|
+ MUL x1, a11, t1
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ MUL x2, a12, t0
|
|
+ ADD s1, t1, s1
|
|
+ MUL x2, a13, t1
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ MUL x3, a14, t0
|
|
+ ADD s1, t1, s1
|
|
+ MUL x3, a15, t1
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ and M, 7, I
|
|
+ ble I, $L28
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 0 * SIZE(A2)
|
|
+ LD x0, 0 * SIZE(X1)
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L27
|
|
+ .align 4
|
|
+
|
|
+$L26:
|
|
+ ADD s0, t0, s0
|
|
+ ldi A2, 1 * SIZE(A2)
|
|
+ MUL x0, a0, t0
|
|
+ LD a0, 1 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ ldi A1, 1 * SIZE(A1)
|
|
+ MUL x0, a1, t1
|
|
+ LD a1, 0 * SIZE(A2)
|
|
+
|
|
+ LD x0, 1 * SIZE(X1)
|
|
+ ldi X1, 1 * SIZE(X1)
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L26
|
|
+ .align 4
|
|
+
|
|
+$L27:
|
|
+ ADD s0, t0, s0
|
|
+ MUL x0, a0, t0
|
|
+ ADD s1, t1, s1
|
|
+ MUL x0, a1, t1
|
|
+ .align 4
|
|
+
|
|
+$L28:
|
|
+ LD a0, 0 * SIZE(Y)
|
|
+ addl Y, INCY, Y
|
|
+ LD a1, 0 * SIZE(Y)
|
|
+ addl Y, INCY, Y
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ ADD s1, t1, s1
|
|
+ ADD s2, t2, s2
|
|
+ ADD s3, t3, s3
|
|
+
|
|
+ ADD s0, s2, s0
|
|
+ ADD s1, s3, s1
|
|
+
|
|
+ MUL alpha, s0, s0
|
|
+ MUL alpha, s1, s1
|
|
+
|
|
+ ADD a0, s0, a0
|
|
+ ADD a1, s1, a1
|
|
+
|
|
+ ST a0, 0 * SIZE(Y1)
|
|
+ fclr t0
|
|
+ addl Y1, INCY, Y1
|
|
+ fclr t1
|
|
+
|
|
+ ST a1, 0 * SIZE(Y1)
|
|
+ fclr t2
|
|
+ addl Y1, INCY, Y1
|
|
+ fclr t3
|
|
+ .align 4
|
|
+
|
|
+$L30:
|
|
+ blbc N, $L999
|
|
+
|
|
+ mov A, A1
|
|
+ fclr s0
|
|
+ mov X, X1
|
|
+ fclr s1
|
|
+
|
|
+ sra M, 3, I
|
|
+ fclr s2
|
|
+ fclr s3
|
|
+ ble I, $L35
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+ LD a8, 0 * SIZE(X1)
|
|
+ LD a9, 1 * SIZE(X1)
|
|
+
|
|
+ LD a2, 2 * SIZE(A1)
|
|
+ LD a3, 3 * SIZE(A1)
|
|
+ LD a10, 2 * SIZE(X1)
|
|
+ LD a11, 3 * SIZE(X1)
|
|
+
|
|
+ LD a4, 4 * SIZE(A1)
|
|
+ LD a5, 5 * SIZE(A1)
|
|
+ LD a12, 4 * SIZE(X1)
|
|
+ LD a13, 5 * SIZE(X1)
|
|
+
|
|
+ LD a6, 6 * SIZE(A1)
|
|
+ LD a7, 7 * SIZE(A1)
|
|
+ LD a14, 6 * SIZE(X1)
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L33
|
|
+ .align 4
|
|
+
|
|
+$L32:
|
|
+ ADD s0, t0, s0
|
|
+ LD a15, 7 * SIZE(X1)
|
|
+ MUL a0, a8, t0
|
|
+ LD a0, 8 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ LD a8, 8 * SIZE(X1)
|
|
+ MUL a1, a9, t1
|
|
+ LD a1, 9 * SIZE(A1)
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ LD a9, 9 * SIZE(X1)
|
|
+ MUL a2, a10, t2
|
|
+ LD a2, 10 * SIZE(A1)
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a10, 10 * SIZE(X1)
|
|
+ MUL a3, a11, t3
|
|
+ LD a3, 11 * SIZE(A1)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD a11, 11 * SIZE(X1)
|
|
+ MUL a4, a12, t0
|
|
+ LD a4, 12 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ LD a12, 12 * SIZE(X1)
|
|
+ MUL a5, a13, t1
|
|
+ LD a5, 13 * SIZE(A1)
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ LD a13, 13 * SIZE(X1)
|
|
+ MUL a6, a14, t2
|
|
+ LD a6, 14 * SIZE(A1)
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a14, 14 * SIZE(X1)
|
|
+ MUL a7, a15, t3
|
|
+ LD a7, 15 * SIZE(A1)
|
|
+
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+ ldi I, -1(I)
|
|
+ ldi X1, 8 * SIZE(X1)
|
|
+ bgt I, $L32
|
|
+ .align 4
|
|
+
|
|
+$L33:
|
|
+ ADD s0, t0, s0
|
|
+ LD a15, 7 * SIZE(X1)
|
|
+ MUL a0, a8, t0
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ unop
|
|
+ MUL a1, a9, t1
|
|
+ ldi X1, 8 * SIZE(X1)
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ MUL a2, a10, t2
|
|
+ ADD s3, t3, s3
|
|
+ MUL a3, a11, t3
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ MUL a4, a12, t0
|
|
+ ADD s1, t1, s1
|
|
+ MUL a5, a13, t1
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ MUL a6, a14, t2
|
|
+ ADD s3, t3, s3
|
|
+ MUL a7, a15, t3
|
|
+ .align 4
|
|
+
|
|
+$L35:
|
|
+ and M, 7, I
|
|
+ ble I, $L38
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD x0, 0 * SIZE(X1)
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L37
|
|
+ .align 4
|
|
+
|
|
+$L36:
|
|
+ ADD s0, t0, s0
|
|
+ MUL x0, a0, t0
|
|
+ LD a0, 1 * SIZE(A1)
|
|
+ LD x0, 1 * SIZE(X1)
|
|
+
|
|
+ ldi A1, 1 * SIZE(A1)
|
|
+ ldi X1, 1 * SIZE(X1)
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L36
|
|
+ .align 4
|
|
+
|
|
+$L37:
|
|
+ ADD s0, t0, s0
|
|
+ MUL x0, a0, t0
|
|
+ .align 4
|
|
+
|
|
+$L38:
|
|
+ LD a0, 0 * SIZE(Y)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ ADD s1, t1, s1
|
|
+ ADD s2, t2, s2
|
|
+ ADD s3, t3, s3
|
|
+
|
|
+ ADD s0, s2, s0
|
|
+ ADD s1, s3, s1
|
|
+ ADD s0, s1, s0
|
|
+
|
|
+ MUL alpha, s0, s0
|
|
+ ADD a0, s0, a0
|
|
+
|
|
+ ST a0, 0 * SIZE(Y1)
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 8($sp)
|
|
+ fldd $f4, 16($sp)
|
|
+ fldd $f5, 24($sp)
|
|
+ fldd $f6, 32($sp)
|
|
+ fldd $f7, 40($sp)
|
|
+ fldd $f8, 48($sp)
|
|
+ fldd $f9, 56($sp)
|
|
+
|
|
+ ldi $sp, STACKSIZE($sp)
|
|
+ ret
|
|
+ EPILOGUE
|
|
diff --git a/kernel/sw_64/iamax.S b/kernel/sw_64/iamax.S
|
|
new file mode 100644
|
|
index 0000000..f3b2909
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/iamax.S
|
|
@@ -0,0 +1,440 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+#define XX $19
|
|
+
|
|
+#ifndef USE_MIN
|
|
+#define CMPLT(a, b) fcmplt a, b
|
|
+#else
|
|
+#define CMPLT(a, b) fcmplt b, a
|
|
+#endif
|
|
+
|
|
+#define STACKSIZE 6 * 8
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+ .frame $sp, STACKSIZE, $26, 0
|
|
+
|
|
+#ifdef F_INTERFACE
|
|
+ ldl N, 0(N) # n
|
|
+ ldl INCX, 0(INCX) # incx
|
|
+#endif
|
|
+ ldi $sp, -STACKSIZE($sp)
|
|
+ mov X, XX
|
|
+ .align 4
|
|
+
|
|
+ fstd $f2, 0($sp)
|
|
+ fclr $f16
|
|
+ cmplt $31, N, $2
|
|
+ unop
|
|
+
|
|
+ fstd $f3, 8($sp)
|
|
+ fclr $f17
|
|
+ cmplt $31, INCX, $3
|
|
+ unop
|
|
+
|
|
+ fstd $f4, 16($sp)
|
|
+ fclr $f18
|
|
+ SXADDQ INCX, $31, INCX
|
|
+ unop
|
|
+
|
|
+ fstd $f5, 24($sp)
|
|
+ fclr $f19
|
|
+ and $2, $3, $2
|
|
+ clr $0
|
|
+
|
|
+ fstd $f6, 32($sp)
|
|
+ fclr $f0
|
|
+ sra N, 3, $1
|
|
+ beq $2, $End # if (n <= 0) or (incx <= 0) return
|
|
+ .align 4
|
|
+
|
|
+ LD $f20, 0 * SIZE(X)
|
|
+ unop
|
|
+ fabs $f20, $f0
|
|
+ ble $1, $L15
|
|
+ .align 4
|
|
+
|
|
+ fabs $f20, $f1
|
|
+ unop
|
|
+ addl X, INCX, X
|
|
+ unop
|
|
+
|
|
+ LD $f21, 0 * SIZE(X)
|
|
+ fabs $f20, $f2
|
|
+ addl X, INCX, X
|
|
+ unop
|
|
+
|
|
+ LD $f22, 0 * SIZE(X)
|
|
+ fabs $f20, $f3
|
|
+ addl X, INCX, X
|
|
+ unop
|
|
+
|
|
+ LD $f23, 0 * SIZE(X)
|
|
+ fabs $f20, $f4
|
|
+ addl X, INCX, X
|
|
+ unop
|
|
+
|
|
+ LD $f24, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ fabs $f20, $f5
|
|
+ unop
|
|
+
|
|
+ LD $f25, 0 * SIZE(X)
|
|
+ fabs $f20, $f6
|
|
+ addl X, INCX, X
|
|
+ unop
|
|
+
|
|
+ LD $f26, 0 * SIZE(X)
|
|
+ fabs $f20, $f28
|
|
+ addl X, INCX, X
|
|
+ ldi $1, -1($1)
|
|
+
|
|
+ LD $f27, 0 * SIZE(X)
|
|
+ unop
|
|
+ addl X, INCX, X
|
|
+ ble $1, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ fselne $f16, $f12, $f4, $f4
|
|
+ unop
|
|
+ fabs $f20, $f29
|
|
+ fillcs 56 * SIZE(X)
|
|
+
|
|
+ fselne $f17, $f13, $f5, $f5
|
|
+ LD $f20, 0 * SIZE(X)
|
|
+ fabs $f21, $f30
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fselne $f18, $f14, $f6, $f6
|
|
+ LD $f21, 0 * SIZE(X)
|
|
+ fabs $f22, $f10
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fselne $f19, $f15, $f28, $f28
|
|
+ LD $f22, 0 * SIZE(X)
|
|
+ fabs $f23, $f11
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fabs $f24, $f12
|
|
+ LD $f23, 0 * SIZE(X)
|
|
+ CMPLT($f0, $f29), $f16
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fabs $f25, $f13
|
|
+ LD $f24, 0 * SIZE(X)
|
|
+ CMPLT($f1, $f30), $f17
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fabs $f26, $f14
|
|
+ LD $f25, 0 * SIZE(X)
|
|
+ CMPLT($f2, $f10), $f18
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fabs $f27, $f15
|
|
+ LD $f26, 0 * SIZE(X)
|
|
+ CMPLT($f3, $f11), $f19
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fselne $f16, $f29, $f0, $f0
|
|
+ LD $f27, 0 * SIZE(X)
|
|
+ CMPLT($f4, $f12), $f16
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fselne $f17, $f30, $f1, $f1
|
|
+ unop
|
|
+ CMPLT($f5, $f13), $f17
|
|
+ ldi $1, -1($1) # i --
|
|
+
|
|
+ fselne $f18, $f10, $f2, $f2
|
|
+ unop
|
|
+ CMPLT($f6, $f14), $f18
|
|
+ unop
|
|
+
|
|
+ fselne $f19, $f11, $f3, $f3
|
|
+ unop
|
|
+ CMPLT($f28, $f15), $f19
|
|
+ bgt $1,$L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ fselne $f16, $f12, $f4, $f4
|
|
+ fabs $f20, $f29
|
|
+ fselne $f17, $f13, $f5, $f5
|
|
+ fabs $f21, $f30
|
|
+
|
|
+ fselne $f18, $f14, $f6, $f6
|
|
+ fabs $f22, $f10
|
|
+ fselne $f19, $f15, $f28, $f28
|
|
+ fabs $f23, $f11
|
|
+
|
|
+ fabs $f24, $f12
|
|
+ CMPLT($f0, $f29), $f16
|
|
+ fabs $f25, $f13
|
|
+ CMPLT($f1, $f30), $f17
|
|
+
|
|
+ fabs $f26, $f14
|
|
+ CMPLT($f2, $f10), $f18
|
|
+ fabs $f27, $f15
|
|
+ CMPLT($f3, $f11), $f19
|
|
+
|
|
+ fselne $f16, $f29, $f0, $f0
|
|
+ CMPLT($f4, $f12), $f16
|
|
+ fselne $f17, $f30, $f1, $f1
|
|
+ CMPLT($f5, $f13), $f17
|
|
+
|
|
+ fselne $f18, $f10, $f2, $f2
|
|
+ CMPLT($f6, $f14), $f18
|
|
+ fselne $f19, $f11, $f3, $f3
|
|
+ CMPLT($f28, $f15), $f19
|
|
+
|
|
+ fselne $f16, $f12, $f4, $f4
|
|
+ CMPLT($f0, $f1), $f16
|
|
+ fselne $f17, $f13, $f5, $f5
|
|
+ CMPLT($f2, $f3), $f17
|
|
+
|
|
+ fselne $f18, $f14, $f6, $f6
|
|
+ CMPLT($f4, $f5), $f18
|
|
+ fselne $f19, $f15, $f28, $f28
|
|
+ CMPLT($f6, $f28), $f19
|
|
+
|
|
+ fselne $f16, $f1, $f0, $f0
|
|
+ fselne $f17, $f3, $f2, $f2
|
|
+ fselne $f18, $f5, $f4, $f4
|
|
+ fselne $f19, $f28, $f6, $f6
|
|
+
|
|
+ CMPLT($f0, $f2), $f16
|
|
+ CMPLT($f4, $f6), $f17
|
|
+
|
|
+ fselne $f16, $f2, $f0, $f0
|
|
+ fselne $f17, $f6, $f4, $f4
|
|
+
|
|
+ CMPLT($f0, $f4), $f16
|
|
+ fselne $f16, $f4, $f0, $f0
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and N, 7, $1
|
|
+ unop
|
|
+ unop
|
|
+ ble $1, $L20
|
|
+ .align 4
|
|
+
|
|
+$L16:
|
|
+ LD $f20, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fabs $f20, $f29
|
|
+ CMPLT($f0, $f29), $f16
|
|
+ fselne $f16, $f29, $f0, $f0
|
|
+
|
|
+ ldi $1, -1($1) # i --
|
|
+ bgt $1, $L16
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ sra N, 3, $1
|
|
+ ble $1, $L40
|
|
+ .align 4
|
|
+
|
|
+ LD $f10, 0 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+ LD $f11, 0 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ LD $f12, 0 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+ LD $f13, 0 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ LD $f14, 0 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+ LD $f15, 0 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ LD $f16, 0 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+ LD $f17, 0 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ fabs $f10, $f18
|
|
+ fabs $f11, $f19
|
|
+ fabs $f12, $f20
|
|
+ fabs $f13, $f21
|
|
+
|
|
+ ldi $1, -1($1)
|
|
+ ble $1, $L23
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ LD $f10, 0 * SIZE(XX)
|
|
+ fabs $f14, $f22
|
|
+ addl XX, INCX, XX
|
|
+ fcmpeq $f0, $f18, $f2
|
|
+
|
|
+ LD $f11, 0 * SIZE(XX)
|
|
+ fabs $f15, $f23
|
|
+ addl XX, INCX, XX
|
|
+ fcmpeq $f0, $f19, $f3
|
|
+
|
|
+ LD $f12, 0 * SIZE(XX)
|
|
+ fabs $f16, $f24
|
|
+ addl XX, INCX, XX
|
|
+ fcmpeq $f0, $f20, $f4
|
|
+
|
|
+ LD $f13, 0 * SIZE(XX)
|
|
+ fabs $f17, $f25
|
|
+ addl XX, INCX, XX
|
|
+ fcmpeq $f0, $f21, $f5
|
|
+
|
|
+ LD $f14, 0 * SIZE(XX)
|
|
+ ldi $1, -1($1) # i --
|
|
+ fcmpeq $f0, $f22, $f26
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f2, $End
|
|
+
|
|
+ LD $f15, 0 * SIZE(XX)
|
|
+ fcmpeq $f0, $f23, $f27
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f3, $End
|
|
+
|
|
+ addl XX, INCX, XX
|
|
+ fcmpeq $f0, $f24, $f28
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f4, $End
|
|
+
|
|
+ LD $f16, 0 * SIZE(XX)
|
|
+ fcmpeq $f0, $f25, $f29
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f5, $End
|
|
+
|
|
+ addl XX, INCX, XX
|
|
+ ldi $0, 1($0)
|
|
+ fabs $f10, $f18
|
|
+ fbne $f26, $End
|
|
+
|
|
+ LD $f17, 0 * SIZE(XX)
|
|
+ ldi $0, 1($0)
|
|
+ fabs $f11, $f19
|
|
+ fbne $f27, $End
|
|
+
|
|
+ addl XX, INCX, XX
|
|
+ ldi $0, 1($0)
|
|
+ fabs $f12, $f20
|
|
+ fbne $f28, $End
|
|
+
|
|
+ ldi $0, 1($0)
|
|
+ fabs $f13, $f21
|
|
+ fbne $f29, $End
|
|
+ bgt $1, $L22
|
|
+ .align 4
|
|
+
|
|
+$L23:
|
|
+ fabs $f14, $f22
|
|
+ fcmpeq $f0, $f18, $f2
|
|
+ fabs $f15, $f23
|
|
+ fcmpeq $f0, $f19, $f3
|
|
+
|
|
+ fabs $f16, $f24
|
|
+ fcmpeq $f0, $f20, $f4
|
|
+ fabs $f17, $f25
|
|
+ fcmpeq $f0, $f21, $f5
|
|
+
|
|
+ fcmpeq $f0, $f22, $f26
|
|
+ ldi $0, 1($0)
|
|
+ unop
|
|
+ fbne $f2, $End
|
|
+
|
|
+ fcmpeq $f0, $f23, $f27
|
|
+ ldi $0, 1($0)
|
|
+ unop
|
|
+ fbne $f3, $End
|
|
+
|
|
+ fcmpeq $f0, $f24, $f28
|
|
+ ldi $0, 1($0)
|
|
+ unop
|
|
+ fbne $f4, $End
|
|
+
|
|
+ fcmpeq $f0, $f25, $f29
|
|
+ ldi $0, 1($0)
|
|
+ unop
|
|
+ fbne $f5, $End
|
|
+
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f26, $End
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f27, $End
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f28, $End
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f29, $End
|
|
+ .align 4
|
|
+
|
|
+$L40:
|
|
+ LD $f20, 0 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ fabs $f20, $f25
|
|
+ fcmpeq $f0, $f25, $f29
|
|
+
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f29, $End
|
|
+ br $31, $L40
|
|
+ .align 4
|
|
+
|
|
+$End:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 8($sp)
|
|
+ fldd $f4, 16($sp)
|
|
+ fldd $f5, 24($sp)
|
|
+
|
|
+ fldd $f6, 32($sp)
|
|
+ ldi $sp, STACKSIZE($sp)
|
|
+ ret
|
|
+
|
|
+ EPILOGUE
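Note on the kernel above: iamax.S makes two passes over the vector. An 8-way unrolled first pass keeps the running maximum (or minimum, when USE_MIN is defined) of the absolute values, and a second pass restarts from the saved pointer XX, counting elements in $0 until it reaches the first one whose absolute value equals that result. A minimal C sketch of the same logic, assuming double precision and unit-style indexing; the function name is illustrative, not the real CNAME entry point.

#include <math.h>

/* Illustrative only: two-pass amax index search as in iamax.S.
   Returns a 1-based index, or 0 when n <= 0 or incx <= 0. */
static long iamax_sketch(long n, const double *x, long incx)
{
    if (n <= 0 || incx <= 0) return 0;

    double best = fabs(x[0]);                 /* first pass: maximal |x[i]| */
    for (long i = 1; i < n; i++) {
        double v = fabs(x[i * incx]);
        if (best < v) best = v;               /* CMPLT + fselne in the kernel */
    }
    for (long i = 0; i < n; i++)              /* second pass: locate that value */
        if (fabs(x[i * incx]) == best)
            return i + 1;                     /* Fortran-style 1-based result */
    return 0;                                 /* not reached for finite input */
}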
diff --git a/kernel/sw_64/iamax_simd.S b/kernel/sw_64/iamax_simd.S
|
|
new file mode 100644
|
|
index 0000000..c7c6c27
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/iamax_simd.S
|
|
@@ -0,0 +1,732 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCHSIZE 96
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+#define XX $19
|
|
+#define I $1
|
|
+#define NN $22
|
|
+
|
|
+#ifndef USE_MIN
|
|
+#define CMPLT(a, b) fcmplt a, b
|
|
+#else
|
|
+#define CMPLT(a, b) fcmplt b, a
|
|
+#endif
|
|
+
|
|
+#ifndef USE_MIN
|
|
+#define VCMPLT(a, b) vfcmplt a, b
|
|
+#else
|
|
+#define VCMPLT(a, b) vfcmplt b, a
|
|
+#endif
|
|
+
|
|
+#define STACKSIZE 6 * 8
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+ .frame $sp, STACKSIZE, $26, 0
|
|
+
|
|
+#ifdef F_INTERFACE
|
|
+ ldl N, 0(N) # n
|
|
+ ldl INCX, 0(INCX) # incx
|
|
+#endif
|
|
+ ldi $sp, -STACKSIZE($sp)
|
|
+ mov X, XX
|
|
+ mov N, NN
|
|
+ .align 4
|
|
+
|
|
+ fstd $f2, 0($sp)
|
|
+ fclr $f16
|
|
+ cmplt $31, N, $2
|
|
+ unop
|
|
+
|
|
+ fstd $f3, 8($sp)
|
|
+ fclr $f17
|
|
+ cmplt $31, INCX, $3
|
|
+ unop
|
|
+
|
|
+ fstd $f4, 16($sp)
|
|
+ fclr $f18
|
|
+ SXADDQ INCX, $31, INCX
|
|
+ unop
|
|
+
|
|
+ fstd $f5, 24($sp)
|
|
+ fclr $f19
|
|
+ and $2, $3, $2
|
|
+ clr $0
|
|
+
|
|
+ fstd $f6, 32($sp)
|
|
+ fclr $f0
|
|
+ unop
|
|
+ beq $2, $End # if (n <= 0) or (incx <= 0) return
|
|
+ .align 4
|
|
+
|
|
+ cmpeq INCX, SIZE, $3
|
|
+ beq $3, $Sub
|
|
+ .align 4
|
|
+
|
|
+
|
|
+/**
|
|
+	test the address of X
|
|
+**/
|
|
+
|
|
+ and X, (VEC_LEN*SIZE-1), $3
|
|
+ LD $f10, 0*SIZE(X)
|
|
+ fabs $f10, $f0 # init temp max/min result value
|
|
+ beq $3, $Align_Access
|
|
+ .align 4
|
|
+/**
|
|
+	process the unaligned address of X
|
|
+**/
|
|
+
|
|
+/* If N is too small (less than the unroll size), there is no need to process the unaligned head of X; just jump to the remainder section. */
|
|
+ sra NN, 4, I
|
|
+ and NN, 15, $3
|
|
+ ble I, $Remain
|
|
+ nop
|
|
+
|
|
+ sra $3, BASE_SHIFT, $3
|
|
+ ldi $2, VEC_LEN
|
|
+ subl $2, $3, $3
|
|
+ nop
|
|
+$UnAlign_Y_Loop:
|
|
+ LD $f10, 0*SIZE(X)
|
|
+ addl X, SIZE, X
|
|
+ fabs $f10, $f29
|
|
+ CMPLT($f0, $f29), $f16
|
|
+
|
|
+ fseleq $f16, $f0, $f29, $f0
|
|
+ subl $3, 1, $3
|
|
+ subl NN, 1, NN
|
|
+ bgt $3, $UnAlign_Y_Loop
|
|
+ .align 4
|
|
+
|
|
+
|
|
+$Align_Access:
|
|
+/* Search for the max or min, unrolled by 16. */
|
|
+ sra NN, 4, I
|
|
+ and NN, 15, $3
|
|
+ ble I, $Remain
|
|
+ nop
|
|
+
|
|
+ VLD $f10, 0*VEC_LEN*SIZE(X)
|
|
+ VLD $f11, 1*VEC_LEN*SIZE(X)
|
|
+ VLD $f12, 2*VEC_LEN*SIZE(X)
|
|
+ VLD $f13, 3*VEC_LEN*SIZE(X)
|
|
+
|
|
+ /*vfabs*/
|
|
+ vcpys $f31, $f10, $f22
|
|
+ vcpys $f31, $f11, $f23
|
|
+ vcpys $f31, $f12, $f24
|
|
+ vcpys $f31, $f13, $f25
|
|
+
|
|
+ vcpyf $f0, $f0
|
|
+ vcpys $f22, $f22, $f1 # copy $f22 -> $f1
|
|
+ vcpys $f22, $f22, $f2
|
|
+ vcpys $f22, $f22, $f3
|
|
+
|
|
+ subl I, 1, I
|
|
+ addl X, 16*SIZE, X
|
|
+ nop
|
|
+ ble I, $MainLoopEnd
|
|
+ .align 4
|
|
+$MainLoop:
|
|
+
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ VCMPLT($f0, $f22), $f26
|
|
+ subl I, 1, I
|
|
+ VCMPLT($f1, $f23), $f27
|
|
+
|
|
+ VLD $f10, 0*VEC_LEN*SIZE(X)
|
|
+ VLD $f11, 1*VEC_LEN*SIZE(X)
|
|
+ VLD $f12, 2*VEC_LEN*SIZE(X)
|
|
+ VLD $f13, 3*VEC_LEN*SIZE(X)
|
|
+
|
|
+ VCMPLT($f2, $f24), $f28
|
|
+ addl X, 16 * SIZE, X
|
|
+ nop
|
|
+ VCMPLT($f3, $f25), $f29
|
|
+
|
|
+ vfseleq $f26, $f0, $f22, $f0
|
|
+ vfseleq $f27, $f1, $f23, $f1
|
|
+ vfseleq $f28, $f2, $f24, $f2
|
|
+ vfseleq $f29, $f3, $f25, $f3
|
|
+
|
|
+ vcpys $f31, $f10, $f22
|
|
+ vcpys $f31, $f11, $f23
|
|
+ vcpys $f31, $f12, $f24
|
|
+ vcpys $f31, $f13, $f25
|
|
+
|
|
+ bne I, $MainLoop
|
|
+ .align 4
|
|
+
|
|
+$MainLoopEnd:
|
|
+ VCMPLT($f0, $f22), $f26
|
|
+ VCMPLT($f1, $f23), $f27
|
|
+ VCMPLT($f2, $f24), $f28
|
|
+ VCMPLT($f3, $f25), $f29
|
|
+
|
|
+ vfseleq $f26, $f0, $f22, $f0
|
|
+ vfseleq $f27, $f1, $f23, $f1
|
|
+ vfseleq $f28, $f2, $f24, $f2
|
|
+ vfseleq $f29, $f3, $f25, $f3
|
|
+
|
|
+	/* find the max or min among f0, f1, f2 and f3 */
|
|
+ VCMPLT($f0, $f1), $f26
|
|
+ VCMPLT($f2, $f3), $f27
|
|
+ vfseleq $f26, $f0, $f1, $f0
|
|
+ vfseleq $f27, $f2, $f3, $f2
|
|
+
|
|
+ VCMPLT($f0, $f2), $f26
|
|
+ vfseleq $f26, $f0, $f2, $f0
|
|
+ vextf $f0, 1, $f22
|
|
+ vextf $f0, 2, $f23
|
|
+
|
|
+ vextf $f0, 3, $f24
|
|
+ CMPLT($f0, $f22), $f16
|
|
+ CMPLT($f23, $f24), $f17
|
|
+ fseleq $f16, $f0, $f22, $f0
|
|
+
|
|
+ fseleq $f17, $f23, $f24, $f23
|
|
+ CMPLT($f0, $f23), $f18
|
|
+ fseleq $f18, $f0, $f23, $f0
|
|
+ nop
|
|
+$Remain:
|
|
+ ble $3, $Continuous_FindIndex
|
|
+ .align 4
|
|
+$RemainLoop:
|
|
+ LD $f20, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fabs $f20, $f29
|
|
+ CMPLT($f0, $f29), $f16
|
|
+ fseleq $f16, $f0, $f29, $f0
|
|
+
|
|
+ subl $3, 1, $3
|
|
+ bgt $3, $RemainLoop
|
|
+ .align 4
|
|
+ /*find index*/
|
|
+$Continuous_FindIndex:
|
|
+ sra N, 3, $1
|
|
+ ble $1, $L40
|
|
+ .align 4
|
|
+
|
|
+ LD $f10, 0 * SIZE(XX)
|
|
+ LD $f11, 1 * SIZE(XX)
|
|
+ LD $f12, 2 * SIZE(XX)
|
|
+ LD $f13, 3 * SIZE(XX)
|
|
+
|
|
+
|
|
+ LD $f14, 4 * SIZE(XX)
|
|
+ LD $f15, 5 * SIZE(XX)
|
|
+ LD $f16, 6 * SIZE(XX)
|
|
+ LD $f17, 7 * SIZE(XX)
|
|
+
|
|
+
|
|
+ fabs $f10, $f18
|
|
+ fabs $f11, $f19
|
|
+ fabs $f12, $f20
|
|
+ fabs $f13, $f21
|
|
+
|
|
+ addl XX, 8*SIZE, XX
|
|
+ ldi $1, -1($1)
|
|
+	ble	$1, $Continuous_FindIndex_LoopEnd
|
|
+ .align 4
|
|
+
|
|
+$Continuous_FindIndex_Loop:
|
|
+ LD $f10, 0 * SIZE(XX)
|
|
+ fabs $f14, $f22
|
|
+ LD $f11, 1 * SIZE(XX)
|
|
+ fcmpeq $f0, $f18, $f2
|
|
+
|
|
+ LD $f12, 2 * SIZE(XX)
|
|
+ fabs $f15, $f23
|
|
+ LD $f13, 3 * SIZE(XX)
|
|
+ fcmpeq $f0, $f19, $f3
|
|
+
|
|
+ LD $f14, 4 * SIZE(XX)
|
|
+ fabs $f16, $f24
|
|
+ ldi $1, -1($1) # i --
|
|
+ fcmpeq $f0, $f20, $f4
|
|
+
|
|
+ LD $f15, 5 * SIZE(XX)
|
|
+ fabs $f17, $f25
|
|
+ fcmpeq $f0, $f21, $f5
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+
|
|
+ LD $f16, 6 * SIZE(XX)
|
|
+ fcmpeq $f0, $f22, $f26
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f2, $End
|
|
+
|
|
+ LD $f17, 7 * SIZE(XX)
|
|
+ fcmpeq $f0, $f23, $f27
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f3, $End
|
|
+
|
|
+ addl XX, 8*SIZE, XX
|
|
+ fcmpeq $f0, $f24, $f28
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f4, $End
|
|
+
|
|
+ fcmpeq $f0, $f25, $f29
|
|
+ ldi $0, 1($0)
|
|
+ nop
|
|
+ fbne $f5, $End
|
|
+
|
|
+ ldi $0, 1($0)
|
|
+ fabs $f10, $f18
|
|
+ nop
|
|
+ fbne $f26, $End
|
|
+
|
|
+ ldi $0, 1($0)
|
|
+ fabs $f11, $f19
|
|
+ nop
|
|
+ fbne $f27, $End
|
|
+
|
|
+ ldi $0, 1($0)
|
|
+ fabs $f12, $f20
|
|
+ nop
|
|
+ fbne $f28, $End
|
|
+
|
|
+ ldi $0, 1($0)
|
|
+ fabs $f13, $f21
|
|
+ fbne $f29, $End
|
|
+ bgt $1, $Continuous_FindIndex_Loop
|
|
+ .align 4
|
|
+
|
|
+$Continuous_FindIndex_LoopEnd:
|
|
+ fabs $f14, $f22
|
|
+ fcmpeq $f0, $f18, $f2
|
|
+ fabs $f15, $f23
|
|
+ fcmpeq $f0, $f19, $f3
|
|
+
|
|
+ fabs $f16, $f24
|
|
+ fcmpeq $f0, $f20, $f4
|
|
+ fabs $f17, $f25
|
|
+ fcmpeq $f0, $f21, $f5
|
|
+
|
|
+ fcmpeq $f0, $f22, $f26
|
|
+ ldi $0, 1($0)
|
|
+ unop
|
|
+ fbne $f2, $End
|
|
+
|
|
+ fcmpeq $f0, $f23, $f27
|
|
+ ldi $0, 1($0)
|
|
+ unop
|
|
+ fbne $f3, $End
|
|
+
|
|
+ fcmpeq $f0, $f24, $f28
|
|
+ ldi $0, 1($0)
|
|
+ unop
|
|
+ fbne $f4, $End
|
|
+
|
|
+ fcmpeq $f0, $f25, $f29
|
|
+ ldi $0, 1($0)
|
|
+ unop
|
|
+ fbne $f5, $End
|
|
+
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f26, $End
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f27, $End
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f28, $End
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f29, $End
|
|
+ .align 4
|
|
+
|
|
+ jmp $L40
|
|
+ .align 4
|
|
+$Sub:
|
|
+ sra N, 3, $1
|
|
+ LD $f20, 0 * SIZE(X)
|
|
+ fabs $f20, $f0
|
|
+ ble $1, $L15
|
|
+ .align 4
|
|
+
|
|
+ fabs $f20, $f1
|
|
+ unop
|
|
+ addl X, INCX, X
|
|
+ unop
|
|
+
|
|
+ LD $f21, 0 * SIZE(X)
|
|
+ fabs $f20, $f2
|
|
+ addl X, INCX, X
|
|
+ unop
|
|
+
|
|
+ LD $f22, 0 * SIZE(X)
|
|
+ fabs $f20, $f3
|
|
+ addl X, INCX, X
|
|
+ unop
|
|
+
|
|
+ LD $f23, 0 * SIZE(X)
|
|
+ fabs $f20, $f4
|
|
+ addl X, INCX, X
|
|
+ unop
|
|
+
|
|
+ LD $f24, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ fabs $f20, $f5
|
|
+ unop
|
|
+
|
|
+ LD $f25, 0 * SIZE(X)
|
|
+ fabs $f20, $f6
|
|
+ addl X, INCX, X
|
|
+ unop
|
|
+
|
|
+ LD $f26, 0 * SIZE(X)
|
|
+ fabs $f20, $f28
|
|
+ addl X, INCX, X
|
|
+ ldi $1, -1($1)
|
|
+
|
|
+ LD $f27, 0 * SIZE(X)
|
|
+ unop
|
|
+ addl X, INCX, X
|
|
+ ble $1, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ fselne $f16, $f12, $f4, $f4
|
|
+ unop
|
|
+ fabs $f20, $f29
|
|
+ fillcs 56 * SIZE(X)
|
|
+
|
|
+ fselne $f17, $f13, $f5, $f5
|
|
+ LD $f20, 0 * SIZE(X)
|
|
+ fabs $f21, $f30
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fselne $f18, $f14, $f6, $f6
|
|
+ LD $f21, 0 * SIZE(X)
|
|
+ fabs $f22, $f10
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fselne $f19, $f15, $f28, $f28
|
|
+ LD $f22, 0 * SIZE(X)
|
|
+ fabs $f23, $f11
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fabs $f24, $f12
|
|
+ LD $f23, 0 * SIZE(X)
|
|
+ CMPLT($f0, $f29), $f16
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fabs $f25, $f13
|
|
+ LD $f24, 0 * SIZE(X)
|
|
+ CMPLT($f1, $f30), $f17
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fabs $f26, $f14
|
|
+ LD $f25, 0 * SIZE(X)
|
|
+ CMPLT($f2, $f10), $f18
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fabs $f27, $f15
|
|
+ LD $f26, 0 * SIZE(X)
|
|
+ CMPLT($f3, $f11), $f19
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fselne $f16, $f29, $f0, $f0
|
|
+ LD $f27, 0 * SIZE(X)
|
|
+ CMPLT($f4, $f12), $f16
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fselne $f17, $f30, $f1, $f1
|
|
+ unop
|
|
+ CMPLT($f5, $f13), $f17
|
|
+ ldi $1, -1($1) # i --
|
|
+
|
|
+ fselne $f18, $f10, $f2, $f2
|
|
+ unop
|
|
+ CMPLT($f6, $f14), $f18
|
|
+ unop
|
|
+
|
|
+ fselne $f19, $f11, $f3, $f3
|
|
+ unop
|
|
+ CMPLT($f28, $f15), $f19
|
|
+ bgt $1,$L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ fselne $f16, $f12, $f4, $f4
|
|
+ fabs $f20, $f29
|
|
+ fselne $f17, $f13, $f5, $f5
|
|
+ fabs $f21, $f30
|
|
+
|
|
+ fselne $f18, $f14, $f6, $f6
|
|
+ fabs $f22, $f10
|
|
+ fselne $f19, $f15, $f28, $f28
|
|
+ fabs $f23, $f11
|
|
+
|
|
+ fabs $f24, $f12
|
|
+ CMPLT($f0, $f29), $f16
|
|
+ fabs $f25, $f13
|
|
+ CMPLT($f1, $f30), $f17
|
|
+
|
|
+ fabs $f26, $f14
|
|
+ CMPLT($f2, $f10), $f18
|
|
+ fabs $f27, $f15
|
|
+ CMPLT($f3, $f11), $f19
|
|
+
|
|
+ fselne $f16, $f29, $f0, $f0
|
|
+ CMPLT($f4, $f12), $f16
|
|
+ fselne $f17, $f30, $f1, $f1
|
|
+ CMPLT($f5, $f13), $f17
|
|
+
|
|
+ fselne $f18, $f10, $f2, $f2
|
|
+ CMPLT($f6, $f14), $f18
|
|
+ fselne $f19, $f11, $f3, $f3
|
|
+ CMPLT($f28, $f15), $f19
|
|
+
|
|
+ fselne $f16, $f12, $f4, $f4
|
|
+ CMPLT($f0, $f1), $f16
|
|
+ fselne $f17, $f13, $f5, $f5
|
|
+ CMPLT($f2, $f3), $f17
|
|
+
|
|
+ fselne $f18, $f14, $f6, $f6
|
|
+ CMPLT($f4, $f5), $f18
|
|
+ fselne $f19, $f15, $f28, $f28
|
|
+ CMPLT($f6, $f28), $f19
|
|
+
|
|
+ fselne $f16, $f1, $f0, $f0
|
|
+ fselne $f17, $f3, $f2, $f2
|
|
+ fselne $f18, $f5, $f4, $f4
|
|
+ fselne $f19, $f28, $f6, $f6
|
|
+
|
|
+ CMPLT($f0, $f2), $f16
|
|
+ CMPLT($f4, $f6), $f17
|
|
+
|
|
+ fselne $f16, $f2, $f0, $f0
|
|
+ fselne $f17, $f6, $f4, $f4
|
|
+
|
|
+ CMPLT($f0, $f4), $f16
|
|
+ fselne $f16, $f4, $f0, $f0
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and N, 7, $1
|
|
+ unop
|
|
+ unop
|
|
+ ble $1, $L20
|
|
+ .align 4
|
|
+
|
|
+$L16:
|
|
+ LD $f20, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fabs $f20, $f29
|
|
+ CMPLT($f0, $f29), $f16
|
|
+ fselne $f16, $f29, $f0, $f0
|
|
+
|
|
+ ldi $1, -1($1) # i --
|
|
+ bgt $1, $L16
|
|
+ .align 4
|
|
+
|
|
+/*
|
|
+ find the index
|
|
+*/
|
|
+$L20:
|
|
+ sra N, 3, $1
|
|
+ ble $1, $L40
|
|
+ .align 4
|
|
+
|
|
+ LD $f10, 0 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+ LD $f11, 0 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ LD $f12, 0 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+ LD $f13, 0 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ LD $f14, 0 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+ LD $f15, 0 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ LD $f16, 0 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+ LD $f17, 0 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ fabs $f10, $f18
|
|
+ fabs $f11, $f19
|
|
+ fabs $f12, $f20
|
|
+ fabs $f13, $f21
|
|
+
|
|
+ ldi $1, -1($1)
|
|
+ ble $1, $L23
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ LD $f10, 0 * SIZE(XX)
|
|
+ fabs $f14, $f22
|
|
+ addl XX, INCX, XX
|
|
+ fcmpeq $f0, $f18, $f2
|
|
+
|
|
+ LD $f11, 0 * SIZE(XX)
|
|
+ fabs $f15, $f23
|
|
+ addl XX, INCX, XX
|
|
+ fcmpeq $f0, $f19, $f3
|
|
+
|
|
+ LD $f12, 0 * SIZE(XX)
|
|
+ fabs $f16, $f24
|
|
+ addl XX, INCX, XX
|
|
+ fcmpeq $f0, $f20, $f4
|
|
+
|
|
+ LD $f13, 0 * SIZE(XX)
|
|
+ fabs $f17, $f25
|
|
+ addl XX, INCX, XX
|
|
+ fcmpeq $f0, $f21, $f5
|
|
+
|
|
+ LD $f14, 0 * SIZE(XX)
|
|
+ ldi $1, -1($1) # i --
|
|
+ fcmpeq $f0, $f22, $f26
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f2, $End
|
|
+
|
|
+ LD $f15, 0 * SIZE(XX)
|
|
+ fcmpeq $f0, $f23, $f27
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f3, $End
|
|
+
|
|
+ addl XX, INCX, XX
|
|
+ fcmpeq $f0, $f24, $f28
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f4, $End
|
|
+
|
|
+ LD $f16, 0 * SIZE(XX)
|
|
+ fcmpeq $f0, $f25, $f29
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f5, $End
|
|
+
|
|
+ addl XX, INCX, XX
|
|
+ ldi $0, 1($0)
|
|
+ fabs $f10, $f18
|
|
+ fbne $f26, $End
|
|
+
|
|
+ LD $f17, 0 * SIZE(XX)
|
|
+ ldi $0, 1($0)
|
|
+ fabs $f11, $f19
|
|
+ fbne $f27, $End
|
|
+
|
|
+ addl XX, INCX, XX
|
|
+ ldi $0, 1($0)
|
|
+ fabs $f12, $f20
|
|
+ fbne $f28, $End
|
|
+
|
|
+ ldi $0, 1($0)
|
|
+ fabs $f13, $f21
|
|
+ fbne $f29, $End
|
|
+ bgt $1, $L22
|
|
+ .align 4
|
|
+
|
|
+$L23:
|
|
+ fabs $f14, $f22
|
|
+ fcmpeq $f0, $f18, $f2
|
|
+ fabs $f15, $f23
|
|
+ fcmpeq $f0, $f19, $f3
|
|
+
|
|
+ fabs $f16, $f24
|
|
+ fcmpeq $f0, $f20, $f4
|
|
+ fabs $f17, $f25
|
|
+ fcmpeq $f0, $f21, $f5
|
|
+
|
|
+ fcmpeq $f0, $f22, $f26
|
|
+ ldi $0, 1($0)
|
|
+ unop
|
|
+ fbne $f2, $End
|
|
+
|
|
+ fcmpeq $f0, $f23, $f27
|
|
+ ldi $0, 1($0)
|
|
+ unop
|
|
+ fbne $f3, $End
|
|
+
|
|
+ fcmpeq $f0, $f24, $f28
|
|
+ ldi $0, 1($0)
|
|
+ unop
|
|
+ fbne $f4, $End
|
|
+
|
|
+ fcmpeq $f0, $f25, $f29
|
|
+ ldi $0, 1($0)
|
|
+ unop
|
|
+ fbne $f5, $End
|
|
+
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f26, $End
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f27, $End
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f28, $End
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f29, $End
|
|
+ .align 4
|
|
+
|
|
+$L40:
|
|
+ LD $f20, 0 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ fabs $f20, $f25
|
|
+ fcmpeq $f0, $f25, $f29
|
|
+
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f29, $End
|
|
+ br $31, $L40
|
|
+ .align 4
|
|
+
|
|
+$End:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 8($sp)
|
|
+ fldd $f4, 16($sp)
|
|
+ fldd $f5, 24($sp)
|
|
+
|
|
+ fldd $f6, 32($sp)
|
|
+ ldi $sp, STACKSIZE($sp)
|
|
+ ret
|
|
+
|
|
+ EPILOGUE
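The SIMD variant processes 16 elements per trip through $MainLoop: four vector loads, a vcpys against $f31 that clears the sign bits (a vector fabs), and a VCMPLT/vfseleq pair that keeps four running per-lane maxima in $f0..$f3; those are then folded across vectors and finally across lanes with vextf before the index search. A plain-C sketch of that reduction pattern, assuming unit stride and a length that is at least 16 and a multiple of 16, with a 4x4 array standing in for the four vector registers.

#include <math.h>

/* Illustrative reduction pattern only; not the kernel's actual entry point. */
static double amax_lanes_sketch(long n, const double *x)
{
    double acc[4][4];                          /* stands in for $f0..$f3 */
    for (int v = 0; v < 4; v++)
        for (int l = 0; l < 4; l++)
            acc[v][l] = fabs(x[4 * v + l]);

    for (long i = 16; i < n; i += 16)          /* 16 elements per $MainLoop trip */
        for (int v = 0; v < 4; v++)
            for (int l = 0; l < 4; l++) {
                double val = fabs(x[i + 4 * v + l]);
                if (acc[v][l] < val)           /* VCMPLT + vfseleq */
                    acc[v][l] = val;
            }

    double best = acc[0][0];                   /* fold vectors, then lanes (vextf) */
    for (int v = 0; v < 4; v++)
        for (int l = 0; l < 4; l++)
            if (best < acc[v][l]) best = acc[v][l];
    return best;
}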
diff --git a/kernel/sw_64/imax.S b/kernel/sw_64/imax.S
|
|
new file mode 100644
|
|
index 0000000..b0cf5c8
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/imax.S
|
|
@@ -0,0 +1,351 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+#define XX $19
|
|
+
|
|
+#ifndef USE_MIN
|
|
+#define CMPLT(a, b) cmptlt a, b
|
|
+#else
|
|
+#define CMPLT(a, b) cmptlt b, a
|
|
+#endif
|
|
+
|
|
+#define STACKSIZE 8 * 8
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+
|
|
+ clr $0
|
|
+ mov X, XX
|
|
+ .align 4
|
|
+
|
|
+ cmplt $31, N, $2
|
|
+ cmplt $31, INCX, $3
|
|
+ SXADDQ INCX, $31, INCX
|
|
+ and $2, $3, $2
|
|
+
|
|
+ sra N, 3, $1
|
|
+ fclr $f0
|
|
+ unop
|
|
+ beq $2, $End # if (n <= 0) or (incx <= 0) return
|
|
+ .align 4
|
|
+
|
|
+ LD $f0, 0 * SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+ ble $1, $L15
|
|
+ .align 4
|
|
+
|
|
+ fmov $f0, $f1
|
|
+ addq X, INCX, X
|
|
+ fmov $f0, $f10
|
|
+ lda $1, -1($1)
|
|
+
|
|
+ LD $f21, 0 * SIZE(X)
|
|
+ fmov $f0, $f11
|
|
+ addq X, INCX, X
|
|
+ fmov $f0, $f12
|
|
+
|
|
+ LD $f22, 0 * SIZE(X)
|
|
+ fmov $f0, $f13
|
|
+ addq X, INCX, X
|
|
+ fmov $f0, $f14
|
|
+
|
|
+ LD $f23, 0 * SIZE(X)
|
|
+ fmov $f0, $f15
|
|
+ addq X, INCX, X
|
|
+ fmov $f0, $f20
|
|
+
|
|
+ LD $f24, 0 * SIZE(X)
|
|
+ addq X, INCX, X
|
|
+ LD $f25, 0 * SIZE(X)
|
|
+ addq X, INCX, X
|
|
+ LD $f26, 0 * SIZE(X)
|
|
+ addq X, INCX, X
|
|
+ LD $f27, 0 * SIZE(X)
|
|
+ addq X, INCX, X
|
|
+
|
|
+ CMPLT($f0, $f20), $f16
|
|
+ CMPLT($f1, $f21), $f17
|
|
+ CMPLT($f10, $f22), $f18
|
|
+ CMPLT($f11, $f23), $f19
|
|
+
|
|
+ ble $1, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ fcmovne $f16, $f20, $f0
|
|
+ LD $f20, 0 * SIZE(X)
|
|
+ CMPLT($f12, $f24), $f16
|
|
+ addq X, INCX, X
|
|
+
|
|
+ fcmovne $f17, $f21, $f1
|
|
+ LD $f21, 0 * SIZE(X)
|
|
+ CMPLT($f13, $f25), $f17
|
|
+ addq X, INCX, X
|
|
+
|
|
+ fcmovne $f18, $f22, $f10
|
|
+ LD $f22, 0 * SIZE(X)
|
|
+ CMPLT($f14, $f26), $f18
|
|
+ addq X, INCX, X
|
|
+
|
|
+ fcmovne $f19, $f23, $f11
|
|
+ LD $f23, 0 * SIZE(X)
|
|
+ CMPLT($f15, $f27), $f19
|
|
+ addq X, INCX, X
|
|
+
|
|
+ fcmovne $f16, $f24, $f12
|
|
+ LD $f24, 0 * SIZE(X)
|
|
+ CMPLT($f0, $f20), $f16
|
|
+ addq X, INCX, X
|
|
+
|
|
+ fcmovne $f17, $f25, $f13
|
|
+ LD $f25, 0 * SIZE(X)
|
|
+ CMPLT($f1, $f21), $f17
|
|
+ addq X, INCX, X
|
|
+
|
|
+ fcmovne $f18, $f26, $f14
|
|
+ LD $f26, 0 * SIZE(X)
|
|
+ CMPLT($f10, $f22), $f18
|
|
+ addq X, INCX, X
|
|
+
|
|
+ fcmovne $f19, $f27, $f15
|
|
+ LD $f27, 0 * SIZE(X)
|
|
+ CMPLT($f11, $f23), $f19
|
|
+ lda $1, -1($1) # i --
|
|
+
|
|
+ addq X, INCX, X
|
|
+ unop
|
|
+ unop
|
|
+ bgt $1,$L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ fcmovne $f16, $f20, $f0
|
|
+ CMPLT($f12, $f24), $f16
|
|
+
|
|
+ fcmovne $f17, $f21, $f1
|
|
+ CMPLT($f13, $f25), $f17
|
|
+
|
|
+ fcmovne $f18, $f22, $f10
|
|
+ CMPLT($f14, $f26), $f18
|
|
+
|
|
+ fcmovne $f19, $f23, $f11
|
|
+ CMPLT($f15, $f27), $f19
|
|
+
|
|
+ fcmovne $f16, $f24, $f12
|
|
+ CMPLT($f0, $f1), $f16
|
|
+ fcmovne $f17, $f25, $f13
|
|
+ CMPLT($f10, $f11), $f17
|
|
+
|
|
+ fcmovne $f18, $f26, $f14
|
|
+ CMPLT($f12, $f13), $f18
|
|
+ fcmovne $f19, $f27, $f15
|
|
+ CMPLT($f14, $f15), $f19
|
|
+
|
|
+ fcmovne $f16, $f1, $f0
|
|
+ fcmovne $f17, $f11, $f10
|
|
+ fcmovne $f18, $f13, $f12
|
|
+ fcmovne $f19, $f15, $f14
|
|
+
|
|
+ CMPLT($f0, $f10), $f16
|
|
+ CMPLT($f12, $f14), $f17
|
|
+
|
|
+ fcmovne $f16, $f10, $f0
|
|
+ fcmovne $f17, $f14, $f12
|
|
+
|
|
+ CMPLT($f0, $f12), $f16
|
|
+ fcmovne $f16, $f12, $f0
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and N, 7, $1
|
|
+ unop
|
|
+ unop
|
|
+ ble $1, $L20
|
|
+ .align 4
|
|
+
|
|
+$L16:
|
|
+ LD $f20, 0 * SIZE(X)
|
|
+ addq X, INCX, X
|
|
+
|
|
+ CMPLT($f0, $f20), $f16
|
|
+ fcmovne $f16, $f20, $f0
|
|
+ lda $1, -1($1) # i --
|
|
+ bgt $1, $L16
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ sra N, 3, $1
|
|
+ ble $1, $L40
|
|
+ .align 4
|
|
+
|
|
+ LD $f10, 0 * SIZE(XX)
|
|
+ addq XX, INCX, XX
|
|
+ LD $f11, 0 * SIZE(XX)
|
|
+ addq XX, INCX, XX
|
|
+
|
|
+ LD $f12, 0 * SIZE(XX)
|
|
+ addq XX, INCX, XX
|
|
+ LD $f13, 0 * SIZE(XX)
|
|
+ addq XX, INCX, XX
|
|
+
|
|
+ LD $f14, 0 * SIZE(XX)
|
|
+ addq XX, INCX, XX
|
|
+ LD $f15, 0 * SIZE(XX)
|
|
+ addq XX, INCX, XX
|
|
+
|
|
+ LD $f16, 0 * SIZE(XX)
|
|
+ addq XX, INCX, XX
|
|
+ LD $f17, 0 * SIZE(XX)
|
|
+ addq XX, INCX, XX
|
|
+
|
|
+ cmpteq $f0, $f10, $f20
|
|
+ cmpteq $f0, $f11, $f21
|
|
+ cmpteq $f0, $f12, $f22
|
|
+ cmpteq $f0, $f13, $f23
|
|
+
|
|
+ lda $1, -1($1)
|
|
+ ble $1, $L23
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ LD $f10, 0 * SIZE(XX)
|
|
+ cmpteq $f0, $f14, $f24
|
|
+ lda $0, 1($0)
|
|
+ addq XX, INCX, XX
|
|
+ fbne $f20, $End
|
|
+
|
|
+ LD $f11, 0 * SIZE(XX)
|
|
+ cmpteq $f0, $f15, $f25
|
|
+ lda $0, 1($0)
|
|
+ addq XX, INCX, XX
|
|
+ fbne $f21, $End
|
|
+
|
|
+ LD $f12, 0 * SIZE(XX)
|
|
+ cmpteq $f0, $f16, $f26
|
|
+ lda $0, 1($0)
|
|
+ addq XX, INCX, XX
|
|
+ fbne $f22, $End
|
|
+
|
|
+ LD $f13, 0 * SIZE(XX)
|
|
+ cmpteq $f0, $f17, $f27
|
|
+ lda $0, 1($0)
|
|
+ addq XX, INCX, XX
|
|
+ fbne $f23, $End
|
|
+
|
|
+ LD $f14, 0 * SIZE(XX)
|
|
+ cmpteq $f0, $f10, $f20
|
|
+ lda $0, 1($0)
|
|
+ addq XX, INCX, XX
|
|
+ fbne $f24, $End
|
|
+
|
|
+ LD $f15, 0 * SIZE(XX)
|
|
+ cmpteq $f0, $f11, $f21
|
|
+ lda $0, 1($0)
|
|
+ addq XX, INCX, XX
|
|
+ fbne $f25, $End
|
|
+
|
|
+ LD $f16, 0 * SIZE(XX)
|
|
+ lda $1, -1($1) # i --
|
|
+ cmpteq $f0, $f12, $f22
|
|
+ lda $0, 1($0)
|
|
+ addq XX, INCX, XX
|
|
+ fbne $f26, $End
|
|
+
|
|
+ LD $f17, 0 * SIZE(XX)
|
|
+ cmpteq $f0, $f13, $f23
|
|
+ lda $0, 1($0)
|
|
+ addq XX, INCX, XX
|
|
+ fbne $f27, $End
|
|
+
|
|
+ bgt $1, $L22
|
|
+ .align 4
|
|
+
|
|
+$L23:
|
|
+ lda $0, 1($0)
|
|
+ cmpteq $f0, $f14, $f24
|
|
+ unop
|
|
+ fbne $f20, $End
|
|
+
|
|
+ lda $0, 1($0)
|
|
+ cmpteq $f0, $f15, $f25
|
|
+ unop
|
|
+ fbne $f21, $End
|
|
+
|
|
+ lda $0, 1($0)
|
|
+ cmpteq $f0, $f16, $f26
|
|
+ unop
|
|
+ fbne $f22, $End
|
|
+
|
|
+ lda $0, 1($0)
|
|
+ cmpteq $f0, $f17, $f27
|
|
+ unop
|
|
+ fbne $f23, $End
|
|
+
|
|
+ lda $0, 1($0)
|
|
+ fbne $f24, $End
|
|
+ lda $0, 1($0)
|
|
+ fbne $f25, $End
|
|
+ lda $0, 1($0)
|
|
+ fbne $f26, $End
|
|
+ lda $0, 1($0)
|
|
+ fbne $f27, $End
|
|
+ .align 4
|
|
+
|
|
+$L40:
|
|
+ LD $f20, 0 * SIZE(XX)
|
|
+ addq XX, INCX, XX
|
|
+
|
|
+ cmpteq $f0, $f20, $f29
|
|
+
|
|
+ lda $0, 1($0)
|
|
+ fbne $f29, $End
|
|
+ br $31, $L40
|
|
+ .align 4
|
|
+
|
|
+$End:
|
|
+ ret
|
|
+
|
|
+ EPILOGUE
|
|
diff --git a/kernel/sw_64/imax.c b/kernel/sw_64/imax.c
|
|
new file mode 100644
|
|
index 0000000..5072dd1
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/imax.c
|
|
@@ -0,0 +1,69 @@
|
|
+/***************************************************************************
|
|
+Copyright (c) 2013, The OpenBLAS Project
|
|
+All rights reserved.
|
|
+Redistribution and use in source and binary forms, with or without
|
|
+modification, are permitted provided that the following conditions are
|
|
+met:
|
|
+1. Redistributions of source code must retain the above copyright
|
|
+notice, this list of conditions and the following disclaimer.
|
|
+2. Redistributions in binary form must reproduce the above copyright
|
|
+notice, this list of conditions and the following disclaimer in
|
|
+the documentation and/or other materials provided with the
|
|
+distribution.
|
|
+3. Neither the name of the OpenBLAS project nor the names of
|
|
+its contributors may be used to endorse or promote products
|
|
+derived from this software without specific prior written permission.
|
|
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
|
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
+*****************************************************************************/
|
|
+
|
|
+
|
|
+/**************************************************************************************
|
|
+* 2013/09/14 Saar
|
|
+* BLASTEST float : NoTest
|
|
+* BLASTEST double : NoTest
|
|
+* CTEST : NoTest
|
|
+* TEST : NoTest
|
|
+*
|
|
+**************************************************************************************/
|
|
+
|
|
+#include "common.h"
|
|
+#include <math.h>
|
|
+
|
|
+
|
|
+
|
|
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|
+{
|
|
+ BLASLONG i=0;
|
|
+ BLASLONG ix=0;
|
|
+ FLOAT maxf=0.0;
|
|
+ BLASLONG max=0;
|
|
+
|
|
+ if (n <= 0 || inc_x <= 0) return(max);
|
|
+
|
|
+ maxf=x[0];
|
|
+ ix += inc_x;
|
|
+ i++;
|
|
+
|
|
+ while(i < n)
|
|
+ {
|
|
+ if( x[ix] > maxf )
|
|
+ {
|
|
+ max = i;
|
|
+ maxf = x[ix];
|
|
+ }
|
|
+ ix += inc_x;
|
|
+ i++;
|
|
+ }
|
|
+ return(max+1);
|
|
+}
|
|
+
|
|
+
|
|
diff --git a/kernel/sw_64/imin.c b/kernel/sw_64/imin.c
|
|
new file mode 100644
|
|
index 0000000..ffc6522
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/imin.c
|
|
@@ -0,0 +1,67 @@
|
|
+/***************************************************************************
|
|
+Copyright (c) 2013, The OpenBLAS Project
|
|
+All rights reserved.
|
|
+Redistribution and use in source and binary forms, with or without
|
|
+modification, are permitted provided that the following conditions are
|
|
+met:
|
|
+1. Redistributions of source code must retain the above copyright
|
|
+notice, this list of conditions and the following disclaimer.
|
|
+2. Redistributions in binary form must reproduce the above copyright
|
|
+notice, this list of conditions and the following disclaimer in
|
|
+the documentation and/or other materials provided with the
|
|
+distribution.
|
|
+3. Neither the name of the OpenBLAS project nor the names of
|
|
+its contributors may be used to endorse or promote products
|
|
+derived from this software without specific prior written permission.
|
|
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
|
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
+*****************************************************************************/
|
|
+
|
|
+
|
|
+/**************************************************************************************
|
|
+* 2013/08/19 Saar
|
|
+* BLASTEST float
|
|
+* BLASTEST double
|
|
+*
|
|
+**************************************************************************************/
|
|
+
|
|
+#include "common.h"
|
|
+#include <math.h>
|
|
+
|
|
+
|
|
+
|
|
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|
+{
|
|
+ BLASLONG i=0;
|
|
+ BLASLONG ix=0;
|
|
+ FLOAT minf=0.0;
|
|
+ BLASLONG min=0;
|
|
+
|
|
+ if (n <= 0 || inc_x <= 0) return(min);
|
|
+
|
|
+ minf=x[0];
|
|
+ ix += inc_x;
|
|
+ i++;
|
|
+
|
|
+ while(i < n)
|
|
+ {
|
|
+ if( x[ix] < minf )
|
|
+ {
|
|
+ min = i;
|
|
+ minf = x[ix];
|
|
+ }
|
|
+ ix += inc_x;
|
|
+ i++;
|
|
+ }
|
|
+ return(min+1);
|
|
+}
|
|
+
|
|
+
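Both C kernels follow the Fortran BLAS convention: the return value is a 1-based index, and 0 is returned when n <= 0 or inc_x <= 0. They also compare signed values rather than absolute values, which is what distinguishes i?max/i?min from i?amax/i?amin. A hedged usage sketch follows; the symbol name imax_kernel is made up here and merely stands in for whatever CNAME expands to in a given build, and long/double stand in for BLASLONG/FLOAT.

#include <stdio.h>

/* Hypothetical symbol name; CNAME is resolved by the build system. */
extern long imax_kernel(long n, double *x, long inc_x);

int main(void)
{
    double x[5] = { 1.0, 7.0, -9.0, 7.0, 3.0 };
    /* Signed comparison: 7.0 at position 2 wins (an amax kernel would pick -9.0). */
    printf("%ld\n", imax_kernel(5, x, 1));   /* expected: 2 */
    printf("%ld\n", imax_kernel(0, x, 1));   /* expected: 0 */
    return 0;
}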
diff --git a/kernel/sw_64/izamax.S b/kernel/sw_64/izamax.S
|
|
new file mode 100644
|
|
index 0000000..5ccc60e
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/izamax.S
|
|
@@ -0,0 +1,429 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+#define XX $19
|
|
+
|
|
+#ifndef USE_MIN
|
|
+#define CMPLT(a, b) fcmplt a, b
|
|
+#else
|
|
+#define CMPLT(a, b) fcmplt b, a
|
|
+#endif
|
|
+
|
|
+#define STACKSIZE 8 * 8
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+
|
|
+ ldi $sp, -STACKSIZE($sp)
|
|
+
|
|
+ fstd $f2, 0($sp)
|
|
+ fclr $f16
|
|
+ cmplt $31, N, $2
|
|
+ unop
|
|
+
|
|
+ fstd $f3, 8($sp)
|
|
+ fclr $f17
|
|
+ cmplt $31, INCX, $3
|
|
+ unop
|
|
+
|
|
+ fstd $f4, 16($sp)
|
|
+ fclr $f18
|
|
+ SXADDQ INCX, $31, INCX
|
|
+ unop
|
|
+
|
|
+ fstd $f5, 24($sp)
|
|
+ fclr $f19
|
|
+ and $2, $3, $2
|
|
+ clr $0
|
|
+
|
|
+ fstd $f6, 32($sp)
|
|
+ mov X, XX
|
|
+
|
|
+ fstd $f7, 40($sp)
|
|
+ fstd $f8, 48($sp)
|
|
+ fstd $f9, 56($sp)
|
|
+
|
|
+ fclr $f0
|
|
+ beq $2, $End # if (n <= 0) or (incx <= 0) return
|
|
+ .align 4
|
|
+
|
|
+ LD $f20, 0 * SIZE(X)
|
|
+ LD $f21, 1 * SIZE(X)
|
|
+ sra N, 2, $1
|
|
+ addl INCX, INCX, INCX
|
|
+
|
|
+ fabs $f20, $f20
|
|
+ fabs $f21, $f21
|
|
+ faddd $f20, $f21, $f0
|
|
+ ble $1, $L15
|
|
+ .align 4
|
|
+
|
|
+ ldi $1, -1($1)
|
|
+ unop
|
|
+ addl X, INCX, X
|
|
+ unop
|
|
+
|
|
+ LD $f22, 0 * SIZE(X)
|
|
+ fmov $f0, $f1
|
|
+ LD $f23, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD $f24, 0 * SIZE(X)
|
|
+ fmov $f0, $f2
|
|
+ LD $f25, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD $f26, 0 * SIZE(X)
|
|
+ fmov $f0, $f3
|
|
+ LD $f27, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fabs $f20, $f8
|
|
+ fabs $f21, $f9
|
|
+ fabs $f22, $f10
|
|
+ fabs $f23, $f11
|
|
+
|
|
+ fabs $f24, $f12
|
|
+ fabs $f25, $f13
|
|
+ fabs $f26, $f14
|
|
+ fabs $f27, $f15
|
|
+
|
|
+ ble $1, $L14
|
|
+ .align 4
|
|
+
|
|
+ LD $f20, 0 * SIZE(X)
|
|
+ LD $f21, 1 * SIZE(X)
|
|
+ ldi $1, -1($1)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD $f22, 0 * SIZE(X)
|
|
+ LD $f23, 1 * SIZE(X)
|
|
+ unop
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD $f24, 0 * SIZE(X)
|
|
+ LD $f25, 1 * SIZE(X)
|
|
+ unop
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD $f26, 0 * SIZE(X)
|
|
+ LD $f27, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ ble $1, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ faddd $f8, $f9, $f16
|
|
+ unop
|
|
+ fabs $f20, $f8
|
|
+ fillcs 64 * SIZE(X)
|
|
+
|
|
+ faddd $f10, $f11, $f17
|
|
+ unop
|
|
+ fabs $f21, $f9
|
|
+ LD $f20, 0 * SIZE(X)
|
|
+
|
|
+ faddd $f12, $f13, $f18
|
|
+ LD $f21, 1 * SIZE(X)
|
|
+ fabs $f22, $f10
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd $f14, $f15, $f19
|
|
+ LD $f22, 0 * SIZE(X)
|
|
+ fabs $f23, $f11
|
|
+ unop
|
|
+
|
|
+ CMPLT($f0, $f16), $f4
|
|
+ LD $f23, 1 * SIZE(X)
|
|
+ fabs $f24, $f12
|
|
+ addl X, INCX, X
|
|
+
|
|
+ CMPLT($f1, $f17), $f5
|
|
+ LD $f24, 0 * SIZE(X)
|
|
+ fabs $f25, $f13
|
|
+ unop
|
|
+
|
|
+ CMPLT($f2, $f18), $f6
|
|
+ LD $f25, 1 * SIZE(X)
|
|
+ fabs $f26, $f14
|
|
+ addl X, INCX, X
|
|
+
|
|
+ CMPLT($f3, $f19), $f7
|
|
+ LD $f26, 0 * SIZE(X)
|
|
+ fabs $f27, $f15
|
|
+ unop
|
|
+
|
|
+ fselne $f4, $f16, $f0, $f0
|
|
+ LD $f27, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ ldi $1, -1($1) # i --
|
|
+
|
|
+ fselne $f5, $f17, $f1, $f1
|
|
+ fselne $f6, $f18, $f2, $f2
|
|
+ fselne $f7, $f19, $f3, $f3
|
|
+ bgt $1,$L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ faddd $f8, $f9, $f16
|
|
+ fabs $f20, $f8
|
|
+
|
|
+ faddd $f10, $f11, $f17
|
|
+ fabs $f21, $f9
|
|
+
|
|
+ faddd $f12, $f13, $f18
|
|
+ fabs $f22, $f10
|
|
+
|
|
+ faddd $f14, $f15, $f19
|
|
+ fabs $f23, $f11
|
|
+
|
|
+ CMPLT($f0, $f16), $f4
|
|
+ fabs $f24, $f12
|
|
+
|
|
+ CMPLT($f1, $f17), $f5
|
|
+ fabs $f25, $f13
|
|
+
|
|
+ CMPLT($f2, $f18), $f6
|
|
+ fabs $f26, $f14
|
|
+ CMPLT($f3, $f19), $f7
|
|
+ fabs $f27, $f15
|
|
+
|
|
+ fselne $f4, $f16, $f0, $f0
|
|
+ fselne $f5, $f17, $f1, $f1
|
|
+ fselne $f6, $f18, $f2, $f2
|
|
+ fselne $f7, $f19, $f3, $f3
|
|
+ .align 4
|
|
+
|
|
+$L14:
|
|
+ faddd $f8, $f9, $f16
|
|
+ faddd $f10, $f11, $f17
|
|
+ faddd $f12, $f13, $f18
|
|
+ faddd $f14, $f15, $f19
|
|
+
|
|
+ CMPLT($f0, $f16), $f4
|
|
+ CMPLT($f1, $f17), $f5
|
|
+ CMPLT($f2, $f18), $f6
|
|
+ CMPLT($f3, $f19), $f7
|
|
+
|
|
+ fselne $f4, $f16, $f0, $f0
|
|
+ fselne $f5, $f17, $f1, $f1
|
|
+ fselne $f6, $f18, $f2, $f2
|
|
+ fselne $f7, $f19, $f3, $f3
|
|
+
|
|
+ CMPLT($f0, $f1), $f16
|
|
+ CMPLT($f2, $f3), $f17
|
|
+
|
|
+ fselne $f16, $f1, $f0, $f0
|
|
+ fselne $f17, $f3, $f2, $f2
|
|
+
|
|
+ CMPLT($f0, $f2), $f16
|
|
+ fselne $f16, $f2, $f0, $f0
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and N, 3, $1
|
|
+ unop
|
|
+ unop
|
|
+ ble $1, $L20
|
|
+ .align 4
|
|
+
|
|
+$L16:
|
|
+ LD $f20, 0 * SIZE(X)
|
|
+ LD $f21, 1 * SIZE(X)
|
|
+ unop
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fabs $f20, $f29
|
|
+ fabs $f21, $f30
|
|
+ faddd $f29, $f30, $f24
|
|
+ fmov $f24,$f29
|
|
+
|
|
+ CMPLT($f0, $f29), $f16
|
|
+ fselne $f16, $f29, $f0, $f0
|
|
+
|
|
+ ldi $1, -1($1) # i --
|
|
+ bgt $1, $L16
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ sra N, 2, $1
|
|
+ ble $1, $L40
|
|
+ .align 4
|
|
+
|
|
+ LD $f10, 0 * SIZE(XX)
|
|
+ LD $f11, 1 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ LD $f12, 0 * SIZE(XX)
|
|
+ LD $f13, 1 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ LD $f14, 0 * SIZE(XX)
|
|
+ LD $f15, 1 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ LD $f16, 0 * SIZE(XX)
|
|
+ LD $f17, 1 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ fabs $f10, $f18
|
|
+ fabs $f11, $f19
|
|
+ fabs $f12, $f20
|
|
+ fabs $f13, $f21
|
|
+
|
|
+ ldi $1, -1($1)
|
|
+ ble $1, $L23
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ LD $f10, 0 * SIZE(XX)
|
|
+ fabs $f14, $f22
|
|
+ LD $f11, 1 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ LD $f12, 0 * SIZE(XX)
|
|
+ fabs $f15, $f23
|
|
+ LD $f13, 1 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ LD $f14, 0 * SIZE(XX)
|
|
+ fabs $f16, $f24
|
|
+ LD $f15, 1 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ LD $f16, 0 * SIZE(XX)
|
|
+ fabs $f17, $f25
|
|
+ LD $f17, 1 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ faddd $f18, $f19, $f4
|
|
+ faddd $f20, $f21, $f5
|
|
+ faddd $f22, $f23, $f6
|
|
+ faddd $f24, $f25, $f7
|
|
+
|
|
+ fcmpeq $f0, $f4, $f26
|
|
+ fcmpeq $f0, $f5, $f27
|
|
+ fcmpeq $f0, $f6, $f28
|
|
+ fcmpeq $f0, $f7, $f29
|
|
+
|
|
+ fabs $f10, $f18
|
|
+ ldi $0, 1($0)
|
|
+ ldi $1, -1($1) # i --
|
|
+ fbne $f26, $End
|
|
+
|
|
+ fabs $f11, $f19
|
|
+ ldi $0, 1($0)
|
|
+ unop
|
|
+ fbne $f27, $End
|
|
+
|
|
+ fabs $f12, $f20
|
|
+ ldi $0, 1($0)
|
|
+ unop
|
|
+ fbne $f28, $End
|
|
+
|
|
+ fabs $f13, $f21
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f29, $End
|
|
+ bgt $1, $L22
|
|
+ .align 4
|
|
+
|
|
+$L23:
|
|
+ fabs $f14, $f22
|
|
+ fabs $f15, $f23
|
|
+ fabs $f16, $f24
|
|
+ fabs $f17, $f25
|
|
+
|
|
+ faddd $f18, $f19, $f4
|
|
+ faddd $f20, $f21, $f5
|
|
+ faddd $f22, $f23, $f6
|
|
+ faddd $f24, $f25, $f7
|
|
+
|
|
+ fcmpeq $f0, $f4, $f26
|
|
+ fcmpeq $f0, $f5, $f27
|
|
+ fcmpeq $f0, $f6, $f28
|
|
+ fcmpeq $f0, $f7, $f29
|
|
+
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f26, $End
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f27, $End
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f28, $End
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f29, $End
|
|
+ .align 4
|
|
+
|
|
+$L40:
|
|
+ LD $f10, 0 * SIZE(XX)
|
|
+ LD $f11, 1 * SIZE(XX)
|
|
+
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ fabs $f10, $f18
|
|
+ fabs $f11, $f19
|
|
+
|
|
+ faddd $f18, $f19, $f2
|
|
+ fmov $f2,$f18
|
|
+ fcmpeq $f0, $f18, $f2
|
|
+
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f2, $End
|
|
+ br $31, $L40
|
|
+ .align 4
|
|
+
|
|
+$End:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 8($sp)
|
|
+ fldd $f4, 16($sp)
|
|
+ fldd $f5, 24($sp)
|
|
+
|
|
+ fldd $f6, 32($sp)
|
|
+ fldd $f7, 40($sp)
|
|
+ fldd $f8, 48($sp)
|
|
+ fldd $f9, 56($sp)
|
|
+ ldi $sp, STACKSIZE($sp)
|
|
+ ret
|
|
+
|
|
+ EPILOGUE
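For the complex kernel the magnitude used for ranking is |Re| + |Im| (two fabs plus one faddd per element), the customary i?amax measure in BLAS, not the Euclidean modulus. A single-pass C sketch that produces the same index the two-pass assembly reports, assuming double precision and interleaved (re, im) storage; the name izamax_sketch is illustrative only.

#include <math.h>

/* Illustrative only: complex amax index by the |Re| + |Im| measure.
   Returns a 1-based index, or 0 when n <= 0 or incx <= 0. */
static long izamax_sketch(long n, const double *x, long incx)
{
    if (n <= 0 || incx <= 0) return 0;

    long best_i = 0;
    double best = fabs(x[0]) + fabs(x[1]);
    for (long i = 1; i < n; i++) {
        const double *p = x + 2 * i * incx;    /* incx counts complex elements */
        double v = fabs(p[0]) + fabs(p[1]);
        if (best < v) { best = v; best_i = i; }
    }
    return best_i + 1;
}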
diff --git a/kernel/sw_64/izamax.S.bak b/kernel/sw_64/izamax.S.bak
|
|
new file mode 100644
|
|
index 0000000..34e4c88
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/izamax.S.bak
|
|
@@ -0,0 +1,427 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+#define XX $19
|
|
+
|
|
+#ifndef USE_MIN
|
|
+#define CMPLT(a, b) fcmplt a, b
|
|
+#else
|
|
+#define CMPLT(a, b) fcmplt b, a
|
|
+#endif
|
|
+
|
|
+#define STACKSIZE 8 * 8
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+
|
|
+ ldi $sp, -STACKSIZE($sp)
|
|
+
|
|
+ fstd $f2, 0($sp)
|
|
+ fclr $f16
|
|
+ cmplt $31, N, $2
|
|
+ unop
|
|
+
|
|
+ fstd $f3, 8($sp)
|
|
+ fclr $f17
|
|
+ cmplt $31, INCX, $3
|
|
+ unop
|
|
+
|
|
+ fstd $f4, 16($sp)
|
|
+ fclr $f18
|
|
+ SXADDQ INCX, $31, INCX
|
|
+ unop
|
|
+
|
|
+ fstd $f5, 24($sp)
|
|
+ fclr $f19
|
|
+ and $2, $3, $2
|
|
+ clr $0
|
|
+
|
|
+ fstd $f6, 32($sp)
|
|
+ mov X, XX
|
|
+
|
|
+ fstd $f7, 40($sp)
|
|
+ fstd $f8, 48($sp)
|
|
+ fstd $f9, 56($sp)
|
|
+
|
|
+ fclr $f0
|
|
+ beq $2, $End # if (n <= 0) or (incx <= 0) return
|
|
+ .align 4
|
|
+
|
|
+ LD $f20, 0 * SIZE(X)
|
|
+ LD $f21, 1 * SIZE(X)
|
|
+ sra N, 2, $1
|
|
+ addl INCX, INCX, INCX
|
|
+
|
|
+ fabs $f20, $f20
|
|
+ fabs $f21, $f21
|
|
+ faddd $f20, $f21, $f0
|
|
+ ble $1, $L15
|
|
+ .align 4
|
|
+
|
|
+ ldi $1, -1($1)
|
|
+ unop
|
|
+ addl X, INCX, X
|
|
+ unop
|
|
+
|
|
+ LD $f22, 0 * SIZE(X)
|
|
+ fmov $f0, $f1
|
|
+ LD $f23, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD $f24, 0 * SIZE(X)
|
|
+ fmov $f0, $f2
|
|
+ LD $f25, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD $f26, 0 * SIZE(X)
|
|
+ fmov $f0, $f3
|
|
+ LD $f27, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fabs $f20, $f8
|
|
+ fabs $f21, $f9
|
|
+ fabs $f22, $f10
|
|
+ fabs $f23, $f11
|
|
+
|
|
+ fabs $f24, $f12
|
|
+ fabs $f25, $f13
|
|
+ fabs $f26, $f14
|
|
+ fabs $f27, $f15
|
|
+
|
|
+ ble $1, $L14
|
|
+ .align 4
|
|
+
|
|
+ LD $f20, 0 * SIZE(X)
|
|
+ LD $f21, 1 * SIZE(X)
|
|
+ ldi $1, -1($1)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD $f22, 0 * SIZE(X)
|
|
+ LD $f23, 1 * SIZE(X)
|
|
+ unop
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD $f24, 0 * SIZE(X)
|
|
+ LD $f25, 1 * SIZE(X)
|
|
+ unop
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD $f26, 0 * SIZE(X)
|
|
+ LD $f27, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ ble $1, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ faddd $f8, $f9, $f16
|
|
+ unop
|
|
+ fabs $f20, $f8
|
|
+ fillcs 64 * SIZE(X)
|
|
+
|
|
+ faddd $f10, $f11, $f17
|
|
+ unop
|
|
+ fabs $f21, $f9
|
|
+ LD $f20, 0 * SIZE(X)
|
|
+
|
|
+ faddd $f12, $f13, $f18
|
|
+ LD $f21, 1 * SIZE(X)
|
|
+ fabs $f22, $f10
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd $f14, $f15, $f19
|
|
+ LD $f22, 0 * SIZE(X)
|
|
+ fabs $f23, $f11
|
|
+ unop
|
|
+
|
|
+ CMPLT($f0, $f16), $f4
|
|
+ LD $f23, 1 * SIZE(X)
|
|
+ fabs $f24, $f12
|
|
+ addl X, INCX, X
|
|
+
|
|
+ CMPLT($f1, $f17), $f5
|
|
+ LD $f24, 0 * SIZE(X)
|
|
+ fabs $f25, $f13
|
|
+ unop
|
|
+
|
|
+ CMPLT($f2, $f18), $f6
|
|
+ LD $f25, 1 * SIZE(X)
|
|
+ fabs $f26, $f14
|
|
+ addl X, INCX, X
|
|
+
|
|
+ CMPLT($f3, $f19), $f7
|
|
+ LD $f26, 0 * SIZE(X)
|
|
+ fabs $f27, $f15
|
|
+ unop
|
|
+
|
|
+fselne $f4,$f16,$f0, $f0
|
|
+ LD $f27, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ ldi $1, -1($1) # i --
|
|
+
|
|
+fselne $f5,$f17,$f1, $f1
|
|
+fselne $f6,$f18,$f2, $f2
|
|
+fselne $f7,$f19,$f3, $f3
|
|
+ bgt $1,$L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ faddd $f8, $f9, $f16
|
|
+ fabs $f20, $f8
|
|
+
|
|
+ faddd $f10, $f11, $f17
|
|
+ fabs $f21, $f9
|
|
+
|
|
+ faddd $f12, $f13, $f18
|
|
+ fabs $f22, $f10
|
|
+
|
|
+ faddd $f14, $f15, $f19
|
|
+ fabs $f23, $f11
|
|
+
|
|
+ CMPLT($f0, $f16), $f4
|
|
+ fabs $f24, $f12
|
|
+
|
|
+ CMPLT($f1, $f17), $f5
|
|
+ fabs $f25, $f13
|
|
+
|
|
+ CMPLT($f2, $f18), $f6
|
|
+ fabs $f26, $f14
|
|
+ CMPLT($f3, $f19), $f7
|
|
+ fabs $f27, $f15
|
|
+
|
|
+fselne $f4,$f16,$f0, $f0
|
|
+fselne $f5,$f17,$f1, $f1
|
|
+fselne $f6,$f18,$f2, $f2
|
|
+fselne $f7,$f19,$f3, $f3
|
|
+ .align 4
|
|
+
|
|
+$L14:
|
|
+ faddd $f8, $f9, $f16
|
|
+ faddd $f10, $f11, $f17
|
|
+ faddd $f12, $f13, $f18
|
|
+ faddd $f14, $f15, $f19
|
|
+
|
|
+ CMPLT($f0, $f16), $f4
|
|
+ CMPLT($f1, $f17), $f5
|
|
+ CMPLT($f2, $f18), $f6
|
|
+ CMPLT($f3, $f19), $f7
|
|
+
|
|
+fselne $f4,$f16,$f0, $f0
|
|
+fselne $f5,$f17,$f1, $f1
|
|
+fselne $f6,$f18,$f2, $f2
|
|
+fselne $f7,$f19,$f3, $f3
|
|
+
|
|
+ CMPLT($f0, $f1), $f16
|
|
+ CMPLT($f2, $f3), $f17
|
|
+
|
|
+fselne $f16,$f1,$f0, $f0
|
|
+fselne $f17,$f3,$f2, $f2
|
|
+
|
|
+ CMPLT($f0, $f2), $f16
|
|
+fselne $f16,$f2,$f0, $f0
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and N, 3, $1
|
|
+ unop
|
|
+ unop
|
|
+ ble $1, $L20
|
|
+ .align 4
|
|
+
|
|
+$L16:
|
|
+ LD $f20, 0 * SIZE(X)
|
|
+ LD $f21, 1 * SIZE(X)
|
|
+ unop
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fabs $f20, $f29
|
|
+ fabs $f21, $f30
|
|
+ faddd $f29, $f30, $f29
|
|
+
|
|
+ CMPLT($f0, $f29), $f16
|
|
+fselne $f16,$f29,$f0, $f0
|
|
+
|
|
+ ldi $1, -1($1) # i --
|
|
+ bgt $1, $L16
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ sra N, 2, $1
|
|
+ ble $1, $L40
|
|
+ .align 4
|
|
+
|
|
+ LD $f10, 0 * SIZE(XX)
|
|
+ LD $f11, 1 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ LD $f12, 0 * SIZE(XX)
|
|
+ LD $f13, 1 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ LD $f14, 0 * SIZE(XX)
|
|
+ LD $f15, 1 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ LD $f16, 0 * SIZE(XX)
|
|
+ LD $f17, 1 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ fabs $f10, $f18
|
|
+ fabs $f11, $f19
|
|
+ fabs $f12, $f20
|
|
+ fabs $f13, $f21
|
|
+
|
|
+ ldi $1, -1($1)
|
|
+ ble $1, $L23
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ LD $f10, 0 * SIZE(XX)
|
|
+ fabs $f14, $f22
|
|
+ LD $f11, 1 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ LD $f12, 0 * SIZE(XX)
|
|
+ fabs $f15, $f23
|
|
+ LD $f13, 1 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ LD $f14, 0 * SIZE(XX)
|
|
+ fabs $f16, $f24
|
|
+ LD $f15, 1 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ LD $f16, 0 * SIZE(XX)
|
|
+ fabs $f17, $f25
|
|
+ LD $f17, 1 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ faddd $f18, $f19, $f4
|
|
+ faddd $f20, $f21, $f5
|
|
+ faddd $f22, $f23, $f6
|
|
+ faddd $f24, $f25, $f7
|
|
+
|
|
+ fcmpeq $f0, $f4, $f26
|
|
+ fcmpeq $f0, $f5, $f27
|
|
+ fcmpeq $f0, $f6, $f28
|
|
+ fcmpeq $f0, $f7, $f29
|
|
+
|
|
+ fabs $f10, $f18
|
|
+ ldi $0, 1($0)
|
|
+ ldi $1, -1($1) # i --
|
|
+ fbne $f26, $End
|
|
+
|
|
+ fabs $f11, $f19
|
|
+ ldi $0, 1($0)
|
|
+ unop
|
|
+ fbne $f27, $End
|
|
+
|
|
+ fabs $f12, $f20
|
|
+ ldi $0, 1($0)
|
|
+ unop
|
|
+ fbne $f28, $End
|
|
+
|
|
+ fabs $f13, $f21
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f29, $End
|
|
+ bgt $1, $L22
|
|
+ .align 4
|
|
+
|
|
+$L23:
|
|
+ fabs $f14, $f22
|
|
+ fabs $f15, $f23
|
|
+ fabs $f16, $f24
|
|
+ fabs $f17, $f25
|
|
+
|
|
+ faddd $f18, $f19, $f4
|
|
+ faddd $f20, $f21, $f5
|
|
+ faddd $f22, $f23, $f6
|
|
+ faddd $f24, $f25, $f7
|
|
+
|
|
+ fcmpeq $f0, $f4, $f26
|
|
+ fcmpeq $f0, $f5, $f27
|
|
+ fcmpeq $f0, $f6, $f28
|
|
+ fcmpeq $f0, $f7, $f29
|
|
+
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f26, $End
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f27, $End
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f28, $End
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f29, $End
|
|
+ .align 4
|
|
+
|
|
+$L40:
|
|
+ LD $f10, 0 * SIZE(XX)
|
|
+ LD $f11, 1 * SIZE(XX)
|
|
+
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ fabs $f10, $f18
|
|
+ fabs $f11, $f19
|
|
+
|
|
+ faddd $f18, $f19, $f18
|
|
+ fcmpeq $f0, $f18, $f2
|
|
+
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f2, $End
|
|
+ br $31, $L40
|
|
+ .align 4
|
|
+
|
|
+$End:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 8($sp)
|
|
+ fldd $f4, 16($sp)
|
|
+ fldd $f5, 24($sp)
|
|
+
|
|
+ fldd $f6, 32($sp)
|
|
+ fldd $f7, 40($sp)
|
|
+ fldd $f8, 48($sp)
|
|
+ fldd $f9, 56($sp)
|
|
+ ldi $sp, STACKSIZE($sp)
|
|
+ ret
|
|
+
|
|
+ EPILOGUE
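For reference, a minimal C sketch of the semantics the IZAMAX kernel above (and the SIMD variant added next) implements: return the 1-based index of the complex element whose |Re| + |Im| is largest (smallest when USE_MIN is defined). The function name and loop form below are illustrative only, not part of the patch.

    #include <math.h>

    /* Minimal sketch, assuming double-precision complex data stored as
     * interleaved (re, im) pairs with element stride incx; 1-based result
     * as in BLAS, 0 for an empty or invalid input. */
    static long izamax_sketch(long n, const double *x, long incx)
    {
        if (n <= 0 || incx <= 0) return 0;
        long best = 1;
        double bestval = fabs(x[0]) + fabs(x[1]);      /* |Re| + |Im| of element 0 */
        for (long i = 1; i < n; i++) {
            const double *p = x + 2 * i * incx;        /* complex element i */
            double v = fabs(p[0]) + fabs(p[1]);
            if (v > bestval) { bestval = v; best = i + 1; }
        }
        return best;
    }

The assembly follows the same two-phase idea: it first reduces the vector to the extreme |Re| + |Im| value, then rescans from XX and counts in $0 until it finds the element that compares equal.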
diff --git a/kernel/sw_64/izamax_simd.S b/kernel/sw_64/izamax_simd.S
new file mode 100644
index 0000000..8b00f60
--- /dev/null
+++ b/kernel/sw_64/izamax_simd.S
@@ -0,0 +1,609 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCHSIZE 96
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+#define XX $19
|
|
+
|
|
+#define I $2
|
|
+
|
|
+#ifndef USE_MIN
|
|
+#define CMPLT(a, b) fcmplt a, b
|
|
+#define VCMPLT(a, b) vfcmplt a, b
|
|
+#else
|
|
+#define CMPLT(a, b) fcmplt b, a
|
|
+#define VCMPLT(a, b) vfcmplt b, a
|
|
+#endif
|
|
+
|
|
+#define STACKSIZE 8 * 8
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+ .frame $sp, STACKSIZE, $26, 0
|
|
+
|
|
+ ldi $sp, -STACKSIZE($sp)
|
|
+
|
|
+ fstd $f2, 0($sp)
|
|
+ fclr $f16
|
|
+ cmplt $31, N, $2
|
|
+ unop
|
|
+
|
|
+ fstd $f3, 8($sp)
|
|
+ fclr $f17
|
|
+ cmplt $31, INCX, $3
|
|
+ unop
|
|
+
|
|
+ fstd $f4, 16($sp)
|
|
+ fclr $f18
|
|
+ SXADDQ INCX, $31, INCX
|
|
+ unop
|
|
+
|
|
+ fstd $f5, 24($sp)
|
|
+ fclr $f19
|
|
+ and $2, $3, $2
|
|
+ clr $0
|
|
+
|
|
+ fstd $f6, 32($sp)
|
|
+ mov X, XX
|
|
+
|
|
+ fstd $f7, 40($sp)
|
|
+ fstd $f8, 48($sp)
|
|
+ fstd $f9, 56($sp)
|
|
+
|
|
+ fclr $f0
|
|
+ cmpeq INCX, SIZE, $3
|
|
+	and	X, (VEC_LEN*SIZE-1), $4	# test the address of X (alignment)
|
|
+ beq $2, $End # if (n <= 0) or (incx <= 0) return
|
|
+ .align 4
|
|
+
|
|
+ bic $3, $4, $3
|
|
+ nop
|
|
+ nop
|
|
+ beq $3, $Sub
|
|
+ .align 4
|
|
+
|
|
+$Align_Access:
|
|
+/*
+   Unroll 8*2=16 reals
+*/
|
|
+#ifdef USE_MIN
|
|
+ LD $f20, 0 * SIZE(X)
|
|
+ LD $f21, 1 * SIZE(X)
|
|
+ fabs $f20, $f20
|
|
+ fabs $f21, $f21
|
|
+ ADD $f20, $f21, $f0 # init temp min result value
|
|
+#endif
|
|
+ sra N, 3, I
|
|
+ and N, 7, $3
|
|
+ addl INCX, INCX, INCX
|
|
+ ble I, $Remain
|
|
+ .align 4
|
|
+/*
|
|
+ Init max or min value
|
|
+*/
|
|
+ LD $f20, 0 * SIZE(X)
|
|
+ LD $f21, 1 * SIZE(X)
|
|
+ fabs $f20, $f20
|
|
+ fabs $f21, $f21
|
|
+
|
|
+ ADD $f20, $f21, $f4
|
|
+ nop
|
|
+ vcpyf $f4, $f0
|
|
+ vcpyf $f4, $f1
|
|
+
|
|
+
|
|
+ VLD $f22, 0*VEC_LEN*SIZE(X)
|
|
+ VLD $f23, 1*VEC_LEN*SIZE(X)
|
|
+ VLD $f24, 2*VEC_LEN*SIZE(X)
|
|
+ VLD $f25, 3*VEC_LEN*SIZE(X)
|
|
+
|
|
+ /*vfabs*/
|
|
+ vcpys $f31, $f22, $f10
|
|
+ subl I, 1, I
|
|
+ vcpys $f31, $f23, $f11
|
|
+ addl X, 16*SIZE, X
|
|
+
|
|
+ vcpys $f31, $f24, $f12
|
|
+ nop
|
|
+ vcpys $f31, $f25, $f13
|
|
+ ble I, $MainLoopEnd
|
|
+ .align 4
|
|
+
|
|
+$MainLoop:
|
|
+ vextf $f10, 1, $f4
|
|
+ VLD $f22, 0*VEC_LEN*SIZE(X)
|
|
+ vextf $f10, 3, $f5
|
|
+ VLD $f23, 1*VEC_LEN*SIZE(X)
|
|
+
|
|
+ vextf $f11, 0, $f6
|
|
+ VLD $f24, 2*VEC_LEN*SIZE(X)
|
|
+ vextf $f11, 2, $f7
|
|
+ VLD $f25, 3*VEC_LEN*SIZE(X)
|
|
+
|
|
+ vextf $f12, 1, $f14
|
|
+ vextf $f12, 3, $f15
|
|
+ vextf $f13, 0, $f16
|
|
+ vextf $f13, 2, $f17
|
|
+
|
|
+ vinsf $f4, $f11, 0, $f11
|
|
+ vinsf $f6, $f10, 1, $f10
|
|
+ vinsf $f14, $f13, 0, $f13
|
|
+ vinsf $f16, $f12, 1, $f12
|
|
+
|
|
+ vinsf $f5, $f11, 2, $f11
|
|
+ vinsf $f7, $f10, 3, $f10
|
|
+ vinsf $f15, $f13, 2, $f13
|
|
+ vinsf $f17, $f12, 3, $f12
|
|
+
|
|
+ VADD $f10, $f11, $f2
|
|
+ addl X, 16*SIZE, X
|
|
+ VADD $f12, $f13, $f3
|
|
+ subl I, 1, I
|
|
+
|
|
+ vcpys $f31, $f22, $f10
|
|
+ vcpys $f31, $f23, $f11
|
|
+ VCMPLT($f0, $f2), $f18
|
|
+ VCMPLT($f1, $f3), $f19
|
|
+
|
|
+ vcpys $f31, $f24, $f12
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ vcpys $f31, $f25, $f13
|
|
+ nop
|
|
+
|
|
+ vfseleq $f18, $f0, $f2, $f0
|
|
+ vfseleq $f19, $f1, $f3, $f1
|
|
+ nop
|
|
+ bgt I, $MainLoop
|
|
+ .align 4
|
|
+
|
|
+$MainLoopEnd:
|
|
+/* split the complex vector into the real vector ($f10,$f12) and the imaginary vector ($f11,$f13) */
|
|
+ vextf $f10, 1, $f4
|
|
+ vextf $f10, 3, $f5
|
|
+ vextf $f11, 0, $f6
|
|
+ vextf $f11, 2, $f7
|
|
+
|
|
+ vextf $f12, 1, $f14
|
|
+ vextf $f12, 3, $f15
|
|
+ vextf $f13, 0, $f16
|
|
+ vextf $f13, 2, $f17
|
|
+
|
|
+ vinsf $f4, $f11, 0, $f11
|
|
+ vinsf $f6, $f10, 1, $f10
|
|
+ vinsf $f14, $f13, 0, $f13
|
|
+ vinsf $f16, $f12, 1, $f12
|
|
+
|
|
+ vinsf $f5, $f11, 2, $f11
|
|
+ vinsf $f7, $f10, 3, $f10
|
|
+ vinsf $f15, $f13, 2, $f13
|
|
+ vinsf $f17, $f12, 3, $f12
|
|
+
|
|
+ VADD $f10, $f11, $f2
|
|
+ VADD $f12, $f13, $f3
|
|
+ VCMPLT($f0, $f2), $f18
|
|
+ VCMPLT($f1, $f3), $f19
|
|
+
|
|
+ vfseleq $f18, $f0, $f2, $f0
|
|
+ vfseleq $f19, $f1, $f3, $f1
|
|
+/*find the max or min between f0 and f1*/
|
|
+ VCMPLT($f0, $f1), $f18
|
|
+ vfseleq $f18, $f0, $f1, $f0
|
|
+
|
|
+
|
|
+ vextf $f0, 1, $f22
|
|
+ vextf $f0, 2, $f23
|
|
+ vextf $f0, 3, $f24
|
|
+ CMPLT($f0, $f22), $f16
|
|
+
|
|
+ CMPLT($f23, $f24), $f17
|
|
+ fseleq $f16, $f0, $f22, $f0
|
|
+ fseleq $f17, $f23, $f24, $f23
|
|
+ CMPLT($f0, $f23), $f18
|
|
+
|
|
+ fseleq $f18, $f0, $f23, $f0
|
|
+ nop
|
|
+ .align 4
|
|
+$Remain:
|
|
+ ble $3, $Continuous_FindIndex
|
|
+ .align 4
|
|
+$RemainLoop:
|
|
+ LD $f20, 0 * SIZE(X)
|
|
+ LD $f21, 1 * SIZE(X)
|
|
+ unop
|
|
+ addl X, 2*SIZE, X
|
|
+
|
|
+ fabs $f20, $f29
|
|
+ fabs $f21, $f30
|
|
+ ADD $f29, $f30, $f29
|
|
+
|
|
+ CMPLT($f0, $f29), $f16
|
|
+ fselne $f16,$f29,$f0, $f0
|
|
+
|
|
+ subl $3, 1, $3
|
|
+ bgt $3, $RemainLoop
|
|
+ .align 4
|
|
+
|
|
+ /*find index*/
|
|
+$Continuous_FindIndex:
|
|
+
|
|
+ jmp $L20
|
|
+
|
|
+$Sub:
|
|
+ LD $f20, 0 * SIZE(X)
|
|
+ LD $f21, 1 * SIZE(X)
|
|
+ sra N, 2, $1
|
|
+ addl INCX, INCX, INCX
|
|
+
|
|
+ fabs $f20, $f20
|
|
+ fabs $f21, $f21
|
|
+ ADD $f20, $f21, $f0
|
|
+ ble $1, $L15
|
|
+ .align 4
|
|
+
|
|
+ ldi $1, -1($1)
|
|
+ unop
|
|
+ addl X, INCX, X
|
|
+ unop
|
|
+
|
|
+ LD $f22, 0 * SIZE(X)
|
|
+ fmov $f0, $f1
|
|
+ LD $f23, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD $f24, 0 * SIZE(X)
|
|
+ fmov $f0, $f2
|
|
+ LD $f25, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD $f26, 0 * SIZE(X)
|
|
+ fmov $f0, $f3
|
|
+ LD $f27, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fabs $f20, $f8
|
|
+ fabs $f21, $f9
|
|
+ fabs $f22, $f10
|
|
+ fabs $f23, $f11
|
|
+
|
|
+ fabs $f24, $f12
|
|
+ fabs $f25, $f13
|
|
+ fabs $f26, $f14
|
|
+ fabs $f27, $f15
|
|
+
|
|
+ ble $1, $L14
|
|
+ .align 4
|
|
+
|
|
+ LD $f20, 0 * SIZE(X)
|
|
+ LD $f21, 1 * SIZE(X)
|
|
+ ldi $1, -1($1)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD $f22, 0 * SIZE(X)
|
|
+ LD $f23, 1 * SIZE(X)
|
|
+ unop
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD $f24, 0 * SIZE(X)
|
|
+ LD $f25, 1 * SIZE(X)
|
|
+ unop
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD $f26, 0 * SIZE(X)
|
|
+ LD $f27, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ ble $1, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ ADD $f8, $f9, $f16
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ fabs $f20, $f8
|
|
+ fillcs 64 * SIZE(X)
|
|
+
|
|
+ ADD $f10, $f11, $f17
|
|
+ unop
|
|
+ fabs $f21, $f9
|
|
+ LD $f20, 0 * SIZE(X)
|
|
+
|
|
+ ADD $f12, $f13, $f18
|
|
+ LD $f21, 1 * SIZE(X)
|
|
+ fabs $f22, $f10
|
|
+ addl X, INCX, X
|
|
+
|
|
+ ADD $f14, $f15, $f19
|
|
+ LD $f22, 0 * SIZE(X)
|
|
+ fabs $f23, $f11
|
|
+ unop
|
|
+
|
|
+ CMPLT($f0, $f16), $f4
|
|
+ LD $f23, 1 * SIZE(X)
|
|
+ fabs $f24, $f12
|
|
+ addl X, INCX, X
|
|
+
|
|
+ CMPLT($f1, $f17), $f5
|
|
+ LD $f24, 0 * SIZE(X)
|
|
+ fabs $f25, $f13
|
|
+ unop
|
|
+
|
|
+ CMPLT($f2, $f18), $f6
|
|
+ LD $f25, 1 * SIZE(X)
|
|
+ fabs $f26, $f14
|
|
+ addl X, INCX, X
|
|
+
|
|
+ CMPLT($f3, $f19), $f7
|
|
+ LD $f26, 0 * SIZE(X)
|
|
+ fabs $f27, $f15
|
|
+ unop
|
|
+
|
|
+ fselne $f4,$f16,$f0, $f0
|
|
+ LD $f27, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ ldi $1, -1($1) # i --
|
|
+
|
|
+ fselne $f5,$f17,$f1, $f1
|
|
+ fselne $f6,$f18,$f2, $f2
|
|
+ fselne $f7,$f19,$f3, $f3
|
|
+ bgt $1,$L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ ADD $f8, $f9, $f16
|
|
+ fabs $f20, $f8
|
|
+
|
|
+ ADD $f10, $f11, $f17
|
|
+ fabs $f21, $f9
|
|
+
|
|
+ ADD $f12, $f13, $f18
|
|
+ fabs $f22, $f10
|
|
+
|
|
+ ADD $f14, $f15, $f19
|
|
+ fabs $f23, $f11
|
|
+
|
|
+ CMPLT($f0, $f16), $f4
|
|
+ fabs $f24, $f12
|
|
+
|
|
+ CMPLT($f1, $f17), $f5
|
|
+ fabs $f25, $f13
|
|
+
|
|
+ CMPLT($f2, $f18), $f6
|
|
+ fabs $f26, $f14
|
|
+ CMPLT($f3, $f19), $f7
|
|
+ fabs $f27, $f15
|
|
+
|
|
+ fselne $f4,$f16,$f0, $f0
|
|
+ fselne $f5,$f17,$f1, $f1
|
|
+ fselne $f6,$f18,$f2, $f2
|
|
+ fselne $f7,$f19,$f3, $f3
|
|
+ .align 4
|
|
+
|
|
+$L14:
|
|
+ ADD $f8, $f9, $f16
|
|
+ ADD $f10, $f11, $f17
|
|
+ ADD $f12, $f13, $f18
|
|
+ ADD $f14, $f15, $f19
|
|
+
|
|
+ CMPLT($f0, $f16), $f4
|
|
+ CMPLT($f1, $f17), $f5
|
|
+ CMPLT($f2, $f18), $f6
|
|
+ CMPLT($f3, $f19), $f7
|
|
+
|
|
+ fselne $f4,$f16,$f0, $f0
|
|
+ fselne $f5,$f17,$f1, $f1
|
|
+ fselne $f6,$f18,$f2, $f2
|
|
+ fselne $f7,$f19,$f3, $f3
|
|
+
|
|
+ CMPLT($f0, $f1), $f16
|
|
+ CMPLT($f2, $f3), $f17
|
|
+
|
|
+ fselne $f16,$f1,$f0, $f0
|
|
+ fselne $f17,$f3,$f2, $f2
|
|
+
|
|
+ CMPLT($f0, $f2), $f16
|
|
+ fselne $f16,$f2,$f0, $f0
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and N, 3, $1
|
|
+ unop
|
|
+ unop
|
|
+ ble $1, $L20
|
|
+ .align 4
|
|
+
|
|
+$L16:
|
|
+ LD $f20, 0 * SIZE(X)
|
|
+ LD $f21, 1 * SIZE(X)
|
|
+ unop
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fabs $f20, $f29
|
|
+ fabs $f21, $f30
|
|
+ ADD $f29, $f30, $f29
|
|
+
|
|
+ CMPLT($f0, $f29), $f16
|
|
+ fselne $f16,$f29,$f0, $f0
|
|
+
|
|
+ ldi $1, -1($1) # i --
|
|
+ bgt $1, $L16
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ sra N, 2, $1
|
|
+ ble $1, $L40
|
|
+ .align 4
|
|
+
|
|
+ LD $f10, 0 * SIZE(XX)
|
|
+ LD $f11, 1 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ LD $f12, 0 * SIZE(XX)
|
|
+ LD $f13, 1 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ LD $f14, 0 * SIZE(XX)
|
|
+ LD $f15, 1 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ LD $f16, 0 * SIZE(XX)
|
|
+ LD $f17, 1 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ fabs $f10, $f18
|
|
+ fabs $f11, $f19
|
|
+ fabs $f12, $f20
|
|
+ fabs $f13, $f21
|
|
+
|
|
+ ldi $1, -1($1)
|
|
+ ble $1, $L23
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ LD $f10, 0 * SIZE(XX)
|
|
+ fabs $f14, $f22
|
|
+ LD $f11, 1 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ LD $f12, 0 * SIZE(XX)
|
|
+ fabs $f15, $f23
|
|
+ LD $f13, 1 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ LD $f14, 0 * SIZE(XX)
|
|
+ fabs $f16, $f24
|
|
+ LD $f15, 1 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ LD $f16, 0 * SIZE(XX)
|
|
+ fabs $f17, $f25
|
|
+ LD $f17, 1 * SIZE(XX)
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ ADD $f18, $f19, $f4
|
|
+ ADD $f20, $f21, $f5
|
|
+ ADD $f22, $f23, $f6
|
|
+ ADD $f24, $f25, $f7
|
|
+
|
|
+ fcmpeq $f0, $f4, $f26
|
|
+ fcmpeq $f0, $f5, $f27
|
|
+ fcmpeq $f0, $f6, $f28
|
|
+ fcmpeq $f0, $f7, $f29
|
|
+
|
|
+ fabs $f10, $f18
|
|
+ ldi $0, 1($0)
|
|
+ ldi $1, -1($1) # i --
|
|
+ fbne $f26, $End
|
|
+
|
|
+ fabs $f11, $f19
|
|
+ ldi $0, 1($0)
|
|
+ unop
|
|
+ fbne $f27, $End
|
|
+
|
|
+ fabs $f12, $f20
|
|
+ ldi $0, 1($0)
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ fbne $f28, $End
|
|
+
|
|
+ fabs $f13, $f21
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f29, $End
|
|
+ bgt $1, $L22
|
|
+ .align 4
|
|
+
|
|
+$L23:
|
|
+ fabs $f14, $f22
|
|
+ fabs $f15, $f23
|
|
+ fabs $f16, $f24
|
|
+ fabs $f17, $f25
|
|
+
|
|
+ ADD $f18, $f19, $f4
|
|
+ ADD $f20, $f21, $f5
|
|
+ ADD $f22, $f23, $f6
|
|
+ ADD $f24, $f25, $f7
|
|
+
|
|
+ fcmpeq $f0, $f4, $f26
|
|
+ fcmpeq $f0, $f5, $f27
|
|
+ fcmpeq $f0, $f6, $f28
|
|
+ fcmpeq $f0, $f7, $f29
|
|
+
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f26, $End
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f27, $End
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f28, $End
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f29, $End
|
|
+ .align 4
|
|
+
|
|
+$L40:
|
|
+ LD $f10, 0 * SIZE(XX)
|
|
+ LD $f11, 1 * SIZE(XX)
|
|
+
|
|
+ addl XX, INCX, XX
|
|
+
|
|
+ fabs $f10, $f18
|
|
+ fabs $f11, $f19
|
|
+
|
|
+ ADD $f18, $f19, $f18
|
|
+ fcmpeq $f0, $f18, $f2
|
|
+
|
|
+ ldi $0, 1($0)
|
|
+ fbne $f2, $End
|
|
+ br $31, $L40
|
|
+ .align 4
|
|
+
|
|
+$End:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 8($sp)
|
|
+ fldd $f4, 16($sp)
|
|
+ fldd $f5, 24($sp)
|
|
+
|
|
+ fldd $f6, 32($sp)
|
|
+ fldd $f7, 40($sp)
|
|
+ fldd $f8, 48($sp)
|
|
+ fldd $f9, 56($sp)
|
|
+ ldi $sp, STACKSIZE($sp)
|
|
+ ret
|
|
+
|
|
+ EPILOGUE
diff --git a/kernel/sw_64/lsame.S b/kernel/sw_64/lsame.S
new file mode 100644
index 0000000..c2c0863
--- /dev/null
+++ b/kernel/sw_64/lsame.S
@@ -0,0 +1,77 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#include "version.h"
|
|
+
|
|
+ .set noat
|
|
+ .set noreorder
|
|
+.text
|
|
+ .align 5
|
|
+ .globl lsame_
|
|
+ .ent lsame_
|
|
+lsame_:
|
|
+ .frame $sp,0,$26,0
|
|
+#ifdef PROFILE
|
|
+ ldgp $gp, 0($27)
|
|
+ lda $28, _mcount
|
|
+ jsr $28, ($28), _mcount
|
|
+ .prologue 1
|
|
+#else
|
|
+ .prologue 0
|
|
+#endif
|
|
+
|
|
+ ldbu $5, 0($16)
|
|
+ ldbu $6, 0($17)
|
|
+// extb $2, $5
|
|
+// extbl $3, $6
|
|
+
|
|
+ subl $5, 96, $1
|
|
+ subl $6, 96, $2
|
|
+ subl $5, 32, $3
|
|
+ subl $6, 32, $4
|
|
+
|
|
+
|
|
+ selgt $1, $3, $5, $5
|
|
+ selgt $2, $4, $6, $6
|
|
+ cmpeq $5, $6, $0
|
|
+ .align 4
|
|
+
|
|
+$End:
|
|
+ ret
|
|
+ .end lsame_
|
|
+ .ident VERSION
diff --git a/kernel/sw_64/max.S b/kernel/sw_64/max.S
new file mode 100644
index 0000000..07925d1
--- /dev/null
+++ b/kernel/sw_64/max.S
@@ -0,0 +1,227 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+
|
|
+#ifndef USE_MIN
|
|
+#define CMPLT(a, b) fcmplt a, b
|
|
+#else
|
|
+#define CMPLT(a, b) fcmplt b, a
|
|
+#endif
|
|
+
|
|
+#define STACKSIZE 8 * 8
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+ .frame $sp, STACKSIZE, $26, 0
|
|
+
|
|
+#ifdef F_INTERFACE
|
|
+ ldl N, 0(N) # n
|
|
+ ldl INCX, 0(INCX) # incx
|
|
+#endif
|
|
+ ldi $sp, -STACKSIZE($sp)
|
|
+ nop
|
|
+ .align 4
|
|
+
|
|
+ cmplt $31, N, $2
|
|
+ cmplt $31, INCX, $3
|
|
+ SXADDQ INCX, $31, INCX
|
|
+ and $2, $3, $0
|
|
+
|
|
+ sra N, 3, $1
|
|
+ fclr $f0
|
|
+ unop
|
|
+ beq $0, $End # if (n <= 0) or (incx <= 0) return
|
|
+ .align 4
|
|
+
|
|
+ LD $f0, 0 * SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+ ble $1, $L15
|
|
+ .align 4
|
|
+
|
|
+ fmov $f0, $f1
|
|
+ addl X, INCX, X
|
|
+ fmov $f0, $f10
|
|
+ ldi $1, -1($1)
|
|
+
|
|
+ LD $f21, 0 * SIZE(X)
|
|
+ fmov $f0, $f11
|
|
+ addl X, INCX, X
|
|
+ fmov $f0, $f12
|
|
+
|
|
+ LD $f22, 0 * SIZE(X)
|
|
+ fmov $f0, $f13
|
|
+ addl X, INCX, X
|
|
+ fmov $f0, $f14
|
|
+
|
|
+ LD $f23, 0 * SIZE(X)
|
|
+ fmov $f0, $f15
|
|
+ addl X, INCX, X
|
|
+ fmov $f0, $f20
|
|
+
|
|
+ LD $f24, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD $f25, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD $f26, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD $f27, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ CMPLT($f0, $f20), $f16
|
|
+ CMPLT($f1, $f21), $f17
|
|
+ CMPLT($f10, $f22), $f18
|
|
+ CMPLT($f11, $f23), $f19
|
|
+
|
|
+ ble $1, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ fselne $f16, $f20, $f0, $f0
|
|
+ LD $f20, 0 * SIZE(X)
|
|
+ CMPLT($f12, $f24), $f16
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fselne $f17, $f21, $f1, $f1
|
|
+ LD $f21, 0 * SIZE(X)
|
|
+ CMPLT($f13, $f25), $f17
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fselne $f18, $f22, $f10, $f10
|
|
+ LD $f22, 0 * SIZE(X)
|
|
+ CMPLT($f14, $f26), $f18
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fselne $f19, $f23, $f11, $f11
|
|
+ LD $f23, 0 * SIZE(X)
|
|
+ CMPLT($f15, $f27), $f19
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fselne $f16, $f24, $f12, $f12
|
|
+ LD $f24, 0 * SIZE(X)
|
|
+ CMPLT($f0, $f20), $f16
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fselne $f17, $f25, $f13, $f13
|
|
+ LD $f25, 0 * SIZE(X)
|
|
+ CMPLT($f1, $f21), $f17
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fselne $f18, $f26, $f14, $f14
|
|
+ LD $f26, 0 * SIZE(X)
|
|
+ CMPLT($f10, $f22), $f18
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fselne $f19, $f27, $f15, $f15
|
|
+ LD $f27, 0 * SIZE(X)
|
|
+ CMPLT($f11, $f23), $f19
|
|
+ ldi $1, -1($1) # i --
|
|
+
|
|
+ addl X, INCX, X
|
|
+ unop
|
|
+ unop
|
|
+ bgt $1,$L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ fselne $f16, $f20, $f0, $f0
|
|
+ CMPLT($f12, $f24), $f16
|
|
+
|
|
+ fselne $f17, $f21, $f1, $f1
|
|
+ CMPLT($f13, $f25), $f17
|
|
+
|
|
+ fselne $f18, $f22, $f10, $f10
|
|
+ CMPLT($f14, $f26), $f18
|
|
+
|
|
+ fselne $f19, $f23, $f11, $f11
|
|
+ CMPLT($f15, $f27), $f19
|
|
+
|
|
+ fselne $f16, $f24, $f12, $f12
|
|
+ CMPLT($f0, $f1), $f16
|
|
+ fselne $f17, $f25, $f13, $f13
|
|
+ CMPLT($f10, $f11), $f17
|
|
+
|
|
+ fselne $f18, $f26, $f14, $f14
|
|
+ CMPLT($f12, $f13), $f18
|
|
+ fselne $f19, $f27, $f15, $f15
|
|
+ CMPLT($f14, $f15), $f19
|
|
+
|
|
+ fselne $f16, $f1, $f0, $f0
|
|
+ fselne $f17, $f11, $f10, $f10
|
|
+ fselne $f18, $f13, $f12, $f12
|
|
+ fselne $f19, $f15, $f14, $f14
|
|
+
|
|
+ CMPLT($f0, $f10), $f16
|
|
+ CMPLT($f12, $f14), $f17
|
|
+
|
|
+ fselne $f16, $f10, $f0, $f0
|
|
+ fselne $f17, $f14, $f12, $f12
|
|
+
|
|
+ CMPLT($f0, $f12), $f16
|
|
+ fselne $f16, $f12, $f0, $f0
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and N, 7, $1
|
|
+ unop
|
|
+ unop
|
|
+ ble $1, $End
|
|
+ .align 4
|
|
+
|
|
+$L16:
|
|
+ LD $f20, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ CMPLT($f0, $f20), $f16
|
|
+ fselne $f16, $f20, $f0, $f0
|
|
+ ldi $1, -1($1) # i --
|
|
+ bgt $1, $L16
|
|
+ .align 4
|
|
+
|
|
+$End:
|
|
+ ldi $sp, STACKSIZE($sp)
|
|
+ ret
|
|
+
|
|
+ EPILOGUE
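The scalar kernel above reduces a strided real vector to its maximum (or minimum when USE_MIN is defined), with no absolute value taken. A minimal C sketch of that reduction, with illustrative names only:

    /* Minimal sketch: returns 0.0 for n <= 0 or incx <= 0, matching the
     * kernel, which clears $f0 before its early exit; otherwise the plain
     * maximum of the strided vector. */
    static double max_sketch(long n, const double *x, long incx)
    {
        if (n <= 0 || incx <= 0) return 0.0;
        double m = x[0];
        for (long i = 1; i < n; i++) {
            double v = x[i * incx];
            if (v > m) m = v;      /* the CMPLT + fselne pair in the assembly */
        }
        return m;
    }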
diff --git a/kernel/sw_64/nrm2_simd.S b/kernel/sw_64/nrm2_simd.S
new file mode 100644
index 0000000..0888454
--- /dev/null
+++ b/kernel/sw_64/nrm2_simd.S
@@ -0,0 +1,493 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCHSIZE 80
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+#define XX $19
|
|
+
|
|
+#define I $0
|
|
+
|
|
+#define a0 $f0
|
|
+#define a1 $f1
|
|
+#define a2 $f10
|
|
+#define a3 $f11
|
|
+#define t0 $f12
|
|
+#define t1 $f13
|
|
+#define t2 $f14
|
|
+#define t3 $f15
|
|
+
|
|
+#define x0 $f16
|
|
+#define x1 $f17
|
|
+#define x2 $f18
|
|
+#define x3 $f19
|
|
+#define x4 $f20
|
|
+#define x5 $f21
|
|
+#define x6 $f22
|
|
+#define x7 $f23
|
|
+
|
|
+ PROLOGUE
|
|
+
|
|
+
|
|
+ PROFCODE
|
|
+
|
|
+
|
|
+ fclr a0
|
|
+ SXADDQ INCX, 0, INCX
|
|
+ fclr a1
|
|
+ ble N, $L999
|
|
+
|
|
+ fclr a2
|
|
+ cmpeq INCX, SIZE, $0
|
|
+ fclr a3
|
|
+ beq $0, $L20 #stride access
|
|
+
|
|
+/* test the address of X */
|
|
+ and X, (VEC_LEN*SIZE-1), $3
|
|
+ fclr t0
|
|
+ nop
|
|
+ bne $3, $UnAlign_ACCESS
|
|
+/* Aligned access: use SIMD instructions. */
|
|
+ sra N, 4, I
|
|
+ ble I, $Remain
|
|
+
|
|
+ VLD a0, 0*VEC_LEN*SIZE(X)
|
|
+ vcpys $f31, $f31, t0 #clear s0 vector
|
|
+ VLD a1, 1*VEC_LEN*SIZE(X)
|
|
+ vcpys $f31, $f31, t1
|
|
+
|
|
+ VLD a2, 2*VEC_LEN*SIZE(X)
|
|
+ vcpys $f31, $f31, t2
|
|
+ VLD a3, 3*VEC_LEN*SIZE(X)
|
|
+ vcpys $f31, $f31, t3
|
|
+
|
|
+ addl X, 16 * SIZE, X
|
|
+ subl I, 1, I
|
|
+ nop
|
|
+ ble I, $MainLoopEnd
|
|
+$MainLoop:
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ VMAD a0, a0, t0, t0
|
|
+ subl I, 1, I
|
|
+ VMAD a1, a1, t1, t1
|
|
+
|
|
+ addl X, 16 * SIZE, X
|
|
+ VMAD a2, a2, t2, t2
|
|
+ nop
|
|
+ VMAD a3, a3, t3, t3
|
|
+
|
|
+ VLD a0, -4*VEC_LEN*SIZE(X)
|
|
+ VLD a1, -3*VEC_LEN*SIZE(X)
|
|
+ VLD a2, -2*VEC_LEN*SIZE(X)
|
|
+ VLD a3, -1*VEC_LEN*SIZE(X)
|
|
+
|
|
+ bgt I, $MainLoop
|
|
+ .align 4
|
|
+$MainLoopEnd:
|
|
+ VMAD a0, a0, t0, t0
|
|
+ VMAD a1, a1, t1, t1
|
|
+ VMAD a2, a2, t2, t2
|
|
+ VMAD a3, a3, t3, t3
|
|
+
|
|
+ VADD t0, t1, a0
|
|
+ VADD t2, t3, a1
|
|
+ nop
|
|
+ VADD a0, a1, t0
|
|
+
|
|
+ vextf t0, 1, t1
|
|
+ vextf t0, 2, t2
|
|
+ vextf t0, 3, t3
|
|
+ nop
|
|
+
|
|
+ ADD t0, t1, a2
|
|
+ ADD t2, t3, a3
|
|
+ nop
|
|
+ ADD a2, a3, t0
|
|
+
|
|
+ .align 4
|
|
+$Remain:
|
|
+ and N, 15, I
|
|
+ ble I, $End
|
|
+ .align 4
|
|
+$RemainLoop:
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ addl X, SIZE, X
|
|
+ MAD a0, a0, t0, t0
|
|
+ subl I, 1, I
|
|
+
|
|
+ bgt I, $RemainLoop
|
|
+ .align 4
|
|
+$End:
|
|
+ SQRT t0, a0
|
|
+ ret
|
|
+ .align 4
|
|
+
|
|
+/* Don't use SIMD */
|
|
+
|
|
+$UnAlign_ACCESS:
|
|
+
|
|
+ fclr t0
|
|
+ sra N, 4, I
|
|
+ fclr t1
|
|
+ ble I, $L15
|
|
+
|
|
+ fclr t2
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ fclr t3
|
|
+ LD x1, 1 * SIZE(X)
|
|
+
|
|
+ LD x2, 2 * SIZE(X)
|
|
+ LD x3, 3 * SIZE(X)
|
|
+ LD x4, 4 * SIZE(X)
|
|
+ LD x5, 5 * SIZE(X)
|
|
+ LD x6, 6 * SIZE(X)
|
|
+ LD x7, 7 * SIZE(X)
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L11:
|
|
+ ADD a0, t0, a0
|
|
+ fillcs (PREFETCHSIZE) * SIZE(X)
|
|
+ MUL x0, x0, t0
|
|
+ LD x0, 8 * SIZE(X)
|
|
+
|
|
+ ADD a1, t1, a1
|
|
+ mov X, XX
|
|
+ MUL x1, x1, t1
|
|
+ LD x1, 9 * SIZE(X)
|
|
+
|
|
+ ADD a2, t2, a2
|
|
+ unop
|
|
+ MUL x2, x2, t2
|
|
+ LD x2, 10 * SIZE(X)
|
|
+
|
|
+ ADD a3, t3, a3
|
|
+ unop
|
|
+ MUL x3, x3, t3
|
|
+ LD x3, 11 * SIZE(X)
|
|
+
|
|
+ ADD a0, t0, a0
|
|
+ unop
|
|
+ MUL x4, x4, t0
|
|
+ LD x4, 12 * SIZE(X)
|
|
+
|
|
+ ADD a1, t1, a1
|
|
+ unop
|
|
+ MUL x5, x5, t1
|
|
+ LD x5, 13 * SIZE(X)
|
|
+
|
|
+ ADD a2, t2, a2
|
|
+ unop
|
|
+ MUL x6, x6, t2
|
|
+ LD x6, 14 * SIZE(X)
|
|
+
|
|
+ ADD a3, t3, a3
|
|
+ unop
|
|
+ MUL x7, x7, t3
|
|
+ LD x7, 15 * SIZE(X)
|
|
+
|
|
+ ADD a0, t0, a0
|
|
+ unop
|
|
+ MUL x0, x0, t0
|
|
+ LD x0, 16 * SIZE(X)
|
|
+
|
|
+ ADD a1, t1, a1
|
|
+ ldi X, 16 * SIZE(X)
|
|
+ MUL x1, x1, t1
|
|
+ LD x1, 17 * SIZE(XX)
|
|
+
|
|
+ ADD a2, t2, a2
|
|
+ unop
|
|
+ MUL x2, x2, t2
|
|
+ LD x2, 18 * SIZE(XX)
|
|
+
|
|
+ ADD a3, t3, a3
|
|
+ unop
|
|
+ MUL x3, x3, t3
|
|
+ LD x3, 19 * SIZE(XX)
|
|
+
|
|
+ ADD a0, t0, a0
|
|
+ unop
|
|
+ MUL x4, x4, t0
|
|
+ LD x4, 20 * SIZE(XX)
|
|
+
|
|
+ ADD a1, t1, a1
|
|
+ ldi I, -1(I)
|
|
+ MUL x5, x5, t1
|
|
+ LD x5, 21 * SIZE(XX)
|
|
+
|
|
+ ADD a2, t2, a2
|
|
+ unop
|
|
+ MUL x6, x6, t2
|
|
+ LD x6, 22 * SIZE(XX)
|
|
+
|
|
+ ADD a3, t3, a3
|
|
+ MUL x7, x7, t3
|
|
+ LD x7, 23 * SIZE(XX)
|
|
+ bgt I, $L11
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ ADD a0, t0, a0
|
|
+ mov X, XX
|
|
+ MUL x0, x0, t0
|
|
+ LD x0, 8 * SIZE(X)
|
|
+
|
|
+ ADD a1, t1, a1
|
|
+ unop
|
|
+ MUL x1, x1, t1
|
|
+ LD x1, 9 * SIZE(X)
|
|
+
|
|
+ ADD a2, t2, a2
|
|
+ unop
|
|
+ MUL x2, x2, t2
|
|
+ LD x2, 10 * SIZE(X)
|
|
+
|
|
+ ADD a3, t3, a3
|
|
+ unop
|
|
+ MUL x3, x3, t3
|
|
+ LD x3, 11 * SIZE(X)
|
|
+
|
|
+ ADD a0, t0, a0
|
|
+ unop
|
|
+ MUL x4, x4, t0
|
|
+ LD x4, 12 * SIZE(XX)
|
|
+
|
|
+ ADD a1, t1, a1
|
|
+ unop
|
|
+ MUL x5, x5, t1
|
|
+ LD x5, 13 * SIZE(XX)
|
|
+
|
|
+ ADD a2, t2, a2
|
|
+ unop
|
|
+ MUL x6, x6, t2
|
|
+ LD x6, 14 * SIZE(XX)
|
|
+
|
|
+ ADD a3, t3, a3
|
|
+ ldi X, 16 * SIZE(X)
|
|
+ MUL x7, x7, t3
|
|
+ LD x7, 15 * SIZE(XX)
|
|
+
|
|
+ ADD a0, t0, a0
|
|
+ MUL x0, x0, t0
|
|
+ ADD a1, t1, a1
|
|
+ MUL x1, x1, t1
|
|
+
|
|
+ ADD a2, t2, a2
|
|
+ MUL x2, x2, t2
|
|
+ ADD a3, t3, a3
|
|
+ MUL x3, x3, t3
|
|
+
|
|
+ ADD a0, t0, a0
|
|
+ MUL x4, x4, t0
|
|
+ ADD a1, t1, a1
|
|
+ MUL x5, x5, t1
|
|
+
|
|
+ ADD a2, t2, a2
|
|
+ MUL x6, x6, t2
|
|
+ ADD a3, t3, a3
|
|
+ MUL x7, x7, t3
|
|
+
|
|
+ ADD a1, t1, a1
|
|
+ ADD a2, t2, a2
|
|
+ ADD a3, t3, a3
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and N, 15, I
|
|
+ ble I, $L998
|
|
+ .align 4
|
|
+
|
|
+$L16:
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ ldi X, 1 * SIZE(X)
|
|
+
|
|
+ ADD a0, t0, a0
|
|
+ MUL x0, x0, t0
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L16
|
|
+ bsr $31, $L998
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ fclr t0
|
|
+ sra N, 3, I
|
|
+ fclr t1
|
|
+ ble I, $L25
|
|
+
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD x1, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD x2, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD x3, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD x4, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD x5, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD x6, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L22
|
|
+ .align 4
|
|
+
|
|
+$L21:
|
|
+ ADD a0, t0, a0
|
|
+ LD x7, 0 * SIZE(X)
|
|
+ MUL x0, x0, t0
|
|
+ addl X, INCX, X
|
|
+
|
|
+ ADD a1, t1, a1
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ MUL x1, x1, t1
|
|
+ addl X, INCX, X
|
|
+
|
|
+ ADD a2, t2, a2
|
|
+ LD x1, 0 * SIZE(X)
|
|
+ MUL x2, x2, t2
|
|
+ addl X, INCX, X
|
|
+
|
|
+ ADD a3, t3, a3
|
|
+ LD x2, 0 * SIZE(X)
|
|
+ MUL x3, x3, t3
|
|
+ addl X, INCX, X
|
|
+
|
|
+ ADD a0, t0, a0
|
|
+ LD x3, 0 * SIZE(X)
|
|
+ MUL x4, x4, t0
|
|
+ addl X, INCX, X
|
|
+
|
|
+ ADD a1, t1, a1
|
|
+ LD x4, 0 * SIZE(X)
|
|
+ MUL x5, x5, t1
|
|
+ addl X, INCX, X
|
|
+
|
|
+ ADD a2, t2, a2
|
|
+ LD x5, 0 * SIZE(X)
|
|
+ MUL x6, x6, t2
|
|
+ addl X, INCX, X
|
|
+
|
|
+ ADD a3, t3, a3
|
|
+ LD x6, 0 * SIZE(X)
|
|
+ MUL x7, x7, t3
|
|
+ addl X, INCX, X
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L21
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ ADD a0, t0, a0
|
|
+ LD x7, 0 * SIZE(X)
|
|
+ MUL x0, x0, t0
|
|
+ addl X, INCX, X
|
|
+
|
|
+ ADD a1, t1, a1
|
|
+ unop
|
|
+ MUL x1, x1, t1
|
|
+ unop
|
|
+
|
|
+ ADD a2, t2, a2
|
|
+ MUL x2, x2, t2
|
|
+ ADD a3, t3, a3
|
|
+ MUL x3, x3, t3
|
|
+
|
|
+ ADD a0, t0, a0
|
|
+ MUL x4, x4, t0
|
|
+ ADD a1, t1, a1
|
|
+ MUL x5, x5, t1
|
|
+
|
|
+ ADD a2, t2, a2
|
|
+ MUL x6, x6, t2
|
|
+ ADD a3, t3, a3
|
|
+ MUL x7, x7, t3
|
|
+
|
|
+ ADD a1, t1, a1
|
|
+ ADD a2, t2, a2
|
|
+ ADD a3, t3, a3
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ and N, 7, I
|
|
+ ble I, $L998
|
|
+ .align 4
|
|
+
|
|
+$L26:
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ ADD a0, t0, a0
|
|
+ MUL x0, x0, t0
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L26
|
|
+ .align 4
|
|
+
|
|
+
|
|
+$L998:
|
|
+ ADD a0, t0, a0
|
|
+
|
|
+ ADD a0, a1, a0
|
|
+ ADD a2, a3, a2
|
|
+
|
|
+
|
|
+ ADD a0, a2, a0
|
|
+ SQRT a0, a0
|
|
+
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ ret
|
|
+ EPILOGUE
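What the nrm2 kernel above accumulates, as a minimal C sketch (illustrative name): a plain sum of squares followed by one square root, mirroring the MUL/VMAD accumulation and the final SQRT. Note that this kernel performs no intermediate rescaling of the partial sums.

    #include <math.h>

    /* Minimal sketch of the accumulation performed by the kernel. */
    static double nrm2_sketch(long n, const double *x, long incx)
    {
        double acc = 0.0;
        for (long i = 0; i < n; i++) {
            double v = x[i * incx];
            acc += v * v;
        }
        return sqrt(acc);   /* 0.0 for n <= 0, as in the kernel's early exit */
    }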
diff --git a/kernel/sw_64/rot.S b/kernel/sw_64/rot.S
new file mode 100644
index 0000000..3c8624e
--- /dev/null
+++ b/kernel/sw_64/rot.S
@@ -0,0 +1,680 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+#define Y $19
|
|
+#define INCY $20
|
|
+#define I $21
|
|
+#define XX $23
|
|
+#define YY $24
|
|
+
|
|
+#define C $f10
|
|
+#define S $f11
|
|
+
|
|
+#define PREFETCH_SIZE 80
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+ .frame $sp, 16, $26, 0
|
|
+
|
|
+#ifndef PROFILE
|
|
+ .prologue 0
|
|
+#else
|
|
+ .prologue 1
|
|
+#endif
|
|
+ ldi $sp, -16($sp)
|
|
+ fstd $f20, 8($sp)
|
|
+
|
|
+ fmov $f21, C
|
|
+ LD S, 16($sp)
|
|
+ cmpeq INCX, 1, $23
|
|
+ cmpeq INCY, 1, $24
|
|
+ ble N, $L998
|
|
+
|
|
+
|
|
+ and $23, $24, $23
|
|
+ beq $23, $L50
|
|
+
|
|
+ sra N, 3, I
|
|
+ ble I, $L15
|
|
+
|
|
+ LD $f12, 0*SIZE(X)
|
|
+ LD $f13, 0*SIZE(Y)
|
|
+ LD $f14, 1*SIZE(X)
|
|
+ LD $f15, 1*SIZE(Y)
|
|
+
|
|
+ LD $f16, 2*SIZE(X)
|
|
+ LD $f17, 2*SIZE(Y)
|
|
+ LD $f18, 3*SIZE(X)
|
|
+ LD $f19, 3*SIZE(Y)
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ unop
|
|
+ MUL S, $f13, $f22
|
|
+ MUL C, $f13, $f23
|
|
+
|
|
+ LD $f13, 4*SIZE(Y)
|
|
+ MUL S, $f12, $f24
|
|
+ LD $f12, 4*SIZE(X)
|
|
+ MUL C, $f14, $f25
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ MUL S, $f15, $f26
|
|
+ ADD $f21, $f22, $f20
|
|
+ fmov $f20,$f22
|
|
+ MUL C, $f15, $f27
|
|
+
|
|
+ LD $f15, 5*SIZE(Y)
|
|
+ MUL S, $f14, $f28
|
|
+ SUB $f23, $f24, $f20
|
|
+ fmov $f20,$f24
|
|
+ ble I, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ MUL C, $f16, $f21
|
|
+ flds $f31, (PREFETCH_SIZE) * SIZE(X)
|
|
+ unop
|
|
+ LD $f14, 5*SIZE(X)
|
|
+
|
|
+ ST $f22, 0*SIZE(X)
|
|
+ MUL S, $f17, $f22
|
|
+ unop
|
|
+ ADD $f25, $f26, $f20
|
|
+ fmov $f20,$f26
|
|
+
|
|
+ MUL C, $f17, $f23
|
|
+ flds $f31, (PREFETCH_SIZE) * SIZE(Y)
|
|
+ unop
|
|
+ LD $f17, 6*SIZE(Y)
|
|
+
|
|
+ ST $f24, 0*SIZE(Y)
|
|
+ MUL S, $f16, $f24
|
|
+ unop
|
|
+ SUB $f27, $f28, $f20
|
|
+ fmov $f20,$f28
|
|
+
|
|
+ MUL C, $f18, $f25
|
|
+ LD $f16, 6*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f26, 1*SIZE(X)
|
|
+ MUL S, $f19, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, $f20
|
|
+ fmov $f20,$f22
|
|
+
|
|
+ MUL C, $f19, $f27
|
|
+ unop
|
|
+ unop
|
|
+ LD $f19, 7*SIZE(Y)
|
|
+
|
|
+ ST $f28, 1*SIZE(Y)
|
|
+ MUL S, $f18, $f28
|
|
+ unop
|
|
+ SUB $f23, $f24, $f20
|
|
+ fmov $f20,$f24
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ LD $f18, 7*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f22, 2*SIZE(X)
|
|
+ unop
|
|
+ MUL S, $f13, $f22
|
|
+ ADD $f25, $f26, $f20
|
|
+ fmov $f20,$f26
|
|
+
|
|
+ MUL C, $f13, $f23
|
|
+ LD $f13, 8*SIZE(Y)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f24, 2*SIZE(Y)
|
|
+ MUL S, $f12, $f24
|
|
+ unop
|
|
+ SUB $f27, $f28, $f20
|
|
+ fmov $f20,$f28
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ LD $f12, 8*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f26, 3*SIZE(X)
|
|
+ MUL S, $f15, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, $f20
|
|
+ fmov $f20,$f22
|
|
+
|
|
+ MUL C, $f15, $f27
|
|
+ LD $f15, 9*SIZE(Y)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f28, 3*SIZE(Y)
|
|
+ MUL S, $f14, $f28
|
|
+ unop
|
|
+ SUB $f23, $f24, $f20
|
|
+ fmov $f20,$f24
|
|
+
|
|
+ MUL C, $f16, $f21
|
|
+ LD $f14, 9*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f22, 4*SIZE(X)
|
|
+ MUL S, $f17, $f22
|
|
+ unop
|
|
+ ADD $f25, $f26, $f20
|
|
+ fmov $f20,$f26
|
|
+
|
|
+ MUL C, $f17, $f23
|
|
+ LD $f17, 10*SIZE(Y)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f24, 4*SIZE(Y)
|
|
+ MUL S, $f16, $f24
|
|
+ unop
|
|
+ SUB $f27, $f28, $f20
|
|
+ fmov $f20,$f28
|
|
+
|
|
+ MUL C, $f18, $f25
|
|
+ LD $f16, 10*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f26, 5*SIZE(X)
|
|
+ MUL S, $f19, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, $f20
|
|
+ fmov $f20,$f22
|
|
+
|
|
+ MUL C, $f19, $f27
|
|
+ LD $f19, 11*SIZE(Y)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f28, 5*SIZE(Y)
|
|
+ MUL S, $f18, $f28
|
|
+ ldi I, -1(I)
|
|
+ SUB $f23, $f24, $f20
|
|
+ fmov $f20,$f24
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ LD $f18, 11*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f22, 6*SIZE(X)
|
|
+ MUL S, $f13, $f22
|
|
+ unop
|
|
+ ADD $f25, $f26, $f20
|
|
+ fmov $f20,$f26
|
|
+
|
|
+ MUL C, $f13, $f23
|
|
+ LD $f13, 12*SIZE(Y)
|
|
+ ldi X, 8*SIZE(X)
|
|
+ unop
|
|
+
|
|
+ ST $f24, 6*SIZE(Y)
|
|
+ MUL S, $f12, $f24
|
|
+ unop
|
|
+ SUB $f27, $f28, $f20
|
|
+ fmov $f20,$f28
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ LD $f12, 4*SIZE(X)
|
|
+ ldi Y, 8*SIZE(Y)
|
|
+ unop
|
|
+
|
|
+ ST $f26, -1*SIZE(X)
|
|
+ MUL S, $f15, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, $f20
|
|
+ fmov $f20,$f22
|
|
+
|
|
+ MUL C, $f15, $f27
|
|
+ LD $f15, 5*SIZE(Y)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f28, -1*SIZE(Y)
|
|
+ MUL S, $f14, $f28
|
|
+ SUB $f23, $f24, $f20
|
|
+ fmov $f20,$f24
|
|
+ bgt I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ MUL C, $f16, $f21
|
|
+ LD $f14, 5*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f22, 0*SIZE(X)
|
|
+ MUL S, $f17, $f22
|
|
+ unop
|
|
+ ADD $f25, $f26, $f20
|
|
+ fmov $f20,$f26
|
|
+
|
|
+ MUL C, $f17, $f23
|
|
+ unop
|
|
+ unop
|
|
+ LD $f17, 6*SIZE(Y)
|
|
+
|
|
+ ST $f24, 0*SIZE(Y)
|
|
+ MUL S, $f16, $f24
|
|
+ LD $f16, 6*SIZE(X)
|
|
+ SUB $f27, $f28, $f20
|
|
+ fmov $f20,$f28
|
|
+
|
|
+ MUL C, $f18, $f25
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f26, 1*SIZE(X)
|
|
+ MUL S, $f19, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, $f20
|
|
+ fmov $f20,$f22
|
|
+
|
|
+ MUL C, $f19, $f27
|
|
+ unop
|
|
+ unop
|
|
+ LD $f19, 7*SIZE(Y)
|
|
+
|
|
+ ST $f28, 1*SIZE(Y)
|
|
+ MUL S, $f18, $f28
|
|
+ SUB $f23, $f24, $f18
|
|
+ fmov $f18,$f24
|
|
+ LD $f18, 7*SIZE(X)
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f22, 2*SIZE(X)
|
|
+ unop
|
|
+ MUL S, $f13, $f22
|
|
+ ADD $f25, $f26, $f20
|
|
+ fmov $f20,$f26
|
|
+
|
|
+ MUL C, $f13, $f23
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f24, 2*SIZE(Y)
|
|
+ MUL S, $f12, $f24
|
|
+ unop
|
|
+ SUB $f27, $f28, $f20
|
|
+ fmov $f20,$f28
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f26, 3*SIZE(X)
|
|
+ MUL S, $f15, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, $f20
|
|
+ fmov $f20,$f22
|
|
+
|
|
+ MUL C, $f15, $f27
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f28, 3*SIZE(Y)
|
|
+ MUL S, $f14, $f28
|
|
+ unop
|
|
+ SUB $f23, $f24, $f20
|
|
+ fmov $f20,$f24
|
|
+
|
|
+ MUL C, $f16, $f21
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f22, 4*SIZE(X)
|
|
+ MUL S, $f17, $f22
|
|
+ unop
|
|
+ ADD $f25, $f26, $f20
|
|
+ fmov $f20,$f26
|
|
+
|
|
+ MUL C, $f17, $f23
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f24, 4*SIZE(Y)
|
|
+ MUL S, $f16, $f24
|
|
+ unop
|
|
+ SUB $f27, $f28, $f20
|
|
+ fmov $f20,$f28
|
|
+
|
|
+ MUL C, $f18, $f25
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f26, 5*SIZE(X)
|
|
+ MUL S, $f19, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, $f20
|
|
+ fmov $f20,$f22
|
|
+
|
|
+ MUL C, $f19, $f27
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f28, 5*SIZE(Y)
|
|
+ MUL S, $f18, $f28
|
|
+ unop
|
|
+ SUB $f23, $f24, $f20
|
|
+ fmov $f20,$f24
|
|
+
|
|
+ ST $f22, 6*SIZE(X)
|
|
+ ADD $f25, $f26, $f20
|
|
+ fmov $f20,$f26
|
|
+ ST $f24, 6*SIZE(Y)
|
|
+ SUB $f27, $f28, $f20
|
|
+ fmov $f20,$f28
|
|
+
|
|
+ ST $f26, 7*SIZE(X)
|
|
+ ldi X, 8*SIZE(X)
|
|
+ ST $f28, 7*SIZE(Y)
|
|
+ ldi Y, 8*SIZE(Y)
|
|
+ .align 4
|
|
+
|
|
+
|
|
+$L15:
|
|
+ and N, 7, I
|
|
+ ble I, $L998
|
|
+ .align 4
|
|
+
|
|
+$L16:
|
|
+ LD $f12, 0*SIZE(X)
|
|
+ LD $f13, 0*SIZE(Y)
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ MUL S, $f13, $f22
|
|
+ MUL C, $f13, $f23
|
|
+ MUL S, $f12, $f24
|
|
+
|
|
+ ADD $f21, $f22, $f25
|
|
+ SUB $f23, $f24, $f26
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ST $f25, 0*SIZE(X)
|
|
+ ldi X, 1 * SIZE(X)
|
|
+ ST $f26, 0*SIZE(Y)
|
|
+ ldi Y, 1 * SIZE(Y)
|
|
+
|
|
+ bgt I, $L16
|
|
+ .align 4
|
|
+
|
|
+$L998:
|
|
+ clr $0
|
|
+ fldd $f20, 8($sp)
|
|
+ ldi $sp, 16($sp)
|
|
+ ret
|
|
+ .align 4
|
|
+
|
|
+$L50:
|
|
+ mov X, XX
|
|
+ mov Y, YY
|
|
+
|
|
+ sra N, 3, I
|
|
+ ble I, $L55
|
|
+ .align 4
|
|
+
|
|
+$L51:
|
|
+ LD $f12, 0*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f13, 0*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ LD $f14, 0*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f15, 0*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ LD $f16, 0*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f17, 0*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ LD $f18, 0*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f19, 0*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ MUL S, $f13, $f22
|
|
+ MUL C, $f13, $f23
|
|
+ MUL S, $f12, $f24
|
|
+
|
|
+ ADD $f21, $f22, $f20
|
|
+ fmov $f20,$f22
|
|
+ SUB $f23, $f24, $f20
|
|
+ fmov $f20,$f24
|
|
+
|
|
+ ST $f22, 0*SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST $f24, 0*SIZE(YY)
|
|
+ SXADDQ INCY, YY, YY
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ MUL S, $f15, $f26
|
|
+ MUL C, $f15, $f27
|
|
+ MUL S, $f14, $f28
|
|
+
|
|
+ ADD $f25, $f26, $f20
|
|
+ fmov $f20,$f26
|
|
+ SUB $f27, $f28, $f20
|
|
+ fmov $f20,$f28
|
|
+
|
|
+ ST $f26, 0*SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST $f28, 0*SIZE(YY)
|
|
+ SXADDQ INCY, YY, YY
|
|
+
|
|
+ MUL C, $f16, $f21
|
|
+ MUL S, $f17, $f22
|
|
+ MUL C, $f17, $f23
|
|
+ MUL S, $f16, $f24
|
|
+
|
|
+ ADD $f21, $f22, $f20
|
|
+ fmov $f20,$f22
|
|
+ SUB $f23, $f24, $f20
|
|
+ fmov $f20,$f24
|
|
+
|
|
+ ST $f22, 0*SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST $f24, 0*SIZE(YY)
|
|
+ SXADDQ INCY, YY, YY
|
|
+
|
|
+ MUL C, $f18, $f25
|
|
+ MUL S, $f19, $f26
|
|
+ MUL C, $f19, $f27
|
|
+ MUL S, $f18, $f28
|
|
+
|
|
+ ADD $f25, $f26, $f20
|
|
+ fmov $f20,$f26
|
|
+ SUB $f27, $f28, $f20
|
|
+ fmov $f20,$f28
|
|
+
|
|
+ ST $f26, 0*SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST $f28, 0*SIZE(YY)
|
|
+ SXADDQ INCY, YY, YY
|
|
+
|
|
+
|
|
+ LD $f12, 0*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f13, 0*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ LD $f14, 0*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f15, 0*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ LD $f16, 0*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f17, 0*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ LD $f18, 0*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f19, 0*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ MUL S, $f13, $f22
|
|
+ MUL C, $f13, $f23
|
|
+ MUL S, $f12, $f24
|
|
+
|
|
+ ADD $f21, $f22, $f20
|
|
+ fmov $f20,$f22
|
|
+ SUB $f23, $f24, $f20
|
|
+ fmov $f20,$f24
|
|
+
|
|
+ ST $f22, 0*SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST $f24, 0*SIZE(YY)
|
|
+ SXADDQ INCY, YY, YY
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ MUL S, $f15, $f26
|
|
+ MUL C, $f15, $f27
|
|
+ MUL S, $f14, $f28
|
|
+
|
|
+ ADD $f25, $f26, $f20
|
|
+ fmov $f20,$f26
|
|
+ SUB $f27, $f28, $f20
|
|
+ fmov $f20,$f28
|
|
+
|
|
+ ST $f26, 0*SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST $f28, 0*SIZE(YY)
|
|
+ SXADDQ INCY, YY, YY
|
|
+
|
|
+ MUL C, $f16, $f21
|
|
+ MUL S, $f17, $f22
|
|
+ MUL C, $f17, $f23
|
|
+ MUL S, $f16, $f24
|
|
+
|
|
+ ADD $f21, $f22, $f20
|
|
+ fmov $f20,$f22
|
|
+ SUB $f23, $f24, $f20
|
|
+ fmov $f20,$f24
|
|
+
|
|
+ ST $f22, 0*SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST $f24, 0*SIZE(YY)
|
|
+ SXADDQ INCY, YY, YY
|
|
+
|
|
+ MUL C, $f18, $f25
|
|
+ MUL S, $f19, $f26
|
|
+ MUL C, $f19, $f27
|
|
+ MUL S, $f18, $f28
|
|
+
|
|
+ ADD $f25, $f26, $f20
|
|
+ fmov $f20,$f26
|
|
+ SUB $f27, $f28, $f20
|
|
+ fmov $f20,$f28
|
|
+
|
|
+ ST $f26, 0*SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST $f28, 0*SIZE(YY)
|
|
+ SXADDQ INCY, YY, YY
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L51
|
|
+ .align 4
|
|
+
|
|
+$L55:
|
|
+ and N, 7, I
|
|
+ ble I, $L999
|
|
+ .align 4
|
|
+
|
|
+$L56:
|
|
+ LD $f12, 0*SIZE(X)
|
|
+ LD $f13, 0*SIZE(Y)
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ MUL S, $f13, $f22
|
|
+ MUL C, $f13, $f23
|
|
+ MUL S, $f12, $f24
|
|
+
|
|
+ ADD $f21, $f22, $f25
|
|
+ SUB $f23, $f24, $f26
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ST $f25, 0*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ ST $f26, 0*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ bgt I, $L56
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ fldd $f20, 8($sp)
|
|
+ ldi $sp, 16($sp)
|
|
+
|
|
+ clr $0
|
|
+# fldd $f20, 8($sp)
|
|
+ ret
|
|
+ EPILOGUE
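rot.S applies the same plane (Givens) rotation element by element: each pair (x_i, y_i) is replaced by (c*x_i + s*y_i, c*y_i - s*x_i). A minimal C sketch with illustrative names:

    /* Minimal sketch of the element-wise rotation performed above. */
    static void rot_sketch(long n, double *x, long incx,
                           double *y, long incy, double c, double s)
    {
        for (long i = 0; i < n; i++) {
            double xv = x[i * incx];
            double yv = y[i * incy];
            x[i * incx] = c * xv + s * yv;   /* ADD of the two MULs, stored to X */
            y[i * incy] = c * yv - s * xv;   /* SUB of the two MULs, stored to Y */
        }
    }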
diff --git a/kernel/sw_64/rot.S.bak b/kernel/sw_64/rot.S.bak
new file mode 100644
index 0000000..62e9ff9
--- /dev/null
+++ b/kernel/sw_64/rot.S.bak
@@ -0,0 +1,624 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+#define Y $19
|
|
+#define INCY $20
|
|
+#define I $21
|
|
+#define XX $23
|
|
+#define YY $24
|
|
+
|
|
+#define C $f10
|
|
+#define S $f11
|
|
+
|
|
+#define PREFETCH_SIZE 80
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+ .frame $sp, 0, $26, 0
|
|
+
|
|
+#ifndef PROFILE
|
|
+ .prologue 0
|
|
+#else
|
|
+ .prologue 1
|
|
+#endif
|
|
+
|
|
+ fmov $f21, C
|
|
+ LD S, 0($sp)
|
|
+
|
|
+ cmpeq INCX, 1, $23
|
|
+ cmpeq INCY, 1, $24
|
|
+ ble N, $L998
|
|
+
|
|
+ and $23, $24, $23
|
|
+ beq $23, $L50
|
|
+
|
|
+ sra N, 3, I
|
|
+ ble I, $L15
|
|
+
|
|
+ LD $f12, 0*SIZE(X)
|
|
+ LD $f13, 0*SIZE(Y)
|
|
+ LD $f14, 1*SIZE(X)
|
|
+ LD $f15, 1*SIZE(Y)
|
|
+
|
|
+ LD $f16, 2*SIZE(X)
|
|
+ LD $f17, 2*SIZE(Y)
|
|
+ LD $f18, 3*SIZE(X)
|
|
+ LD $f19, 3*SIZE(Y)
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ unop
|
|
+ MUL S, $f13, $f22
|
|
+ MUL C, $f13, $f23
|
|
+
|
|
+ LD $f13, 4*SIZE(Y)
|
|
+ MUL S, $f12, $f24
|
|
+ LD $f12, 4*SIZE(X)
|
|
+ MUL C, $f14, $f25
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ MUL S, $f15, $f26
|
|
+ ADD $f21, $f22, $f22
|
|
+ MUL C, $f15, $f27
|
|
+
|
|
+ LD $f15, 5*SIZE(Y)
|
|
+ MUL S, $f14, $f28
|
|
+ SUB $f23, $f24, $f24
|
|
+ ble I, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ MUL C, $f16, $f21
|
|
+ fillcs (PREFETCH_SIZE) * SIZE(X)
|
|
+ unop
|
|
+ LD $f14, 5*SIZE(X)
|
|
+
|
|
+ ST $f22, 0*SIZE(X)
|
|
+ MUL S, $f17, $f22
|
|
+ unop
|
|
+ ADD $f25, $f26, $f26
|
|
+
|
|
+ MUL C, $f17, $f23
|
|
+ fillcs (PREFETCH_SIZE) * SIZE(Y)
|
|
+ unop
|
|
+ LD $f17, 6*SIZE(Y)
|
|
+
|
|
+ ST $f24, 0*SIZE(Y)
|
|
+ MUL S, $f16, $f24
|
|
+ unop
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ MUL C, $f18, $f25
|
|
+ LD $f16, 6*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f26, 1*SIZE(X)
|
|
+ MUL S, $f19, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, $f22
|
|
+
|
|
+ MUL C, $f19, $f27
|
|
+ unop
|
|
+ unop
|
|
+ LD $f19, 7*SIZE(Y)
|
|
+
|
|
+ ST $f28, 1*SIZE(Y)
|
|
+ MUL S, $f18, $f28
|
|
+ unop
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ LD $f18, 7*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f22, 2*SIZE(X)
|
|
+ unop
|
|
+ MUL S, $f13, $f22
|
|
+ ADD $f25, $f26, $f26
|
|
+
|
|
+ MUL C, $f13, $f23
|
|
+ LD $f13, 8*SIZE(Y)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f24, 2*SIZE(Y)
|
|
+ MUL S, $f12, $f24
|
|
+ unop
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ LD $f12, 8*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f26, 3*SIZE(X)
|
|
+ MUL S, $f15, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, $f22
|
|
+
|
|
+ MUL C, $f15, $f27
|
|
+ LD $f15, 9*SIZE(Y)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f28, 3*SIZE(Y)
|
|
+ MUL S, $f14, $f28
|
|
+ unop
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ MUL C, $f16, $f21
|
|
+ LD $f14, 9*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f22, 4*SIZE(X)
|
|
+ MUL S, $f17, $f22
|
|
+ unop
|
|
+ ADD $f25, $f26, $f26
|
|
+
|
|
+ MUL C, $f17, $f23
|
|
+ LD $f17, 10*SIZE(Y)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f24, 4*SIZE(Y)
|
|
+ MUL S, $f16, $f24
|
|
+ unop
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ MUL C, $f18, $f25
|
|
+ LD $f16, 10*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f26, 5*SIZE(X)
|
|
+ MUL S, $f19, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, $f22
|
|
+
|
|
+ MUL C, $f19, $f27
|
|
+ LD $f19, 11*SIZE(Y)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f28, 5*SIZE(Y)
|
|
+ MUL S, $f18, $f28
|
|
+ ldi I, -1(I)
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ LD $f18, 11*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f22, 6*SIZE(X)
|
|
+ MUL S, $f13, $f22
|
|
+ unop
|
|
+ ADD $f25, $f26, $f26
|
|
+
|
|
+ MUL C, $f13, $f23
|
|
+ LD $f13, 12*SIZE(Y)
|
|
+ ldi X, 8*SIZE(X)
|
|
+ unop
|
|
+
|
|
+ ST $f24, 6*SIZE(Y)
|
|
+ MUL S, $f12, $f24
|
|
+ unop
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ LD $f12, 4*SIZE(X)
|
|
+ ldi Y, 8*SIZE(Y)
|
|
+ unop
|
|
+
|
|
+ ST $f26, -1*SIZE(X)
|
|
+ MUL S, $f15, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, $f22
|
|
+
|
|
+ MUL C, $f15, $f27
|
|
+ LD $f15, 5*SIZE(Y)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f28, -1*SIZE(Y)
|
|
+ MUL S, $f14, $f28
|
|
+ SUB $f23, $f24, $f24
|
|
+ bgt I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ MUL C, $f16, $f21
|
|
+ LD $f14, 5*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f22, 0*SIZE(X)
|
|
+ MUL S, $f17, $f22
|
|
+ unop
|
|
+ ADD $f25, $f26, $f26
|
|
+
|
|
+ MUL C, $f17, $f23
|
|
+ unop
|
|
+ unop
|
|
+ LD $f17, 6*SIZE(Y)
|
|
+
|
|
+ ST $f24, 0*SIZE(Y)
|
|
+ MUL S, $f16, $f24
|
|
+ LD $f16, 6*SIZE(X)
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ MUL C, $f18, $f25
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f26, 1*SIZE(X)
|
|
+ MUL S, $f19, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, $f22
|
|
+
|
|
+ MUL C, $f19, $f27
|
|
+ unop
|
|
+ unop
|
|
+ LD $f19, 7*SIZE(Y)
|
|
+
|
|
+ ST $f28, 1*SIZE(Y)
|
|
+ MUL S, $f18, $f28
|
|
+ LD $f18, 7*SIZE(X)
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f22, 2*SIZE(X)
|
|
+ unop
|
|
+ MUL S, $f13, $f22
|
|
+ ADD $f25, $f26, $f26
|
|
+
|
|
+ MUL C, $f13, $f23
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f24, 2*SIZE(Y)
|
|
+ MUL S, $f12, $f24
|
|
+ unop
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f26, 3*SIZE(X)
|
|
+ MUL S, $f15, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, $f22
|
|
+
|
|
+ MUL C, $f15, $f27
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f28, 3*SIZE(Y)
|
|
+ MUL S, $f14, $f28
|
|
+ unop
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ MUL C, $f16, $f21
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f22, 4*SIZE(X)
|
|
+ MUL S, $f17, $f22
|
|
+ unop
|
|
+ ADD $f25, $f26, $f26
|
|
+
|
|
+ MUL C, $f17, $f23
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f24, 4*SIZE(Y)
|
|
+ MUL S, $f16, $f24
|
|
+ unop
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ MUL C, $f18, $f25
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f26, 5*SIZE(X)
|
|
+ MUL S, $f19, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, $f22
|
|
+
|
|
+ MUL C, $f19, $f27
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f28, 5*SIZE(Y)
|
|
+ MUL S, $f18, $f28
|
|
+ unop
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ ST $f22, 6*SIZE(X)
|
|
+ ADD $f25, $f26, $f26
|
|
+ ST $f24, 6*SIZE(Y)
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ ST $f26, 7*SIZE(X)
|
|
+ ldi X, 8*SIZE(X)
|
|
+ ST $f28, 7*SIZE(Y)
|
|
+ ldi Y, 8*SIZE(Y)
|
|
+ .align 4
|
|
+
|
|
+
|
|
+$L15:
|
|
+ and N, 7, I
|
|
+ ble I, $L998
|
|
+ .align 4
|
|
+
|
|
+$L16:
|
|
+ LD $f12, 0*SIZE(X)
|
|
+ LD $f13, 0*SIZE(Y)
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ MUL S, $f13, $f22
|
|
+ MUL C, $f13, $f23
|
|
+ MUL S, $f12, $f24
|
|
+
|
|
+ ADD $f21, $f22, $f25
|
|
+ SUB $f23, $f24, $f26
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ST $f25, 0*SIZE(X)
|
|
+ ldi X, 1 * SIZE(X)
|
|
+ ST $f26, 0*SIZE(Y)
|
|
+ ldi Y, 1 * SIZE(Y)
|
|
+
|
|
+ bgt I, $L16
|
|
+ .align 4
|
|
+
|
|
+$L998:
|
|
+ clr $0
|
|
+ ret
|
|
+ .align 4
|
|
+
|
|
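+/* strided case: elements are INCX/INCY apart; X/Y are read and XX/YY track the store positions */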
+$L50:
|
|
+ mov X, XX
|
|
+ mov Y, YY
|
|
+
|
|
+ sra N, 3, I
|
|
+ ble I, $L55
|
|
+ .align 4
|
|
+
|
|
+$L51:
|
|
+ LD $f12, 0*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f13, 0*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ LD $f14, 0*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f15, 0*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ LD $f16, 0*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f17, 0*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ LD $f18, 0*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f19, 0*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ MUL S, $f13, $f22
|
|
+ MUL C, $f13, $f23
|
|
+ MUL S, $f12, $f24
|
|
+
|
|
+ ADD $f21, $f22, $f22
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ ST $f22, 0*SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST $f24, 0*SIZE(YY)
|
|
+ SXADDQ INCY, YY, YY
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ MUL S, $f15, $f26
|
|
+ MUL C, $f15, $f27
|
|
+ MUL S, $f14, $f28
|
|
+
|
|
+ ADD $f25, $f26, $f26
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ ST $f26, 0*SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST $f28, 0*SIZE(YY)
|
|
+ SXADDQ INCY, YY, YY
|
|
+
|
|
+ MUL C, $f16, $f21
|
|
+ MUL S, $f17, $f22
|
|
+ MUL C, $f17, $f23
|
|
+ MUL S, $f16, $f24
|
|
+
|
|
+ ADD $f21, $f22, $f22
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ ST $f22, 0*SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST $f24, 0*SIZE(YY)
|
|
+ SXADDQ INCY, YY, YY
|
|
+
|
|
+ MUL C, $f18, $f25
|
|
+ MUL S, $f19, $f26
|
|
+ MUL C, $f19, $f27
|
|
+ MUL S, $f18, $f28
|
|
+
|
|
+ ADD $f25, $f26, $f26
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ ST $f26, 0*SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST $f28, 0*SIZE(YY)
|
|
+ SXADDQ INCY, YY, YY
|
|
+
|
|
+
|
|
+ LD $f12, 0*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f13, 0*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ LD $f14, 0*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f15, 0*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ LD $f16, 0*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f17, 0*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ LD $f18, 0*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f19, 0*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ MUL S, $f13, $f22
|
|
+ MUL C, $f13, $f23
|
|
+ MUL S, $f12, $f24
|
|
+
|
|
+ ADD $f21, $f22, $f22
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ ST $f22, 0*SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST $f24, 0*SIZE(YY)
|
|
+ SXADDQ INCY, YY, YY
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ MUL S, $f15, $f26
|
|
+ MUL C, $f15, $f27
|
|
+ MUL S, $f14, $f28
|
|
+
|
|
+ ADD $f25, $f26, $f26
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ ST $f26, 0*SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST $f28, 0*SIZE(YY)
|
|
+ SXADDQ INCY, YY, YY
|
|
+
|
|
+ MUL C, $f16, $f21
|
|
+ MUL S, $f17, $f22
|
|
+ MUL C, $f17, $f23
|
|
+ MUL S, $f16, $f24
|
|
+
|
|
+ ADD $f21, $f22, $f22
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ ST $f22, 0*SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST $f24, 0*SIZE(YY)
|
|
+ SXADDQ INCY, YY, YY
|
|
+
|
|
+ MUL C, $f18, $f25
|
|
+ MUL S, $f19, $f26
|
|
+ MUL C, $f19, $f27
|
|
+ MUL S, $f18, $f28
|
|
+
|
|
+ ADD $f25, $f26, $f26
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ ST $f26, 0*SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST $f28, 0*SIZE(YY)
|
|
+ SXADDQ INCY, YY, YY
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L51
|
|
+ .align 4
|
|
+
|
|
+$L55:
|
|
+ and N, 7, I
|
|
+ ble I, $L999
|
|
+ .align 4
|
|
+
|
|
+$L56:
|
|
+ LD $f12, 0*SIZE(X)
|
|
+ LD $f13, 0*SIZE(Y)
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ MUL S, $f13, $f22
|
|
+ MUL C, $f13, $f23
|
|
+ MUL S, $f12, $f24
|
|
+
|
|
+ ADD $f21, $f22, $f25
|
|
+ SUB $f23, $f24, $f26
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ST $f25, 0*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ ST $f26, 0*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ bgt I, $L56
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ clr $0
|
|
+ ret
|
|
+ EPILOGUE
|
|
diff --git a/kernel/sw_64/rot_simd.S b/kernel/sw_64/rot_simd.S
|
|
new file mode 100644
|
|
index 0000000..99f3e05
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/rot_simd.S
|
|
@@ -0,0 +1,783 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+#define Y $19
|
|
+#define INCY $20
|
|
+#define I $21
|
|
+#define XX $23
|
|
+#define YY $24
|
|
+
|
|
+#define C $f10
|
|
+#define S $f11
|
|
+
|
|
+#define x0 $f12
|
|
+#define x1 $f14
|
|
+#define x2 $f16
|
|
+#define x3 $f18
|
|
+
|
|
+#define y0 $f13
|
|
+#define y1 $f15
|
|
+#define y2 $f17
|
|
+#define y3 $f19
|
|
+
|
|
+#define t0 $f20
|
|
+#define t1 $f21
|
|
+#define t2 $f22
|
|
+#define t3 $f23
|
|
+#define t4 $f24
|
|
+#define t5 $f25
|
|
+#define t6 $f26
|
|
+#define t7 $f27
|
|
+
|
|
+#define PREFETCHSIZE 80
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+ .frame $sp, 0, $26, 0
|
|
+
|
|
+#ifndef PROFILE
|
|
+ .prologue 0
|
|
+#else
|
|
+ .prologue 1
|
|
+#endif
|
|
+
|
|
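+/* rotation parameters: c arrives in $f21, s is read from the stack */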
+ fmov $f21, C
|
|
+ LD S, 0($sp)
|
|
+
|
|
+ cmpeq INCX, 1, $23
|
|
+ cmpeq INCY, 1, $24
|
|
+ ble N, $L998
|
|
+
|
|
+ and $23, $24, $23
|
|
+ beq $23, $L50	# incx != 1 or incy != 1
|
|
+
|
|
+/* test the alignment of X and Y */
|
|
+ and X, (VEC_LEN*SIZE-1), $3
|
|
+ and Y, (VEC_LEN*SIZE-1), $4
|
|
+ or $3, $4, $4
|
|
+ bne $4, $UnAlign_ACCESS
|
|
+
|
|
+/* aligned access path */
|
|
+ sra N, 4, I
|
|
+ ble I, $Remain
|
|
+
|
|
+ vcpyf C, C
|
|
+ vcpyf S, S
|
|
+
|
|
+ VLD x0, 0*VEC_LEN*SIZE(X)
|
|
+ VLD x1, 1*VEC_LEN*SIZE(X)
|
|
+ VLD x2, 2*VEC_LEN*SIZE(X)
|
|
+ VLD x3, 3*VEC_LEN*SIZE(X)
|
|
+
|
|
+ VLD y0, 0*VEC_LEN*SIZE(Y)
|
|
+ VLD y1, 1*VEC_LEN*SIZE(Y)
|
|
+ VLD y2, 2*VEC_LEN*SIZE(Y)
|
|
+ VLD y3, 3*VEC_LEN*SIZE(Y)
|
|
+
|
|
+ addl X, 16 * SIZE, X
|
|
+ addl Y, 16 * SIZE, Y
|
|
+ subl I, 1, I
|
|
+ ble I, $MainLoopEnd
|
|
+ .align 4
|
|
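+/* SIMD main loop: 16 elements of X and Y per pass; new x = c*x + s*y, new y = c*y - s*x via vector multiply-add/subtract */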
+$MainLoop:
|
|
+ VMUL C, x0, t0
|
|
+ fillcs (PREFETCHSIZE) * SIZE(X)
|
|
+ VMUL C, x1, t1
|
|
+ fillcs (PREFETCHSIZE) * SIZE(Y)
|
|
+
|
|
+ VMUL C, x2, t2
|
|
+ subl I, 1, I
|
|
+ VMUL C, x3, t3
|
|
+ nop
|
|
+
|
|
+ VMUL S, x0, t4
|
|
+ VLD x0, 0*VEC_LEN*SIZE(X)
|
|
+ VMUL S, x1, t5
|
|
+ VLD x1, 1*VEC_LEN*SIZE(X)
|
|
+
|
|
+ VMUL S, x2, t6
|
|
+ VLD x2, 2*VEC_LEN*SIZE(X)
|
|
+ VMUL S, x3, t7
|
|
+ VLD x3, 3*VEC_LEN*SIZE(X)
|
|
+
|
|
+ VMAD S, y0, t0, t0
|
|
+ VMAD S, y1, t1, t1
|
|
+ VMAD S, y2, t2, t2
|
|
+ VMAD S, y3, t3, t3
|
|
+
|
|
+ VMSUB C, y0, t4, t4
|
|
+ VLD y0, 0*VEC_LEN*SIZE(Y)
|
|
+ VMSUB C, y1, t5, t5
|
|
+ VLD y1, 1*VEC_LEN*SIZE(Y)
|
|
+
|
|
+ VMSUB C, y2, t6, t6
|
|
+ VLD y2, 2*VEC_LEN*SIZE(Y)
|
|
+ VMSUB C, y3, t7, t7
|
|
+ VLD y3, 3*VEC_LEN*SIZE(Y)
|
|
+
|
|
+ VST t0, -4*VEC_LEN*SIZE(X)
|
|
+ VST t1, -3*VEC_LEN*SIZE(X)
|
|
+ VST t2, -2*VEC_LEN*SIZE(X)
|
|
+ VST t3, -1*VEC_LEN*SIZE(X)
|
|
+
|
|
+ VST t4, -4*VEC_LEN*SIZE(Y)
|
|
+ VST t5, -3*VEC_LEN*SIZE(Y)
|
|
+ VST t6, -2*VEC_LEN*SIZE(Y)
|
|
+ VST t7, -1*VEC_LEN*SIZE(Y)
|
|
+
|
|
+ addl X, 16 * SIZE, X
|
|
+ addl Y, 16 * SIZE, Y
|
|
+ nop
|
|
+ bgt I, $MainLoop
|
|
+ .align 4
|
|
+$MainLoopEnd:
|
|
+ VMUL C, x0, t0
|
|
+ VMUL C, x1, t1
|
|
+ VMUL C, x2, t2
|
|
+ VMUL C, x3, t3
|
|
+
|
|
+ VMUL S, x0, t4
|
|
+ VMUL S, x1, t5
|
|
+ VMUL S, x2, t6
|
|
+ VMUL S, x3, t7
|
|
+
|
|
+ VMAD S, y0, t0, t0
|
|
+ VMAD S, y1, t1, t1
|
|
+ VMAD S, y2, t2, t2
|
|
+ VMAD S, y3, t3, t3
|
|
+
|
|
+ VMSUB C, y0, t4, t4
|
|
+ VMSUB C, y1, t5, t5
|
|
+ VMSUB C, y2, t6, t6
|
|
+ VMSUB C, y3, t7, t7
|
|
+
|
|
+ VST t0, -4*VEC_LEN*SIZE(X)
|
|
+ VST t1, -3*VEC_LEN*SIZE(X)
|
|
+ VST t2, -2*VEC_LEN*SIZE(X)
|
|
+ VST t3, -1*VEC_LEN*SIZE(X)
|
|
+
|
|
+ VST t4, -4*VEC_LEN*SIZE(Y)
|
|
+ VST t5, -3*VEC_LEN*SIZE(Y)
|
|
+ VST t6, -2*VEC_LEN*SIZE(Y)
|
|
+ VST t7, -1*VEC_LEN*SIZE(Y)
|
|
+
|
|
+ .align 4
|
|
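+/* scalar tail: rotate the remaining N mod 16 elements one at a time with MUL/MAD/MSUB */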
+$Remain:
|
|
+ and N, 15, I
|
|
+ ble I, $End
|
|
+$RemainLoop:
|
|
+ LD $f12, 0*SIZE(X)
|
|
+ LD $f13, 0*SIZE(Y)
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ MUL S, $f12, $f24
|
|
+ MAD S, $f13, $f21, $f25
|
|
+ MSUB C, $f13, $f24, $f26
|
|
+
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ST $f25, 0*SIZE(X)
|
|
+ ldi X, 1 * SIZE(X)
|
|
+ ST $f26, 0*SIZE(Y)
|
|
+
|
|
+ ldi Y, 1 * SIZE(Y)
|
|
+ bgt I, $RemainLoop
|
|
+
|
|
+ .align 4
|
|
+$End:
|
|
+ clr $0
|
|
+ ret
|
|
+ .align 4
|
|
+
|
|
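+/* X or Y is not VEC_LEN*SIZE aligned: fall back to the scalar path, unrolled by 8 */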
+$UnAlign_ACCESS:
|
|
+
|
|
+ sra N, 3, I
|
|
+ ble I, $L15
|
|
+
|
|
+ LD $f12, 0*SIZE(X)
|
|
+ LD $f13, 0*SIZE(Y)
|
|
+ LD $f14, 1*SIZE(X)
|
|
+ LD $f15, 1*SIZE(Y)
|
|
+
|
|
+ LD $f16, 2*SIZE(X)
|
|
+ LD $f17, 2*SIZE(Y)
|
|
+ LD $f18, 3*SIZE(X)
|
|
+ LD $f19, 3*SIZE(Y)
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ unop
|
|
+ MUL S, $f13, $f22
|
|
+ MUL C, $f13, $f23
|
|
+
|
|
+ LD $f13, 4*SIZE(Y)
|
|
+ MUL S, $f12, $f24
|
|
+ LD $f12, 4*SIZE(X)
|
|
+ MUL C, $f14, $f25
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ MUL S, $f15, $f26
|
|
+ ADD $f21, $f22, $f22
|
|
+ MUL C, $f15, $f27
|
|
+
|
|
+ LD $f15, 5*SIZE(Y)
|
|
+ MUL S, $f14, $f28
|
|
+ SUB $f23, $f24, $f24
|
|
+ ble I, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ MUL C, $f16, $f21
|
|
+ fillcs (PREFETCHSIZE) * SIZE(X)
|
|
+ unop
|
|
+ LD $f14, 5*SIZE(X)
|
|
+
|
|
+ ST $f22, 0*SIZE(X)
|
|
+ MUL S, $f17, $f22
|
|
+ unop
|
|
+ ADD $f25, $f26, $f26
|
|
+
|
|
+ MUL C, $f17, $f23
|
|
+ fillcs (PREFETCHSIZE) * SIZE(Y)
|
|
+ unop
|
|
+ LD $f17, 6*SIZE(Y)
|
|
+
|
|
+ ST $f24, 0*SIZE(Y)
|
|
+ MUL S, $f16, $f24
|
|
+ unop
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ MUL C, $f18, $f25
|
|
+ LD $f16, 6*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f26, 1*SIZE(X)
|
|
+ MUL S, $f19, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, $f22
|
|
+
|
|
+ MUL C, $f19, $f27
|
|
+ unop
|
|
+ unop
|
|
+ LD $f19, 7*SIZE(Y)
|
|
+
|
|
+ ST $f28, 1*SIZE(Y)
|
|
+ MUL S, $f18, $f28
|
|
+ unop
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ LD $f18, 7*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f22, 2*SIZE(X)
|
|
+ unop
|
|
+ MUL S, $f13, $f22
|
|
+ ADD $f25, $f26, $f26
|
|
+
|
|
+ MUL C, $f13, $f23
|
|
+ LD $f13, 8*SIZE(Y)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f24, 2*SIZE(Y)
|
|
+ MUL S, $f12, $f24
|
|
+ unop
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ LD $f12, 8*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f26, 3*SIZE(X)
|
|
+ MUL S, $f15, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, $f22
|
|
+
|
|
+ MUL C, $f15, $f27
|
|
+ LD $f15, 9*SIZE(Y)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f28, 3*SIZE(Y)
|
|
+ MUL S, $f14, $f28
|
|
+ unop
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ MUL C, $f16, $f21
|
|
+ LD $f14, 9*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f22, 4*SIZE(X)
|
|
+ MUL S, $f17, $f22
|
|
+ unop
|
|
+ ADD $f25, $f26, $f26
|
|
+
|
|
+ MUL C, $f17, $f23
|
|
+ LD $f17, 10*SIZE(Y)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f24, 4*SIZE(Y)
|
|
+ MUL S, $f16, $f24
|
|
+ unop
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ MUL C, $f18, $f25
|
|
+ LD $f16, 10*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f26, 5*SIZE(X)
|
|
+ MUL S, $f19, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, $f22
|
|
+
|
|
+ MUL C, $f19, $f27
|
|
+ LD $f19, 11*SIZE(Y)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f28, 5*SIZE(Y)
|
|
+ MUL S, $f18, $f28
|
|
+ ldi I, -1(I)
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ LD $f18, 11*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f22, 6*SIZE(X)
|
|
+ MUL S, $f13, $f22
|
|
+ unop
|
|
+ ADD $f25, $f26, $f26
|
|
+
|
|
+ MUL C, $f13, $f23
|
|
+ LD $f13, 12*SIZE(Y)
|
|
+ ldi X, 8*SIZE(X)
|
|
+ unop
|
|
+
|
|
+ ST $f24, 6*SIZE(Y)
|
|
+ MUL S, $f12, $f24
|
|
+ unop
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ LD $f12, 4*SIZE(X)
|
|
+ ldi Y, 8*SIZE(Y)
|
|
+ unop
|
|
+
|
|
+ ST $f26, -1*SIZE(X)
|
|
+ MUL S, $f15, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, $f22
|
|
+
|
|
+ MUL C, $f15, $f27
|
|
+ LD $f15, 5*SIZE(Y)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f28, -1*SIZE(Y)
|
|
+ MUL S, $f14, $f28
|
|
+ SUB $f23, $f24, $f24
|
|
+ bgt I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ MUL C, $f16, $f21
|
|
+ LD $f14, 5*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f22, 0*SIZE(X)
|
|
+ MUL S, $f17, $f22
|
|
+ unop
|
|
+ ADD $f25, $f26, $f26
|
|
+
|
|
+ MUL C, $f17, $f23
|
|
+ unop
|
|
+ unop
|
|
+ LD $f17, 6*SIZE(Y)
|
|
+
|
|
+ ST $f24, 0*SIZE(Y)
|
|
+ MUL S, $f16, $f24
|
|
+ LD $f16, 6*SIZE(X)
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ MUL C, $f18, $f25
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f26, 1*SIZE(X)
|
|
+ MUL S, $f19, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, $f22
|
|
+
|
|
+ MUL C, $f19, $f27
|
|
+ unop
|
|
+ unop
|
|
+ LD $f19, 7*SIZE(Y)
|
|
+
|
|
+ ST $f28, 1*SIZE(Y)
|
|
+ MUL S, $f18, $f28
|
|
+ LD $f18, 7*SIZE(X)
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f22, 2*SIZE(X)
|
|
+ unop
|
|
+ MUL S, $f13, $f22
|
|
+ ADD $f25, $f26, $f26
|
|
+
|
|
+ MUL C, $f13, $f23
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f24, 2*SIZE(Y)
|
|
+ MUL S, $f12, $f24
|
|
+ unop
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f26, 3*SIZE(X)
|
|
+ MUL S, $f15, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, $f22
|
|
+
|
|
+ MUL C, $f15, $f27
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f28, 3*SIZE(Y)
|
|
+ MUL S, $f14, $f28
|
|
+ unop
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ MUL C, $f16, $f21
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f22, 4*SIZE(X)
|
|
+ MUL S, $f17, $f22
|
|
+ unop
|
|
+ ADD $f25, $f26, $f26
|
|
+
|
|
+ MUL C, $f17, $f23
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f24, 4*SIZE(Y)
|
|
+ MUL S, $f16, $f24
|
|
+ unop
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ MUL C, $f18, $f25
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f26, 5*SIZE(X)
|
|
+ MUL S, $f19, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, $f22
|
|
+
|
|
+ MUL C, $f19, $f27
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f28, 5*SIZE(Y)
|
|
+ MUL S, $f18, $f28
|
|
+ unop
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ ST $f22, 6*SIZE(X)
|
|
+ ADD $f25, $f26, $f26
|
|
+ ST $f24, 6*SIZE(Y)
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ ST $f26, 7*SIZE(X)
|
|
+ ldi X, 8*SIZE(X)
|
|
+ ST $f28, 7*SIZE(Y)
|
|
+ ldi Y, 8*SIZE(Y)
|
|
+ .align 4
|
|
+
|
|
+
|
|
+$L15:
|
|
+ and N, 7, I
|
|
+ ble I, $L998
|
|
+ .align 4
|
|
+
|
|
+$L16:
|
|
+ LD $f12, 0*SIZE(X)
|
|
+ LD $f13, 0*SIZE(Y)
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ MUL S, $f13, $f22
|
|
+ MUL C, $f13, $f23
|
|
+ MUL S, $f12, $f24
|
|
+
|
|
+ ADD $f21, $f22, $f25
|
|
+ SUB $f23, $f24, $f26
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ST $f25, 0*SIZE(X)
|
|
+ ldi X, 1 * SIZE(X)
|
|
+ ST $f26, 0*SIZE(Y)
|
|
+ ldi Y, 1 * SIZE(Y)
|
|
+
|
|
+ bgt I, $L16
|
|
+ .align 4
|
|
+
|
|
+$L998:
|
|
+ clr $0
|
|
+ ret
|
|
+ .align 4
|
|
+
|
|
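+/* strided case (incx != 1 or incy != 1): advance X/Y with SXADDQ and write the results back through XX/YY */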
+$L50:
|
|
+ mov X, XX
|
|
+ mov Y, YY
|
|
+
|
|
+ sra N, 3, I
|
|
+ ble I, $L55
|
|
+ .align 4
|
|
+
|
|
+$L51:
|
|
+ LD $f12, 0*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f13, 0*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ LD $f14, 0*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f15, 0*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ LD $f16, 0*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f17, 0*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ LD $f18, 0*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f19, 0*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ MUL S, $f13, $f22
|
|
+ MUL C, $f13, $f23
|
|
+ MUL S, $f12, $f24
|
|
+
|
|
+ ADD $f21, $f22, $f22
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ ST $f22, 0*SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST $f24, 0*SIZE(YY)
|
|
+ SXADDQ INCY, YY, YY
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ MUL S, $f15, $f26
|
|
+ MUL C, $f15, $f27
|
|
+ MUL S, $f14, $f28
|
|
+
|
|
+ ADD $f25, $f26, $f26
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ ST $f26, 0*SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST $f28, 0*SIZE(YY)
|
|
+ SXADDQ INCY, YY, YY
|
|
+
|
|
+ MUL C, $f16, $f21
|
|
+ MUL S, $f17, $f22
|
|
+ MUL C, $f17, $f23
|
|
+ MUL S, $f16, $f24
|
|
+
|
|
+ ADD $f21, $f22, $f22
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ ST $f22, 0*SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST $f24, 0*SIZE(YY)
|
|
+ SXADDQ INCY, YY, YY
|
|
+
|
|
+ MUL C, $f18, $f25
|
|
+ MUL S, $f19, $f26
|
|
+ MUL C, $f19, $f27
|
|
+ MUL S, $f18, $f28
|
|
+
|
|
+ ADD $f25, $f26, $f26
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ ST $f26, 0*SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST $f28, 0*SIZE(YY)
|
|
+ SXADDQ INCY, YY, YY
|
|
+
|
|
+
|
|
+ LD $f12, 0*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f13, 0*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ LD $f14, 0*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f15, 0*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ LD $f16, 0*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f17, 0*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ LD $f18, 0*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f19, 0*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ MUL S, $f13, $f22
|
|
+ MUL C, $f13, $f23
|
|
+ MUL S, $f12, $f24
|
|
+
|
|
+ ADD $f21, $f22, $f22
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ ST $f22, 0*SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST $f24, 0*SIZE(YY)
|
|
+ SXADDQ INCY, YY, YY
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ MUL S, $f15, $f26
|
|
+ MUL C, $f15, $f27
|
|
+ MUL S, $f14, $f28
|
|
+
|
|
+ ADD $f25, $f26, $f26
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ ST $f26, 0*SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST $f28, 0*SIZE(YY)
|
|
+ SXADDQ INCY, YY, YY
|
|
+
|
|
+ MUL C, $f16, $f21
|
|
+ MUL S, $f17, $f22
|
|
+ MUL C, $f17, $f23
|
|
+ MUL S, $f16, $f24
|
|
+
|
|
+ ADD $f21, $f22, $f22
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ ST $f22, 0*SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST $f24, 0*SIZE(YY)
|
|
+ SXADDQ INCY, YY, YY
|
|
+
|
|
+ MUL C, $f18, $f25
|
|
+ MUL S, $f19, $f26
|
|
+ MUL C, $f19, $f27
|
|
+ MUL S, $f18, $f28
|
|
+
|
|
+ ADD $f25, $f26, $f26
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ ST $f26, 0*SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST $f28, 0*SIZE(YY)
|
|
+ SXADDQ INCY, YY, YY
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L51
|
|
+ .align 4
|
|
+
|
|
+$L55:
|
|
+ and N, 7, I
|
|
+ ble I, $L999
|
|
+ .align 4
|
|
+
|
|
+$L56:
|
|
+ LD $f12, 0*SIZE(X)
|
|
+ LD $f13, 0*SIZE(Y)
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ MUL S, $f13, $f22
|
|
+ MUL C, $f13, $f23
|
|
+ MUL S, $f12, $f24
|
|
+
|
|
+ ADD $f21, $f22, $f25
|
|
+ SUB $f23, $f24, $f26
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ST $f25, 0*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ ST $f26, 0*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ bgt I, $L56
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ clr $0
|
|
+ ret
|
|
+ EPILOGUE
|
|
diff --git a/kernel/sw_64/scal-sw.S.bak b/kernel/sw_64/scal-sw.S.bak
|
|
new file mode 100644
|
|
index 0000000..f8da324
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/scal-sw.S.bak
|
|
@@ -0,0 +1,480 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCHSIZE 88
|
|
+
|
|
+#define N $16
|
|
+#define X $20
|
|
+#define INCX $21
|
|
+
|
|
+#define XX $18
|
|
+#define I $19
|
|
+
|
|
+#define ALPHA $f19
|
|
+
|
|
+#define s0 $f0
|
|
+#define s1 $f1
|
|
+#define s2 $f10
|
|
+#define s3 $f11
|
|
+
|
|
+#define a0 $f12
|
|
+#define a1 $f13
|
|
+#define a2 $f14
|
|
+#define a3 $f15
|
|
+#define a4 $f16
|
|
+#define a5 $f17
|
|
+#define a6 $f18
|
|
+#define a7 $f21
|
|
+
|
|
+#define t0 $f22
|
|
+#define t1 $f23
|
|
+#define t2 $f24
|
|
+#define t3 $f25
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+
|
|
+ mov X, XX
|
|
+ ble N, $L999
|
|
+
|
|
+ cmpeq INCX, 1, $0
|
|
+ beq $0, $L20
|
|
+
|
|
+#ifndef DOUBLE
|
|
+ sra N, 4, I
|
|
+ ble I, $L15
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ LD a1, 1 * SIZE(X)
|
|
+ LD a2, 2 * SIZE(X)
|
|
+ LD a3, 3 * SIZE(X)
|
|
+
|
|
+ LD a4, 4 * SIZE(X)
|
|
+ MUL a0, ALPHA, t0
|
|
+ LD a5, 5 * SIZE(X)
|
|
+ MUL a1, ALPHA, t1
|
|
+ LD a6, 6 * SIZE(X)
|
|
+ MUL a2, ALPHA, t2
|
|
+ LD a7, 7 * SIZE(X)
|
|
+ MUL a3, ALPHA, t3
|
|
+
|
|
+ ST t0, 0 * SIZE(X)
|
|
+ MUL a4, ALPHA, t0
|
|
+ ST t1, 1 * SIZE(X)
|
|
+ MUL a5, ALPHA, t1
|
|
+
|
|
+ ST t2, 2 * SIZE(X)
|
|
+ MUL a6, ALPHA, t2
|
|
+ ST t3, 3 * SIZE(X)
|
|
+ MUL a7, ALPHA, t3
|
|
+
|
|
+ LD a0, 8 * SIZE(X)
|
|
+ LD a1, 9 * SIZE(X)
|
|
+ LD a2, 10 * SIZE(X)
|
|
+ LD a3, 11 * SIZE(X)
|
|
+
|
|
+ ST t0, 4 * SIZE(X)
|
|
+ MUL a0, ALPHA, t0
|
|
+ ST t1, 5 * SIZE(X)
|
|
+ MUL a1, ALPHA, t1
|
|
+
|
|
+ ST t2, 6 * SIZE(X)
|
|
+ MUL a2, ALPHA, t2
|
|
+ ST t3, 7 * SIZE(X)
|
|
+ MUL a3, ALPHA, t3
|
|
+
|
|
+ LD a4, 12 * SIZE(X)
|
|
+ LD a5, 13 * SIZE(X)
|
|
+ LD a6, 14 * SIZE(X)
|
|
+ LD a7, 15 * SIZE(X)
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ ST t0, 8 * SIZE(X)
|
|
+ MUL a4, ALPHA, t0
|
|
+ ST t1, 9 * SIZE(X)
|
|
+ MUL a5, ALPHA, t1
|
|
+
|
|
+ ST t2, 10 * SIZE(X)
|
|
+ MUL a6, ALPHA, t2
|
|
+ ST t3, 11 * SIZE(X)
|
|
+ MUL a7, ALPHA, t3
|
|
+
|
|
+ LD a0, 16 * SIZE(X)
|
|
+ LD a1, 17 * SIZE(X)
|
|
+ LD a2, 18 * SIZE(X)
|
|
+ LD a3, 19 * SIZE(X)
|
|
+
|
|
+ ST t0, 12 * SIZE(X)
|
|
+ MUL a0, ALPHA, t0
|
|
+ ST t1, 13 * SIZE(X)
|
|
+ MUL a1, ALPHA, t1
|
|
+
|
|
+ ST t2, 14 * SIZE(X)
|
|
+ MUL a2, ALPHA, t2
|
|
+ ST t3, 15 * SIZE(X)
|
|
+ MUL a3, ALPHA, t3
|
|
+
|
|
+ LD a4, 20 * SIZE(X)
|
|
+ LD a5, 21 * SIZE(X)
|
|
+ LD a6, 22 * SIZE(X)
|
|
+ LD a7, 23 * SIZE(X)
|
|
+
|
|
+ ST t0, 16 * SIZE(X)
|
|
+ MUL a4, ALPHA, t0
|
|
+ ST t1, 17 * SIZE(X)
|
|
+ MUL a5, ALPHA, t1
|
|
+
|
|
+ ST t2, 18 * SIZE(X)
|
|
+ MUL a6, ALPHA, t2
|
|
+ ST t3, 19 * SIZE(X)
|
|
+ MUL a7, ALPHA, t3
|
|
+
|
|
+ LD a0, 24 * SIZE(X)
|
|
+ LD a1, 25 * SIZE(X)
|
|
+ LD a2, 26 * SIZE(X)
|
|
+ LD a3, 27 * SIZE(X)
|
|
+
|
|
+ ST t0, 20 * SIZE(X)
|
|
+ MUL a0, ALPHA, t0
|
|
+ ST t1, 21 * SIZE(X)
|
|
+ MUL a1, ALPHA, t1
|
|
+
|
|
+ ST t2, 22 * SIZE(X)
|
|
+ MUL a2, ALPHA, t2
|
|
+ ST t3, 23 * SIZE(X)
|
|
+ MUL a3, ALPHA, t3
|
|
+
|
|
+ LD a4, 28 * SIZE(X)
|
|
+ LD a5, 29 * SIZE(X)
|
|
+ LD a6, 30 * SIZE(X)
|
|
+ LD a7, 31 * SIZE(X)
|
|
+
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ ldi I, -1(I)
|
|
+ addl X, 16 * SIZE, X
|
|
+ bne I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ ST t0, 8 * SIZE(X)
|
|
+ MUL a4, ALPHA, t0
|
|
+ ST t1, 9 * SIZE(X)
|
|
+ MUL a5, ALPHA, t1
|
|
+
|
|
+ ST t2, 10 * SIZE(X)
|
|
+ MUL a6, ALPHA, t2
|
|
+ ST t3, 11 * SIZE(X)
|
|
+ MUL a7, ALPHA, t3
|
|
+
|
|
+ ST t0, 12 * SIZE(X)
|
|
+ ST t1, 13 * SIZE(X)
|
|
+ ST t2, 14 * SIZE(X)
|
|
+ ST t3, 15 * SIZE(X)
|
|
+ addl X, 16 * SIZE, X
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and N, 15, I
|
|
+
|
|
+#else
|
|
+
|
|
+ sra N, 3, I
|
|
+ ble I, $L15
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ LD a1, 1 * SIZE(X)
|
|
+ LD a2, 2 * SIZE(X)
|
|
+ LD a3, 3 * SIZE(X)
|
|
+
|
|
+ LD a4, 4 * SIZE(X)
|
|
+ MUL a0, ALPHA, t0
|
|
+ LD a5, 5 * SIZE(X)
|
|
+ MUL a1, ALPHA, t1
|
|
+
|
|
+ LD a6, 6 * SIZE(X)
|
|
+ MUL a2, ALPHA, t2
|
|
+ LD a7, 7 * SIZE(X)
|
|
+ MUL a3, ALPHA, t3
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ ST t0, 0 * SIZE(X)
|
|
+ MUL a4, ALPHA, t0
|
|
+ ST t1, 1 * SIZE(X)
|
|
+ MUL a5, ALPHA, t1
|
|
+
|
|
+ ST t2, 2 * SIZE(X)
|
|
+ MUL a6, ALPHA, t2
|
|
+ ST t3, 3 * SIZE(X)
|
|
+ MUL a7, ALPHA, t3
|
|
+
|
|
+ LD a0, 8 * SIZE(X)
|
|
+ ldi I, -1(I)
|
|
+ LD a1, 9 * SIZE(X)
|
|
+ addl X, 8 * SIZE, X
|
|
+
|
|
+ LD a2, 2 * SIZE(X)
|
|
+ LD a3, 3 * SIZE(X)
|
|
+
|
|
+ ST t0, -4 * SIZE(X)
|
|
+ MUL a0, ALPHA, t0
|
|
+ ST t1, -3 * SIZE(X)
|
|
+ MUL a1, ALPHA, t1
|
|
+
|
|
+ ST t2, -2 * SIZE(X)
|
|
+ MUL a2, ALPHA, t2
|
|
+ ST t3, -1 * SIZE(X)
|
|
+ MUL a3, ALPHA, t3
|
|
+
|
|
+ LD a4, 4 * SIZE(X)
|
|
+ LD a5, 5 * SIZE(X)
|
|
+
|
|
+ LD a6, 6 * SIZE(X)
|
|
+ LD a7, 7 * SIZE(X)
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ bne I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ ST t0, 0 * SIZE(X)
|
|
+ MUL a4, ALPHA, t0
|
|
+ ST t1, 1 * SIZE(X)
|
|
+ MUL a5, ALPHA, t1
|
|
+
|
|
+ ST t2, 2 * SIZE(X)
|
|
+ MUL a6, ALPHA, t2
|
|
+ ST t3, 3 * SIZE(X)
|
|
+ MUL a7, ALPHA, t3
|
|
+
|
|
+ ST t0, 4 * SIZE(X)
|
|
+ ST t1, 5 * SIZE(X)
|
|
+ ST t2, 6 * SIZE(X)
|
|
+ ST t3, 7 * SIZE(X)
|
|
+ addl X, 8 * SIZE, X
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and N, 7, I
|
|
+
|
|
+#endif
|
|
+
|
|
+ unop
|
|
+ unop
|
|
+ ble I, $L999
|
|
+ .align 4
|
|
+
|
|
+$L17:
|
|
+ LD a0, 0 * SIZE(X)
|
|
+
|
|
+ MUL a0, ALPHA, t0
|
|
+
|
|
+ ST t0, 0 * SIZE(X)
|
|
+
|
|
+ addl X, SIZE, X
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bne I, $L17
|
|
+ ret
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ sra N, 3, I
|
|
+ ble I, $L25
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD a1, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD a3, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ MUL a0, ALPHA, t0
|
|
+ ldi I, -1(I)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD a5, 0 * SIZE(X)
|
|
+ MUL a1, ALPHA, t1
|
|
+ SXADDQ INCX, X, X
|
|
+ unop
|
|
+
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ MUL a2, ALPHA, t2
|
|
+ SXADDQ INCX, X, X
|
|
+ unop
|
|
+
|
|
+ LD a7, 0 * SIZE(X)
|
|
+ MUL a3, ALPHA, t3
|
|
+ SXADDQ INCX, X, X
|
|
+ ble I, $L23
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ ST t0, 0 * SIZE(XX)
|
|
+ MUL a4, ALPHA, t0
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ SXADDQ INCX, XX, XX
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ ldi I, -1(I)
|
|
+ unop
|
|
+
|
|
+ ST t1, 0 * SIZE(XX)
|
|
+ MUL a5, ALPHA, t1
|
|
+ SXADDQ INCX, XX, XX
|
|
+ unop
|
|
+
|
|
+ LD a1, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ST t2, 0 * SIZE(XX)
|
|
+ MUL a6, ALPHA, t2
|
|
+ SXADDQ INCX, XX, XX
|
|
+ unop
|
|
+
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ST t3, 0 * SIZE(XX)
|
|
+ MUL a7, ALPHA, t3
|
|
+ SXADDQ INCX, XX, XX
|
|
+ unop
|
|
+
|
|
+ LD a3, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ST t0, 0 * SIZE(XX)
|
|
+ MUL a0, ALPHA, t0
|
|
+ SXADDQ INCX, XX, XX
|
|
+ unop
|
|
+
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ST t1, 0 * SIZE(XX)
|
|
+ MUL a1, ALPHA, t1
|
|
+ SXADDQ INCX, XX, XX
|
|
+ unop
|
|
+
|
|
+ LD a5, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ST t2, 0 * SIZE(XX)
|
|
+ MUL a2, ALPHA, t2
|
|
+ SXADDQ INCX, XX, XX
|
|
+ unop
|
|
+
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ST t3, 0 * SIZE(XX)
|
|
+ MUL a3, ALPHA, t3
|
|
+ SXADDQ INCX, XX, XX
|
|
+ unop
|
|
+
|
|
+ LD a7, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ unop
|
|
+ bne I, $L22
|
|
+ .align 4
|
|
+
|
|
+$L23:
|
|
+ ST t0, 0 * SIZE(XX)
|
|
+ MUL a4, ALPHA, t0
|
|
+ SXADDQ INCX, XX, XX
|
|
+
|
|
+ ST t1, 0 * SIZE(XX)
|
|
+ MUL a5, ALPHA, t1
|
|
+ SXADDQ INCX, XX, XX
|
|
+
|
|
+ ST t2, 0 * SIZE(XX)
|
|
+ MUL a6, ALPHA, t2
|
|
+ SXADDQ INCX, XX, XX
|
|
+
|
|
+ ST t3, 0 * SIZE(XX)
|
|
+ MUL a7, ALPHA, t3
|
|
+ SXADDQ INCX, XX, XX
|
|
+
|
|
+ ST t0, 0 * SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST t1, 0 * SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST t2, 0 * SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST t3, 0 * SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ and N, 7, I
|
|
+ unop
|
|
+ unop
|
|
+ ble I, $L999
|
|
+ .align 4
|
|
+
|
|
+$L27:
|
|
+ LD a0, 0 * SIZE(X)
|
|
+
|
|
+ MUL a0, ALPHA, t0
|
|
+
|
|
+ ST t0, 0 * SIZE(XX)
|
|
+
|
|
+ SXADDQ INCX, X, X
|
|
+ SXADDQ INCX, XX, XX
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bne I, $L27
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ ret
|
|
+ EPILOGUE
|
|
diff --git a/kernel/sw_64/scal.S b/kernel/sw_64/scal.S
|
|
new file mode 100644
|
|
index 0000000..87b89c9
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/scal.S
|
|
@@ -0,0 +1,480 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCHSIZE 88
|
|
+
|
|
+#define N $16
|
|
+#define X $20
|
|
+#define INCX $21
|
|
+
|
|
+#define XX $18
|
|
+#define I $19
|
|
+
|
|
+#define ALPHA $f19
|
|
+
|
|
+#define s0 $f0
|
|
+#define s1 $f1
|
|
+#define s2 $f10
|
|
+#define s3 $f11
|
|
+
|
|
+#define a0 $f12
|
|
+#define a1 $f13
|
|
+#define a2 $f14
|
|
+#define a3 $f15
|
|
+#define a4 $f16
|
|
+#define a5 $f17
|
|
+#define a6 $f18
|
|
+#define a7 $f21
|
|
+
|
|
+#define t0 $f22
|
|
+#define t1 $f23
|
|
+#define t2 $f24
|
|
+#define t3 $f25
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+
|
|
+ mov X, XX
|
|
+ ble N, $L999
|
|
+
|
|
+ cmpeq INCX, 1, $0
|
|
+ beq $0, $L20
|
|
+
|
|
+#ifndef DOUBLE
|
|
+ sra N, 4, I
|
|
+ ble I, $L15
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ LD a1, 1 * SIZE(X)
|
|
+ LD a2, 2 * SIZE(X)
|
|
+ LD a3, 3 * SIZE(X)
|
|
+
|
|
+ LD a4, 4 * SIZE(X)
|
|
+ MUL a0, ALPHA, t0
|
|
+ LD a5, 5 * SIZE(X)
|
|
+ MUL a1, ALPHA, t1
|
|
+ LD a6, 6 * SIZE(X)
|
|
+ MUL a2, ALPHA, t2
|
|
+ LD a7, 7 * SIZE(X)
|
|
+ MUL a3, ALPHA, t3
|
|
+
|
|
+ ST t0, 0 * SIZE(X)
|
|
+ MUL a4, ALPHA, t0
|
|
+ ST t1, 1 * SIZE(X)
|
|
+ MUL a5, ALPHA, t1
|
|
+
|
|
+ ST t2, 2 * SIZE(X)
|
|
+ MUL a6, ALPHA, t2
|
|
+ ST t3, 3 * SIZE(X)
|
|
+ MUL a7, ALPHA, t3
|
|
+
|
|
+ LD a0, 8 * SIZE(X)
|
|
+ LD a1, 9 * SIZE(X)
|
|
+ LD a2, 10 * SIZE(X)
|
|
+ LD a3, 11 * SIZE(X)
|
|
+
|
|
+ ST t0, 4 * SIZE(X)
|
|
+ MUL a0, ALPHA, t0
|
|
+ ST t1, 5 * SIZE(X)
|
|
+ MUL a1, ALPHA, t1
|
|
+
|
|
+ ST t2, 6 * SIZE(X)
|
|
+ MUL a2, ALPHA, t2
|
|
+ ST t3, 7 * SIZE(X)
|
|
+ MUL a3, ALPHA, t3
|
|
+
|
|
+ LD a4, 12 * SIZE(X)
|
|
+ LD a5, 13 * SIZE(X)
|
|
+ LD a6, 14 * SIZE(X)
|
|
+ LD a7, 15 * SIZE(X)
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L13
|
|
+ .align 4
|
|
+
|
|
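+/* unit-stride main loop (single precision): x[i] = alpha * x[i], 16 elements per pass */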
+$L12:
|
|
+ ST t0, 8 * SIZE(X)
|
|
+ MUL a4, ALPHA, t0
|
|
+ ST t1, 9 * SIZE(X)
|
|
+ MUL a5, ALPHA, t1
|
|
+
|
|
+ ST t2, 10 * SIZE(X)
|
|
+ MUL a6, ALPHA, t2
|
|
+ ST t3, 11 * SIZE(X)
|
|
+ MUL a7, ALPHA, t3
|
|
+
|
|
+ LD a0, 16 * SIZE(X)
|
|
+ LD a1, 17 * SIZE(X)
|
|
+ LD a2, 18 * SIZE(X)
|
|
+ LD a3, 19 * SIZE(X)
|
|
+
|
|
+ ST t0, 12 * SIZE(X)
|
|
+ MUL a0, ALPHA, t0
|
|
+ ST t1, 13 * SIZE(X)
|
|
+ MUL a1, ALPHA, t1
|
|
+
|
|
+ ST t2, 14 * SIZE(X)
|
|
+ MUL a2, ALPHA, t2
|
|
+ ST t3, 15 * SIZE(X)
|
|
+ MUL a3, ALPHA, t3
|
|
+
|
|
+ LD a4, 20 * SIZE(X)
|
|
+ LD a5, 21 * SIZE(X)
|
|
+ LD a6, 22 * SIZE(X)
|
|
+ LD a7, 23 * SIZE(X)
|
|
+
|
|
+ ST t0, 16 * SIZE(X)
|
|
+ MUL a4, ALPHA, t0
|
|
+ ST t1, 17 * SIZE(X)
|
|
+ MUL a5, ALPHA, t1
|
|
+
|
|
+ ST t2, 18 * SIZE(X)
|
|
+ MUL a6, ALPHA, t2
|
|
+ ST t3, 19 * SIZE(X)
|
|
+ MUL a7, ALPHA, t3
|
|
+
|
|
+ LD a0, 24 * SIZE(X)
|
|
+ LD a1, 25 * SIZE(X)
|
|
+ LD a2, 26 * SIZE(X)
|
|
+ LD a3, 27 * SIZE(X)
|
|
+
|
|
+ ST t0, 20 * SIZE(X)
|
|
+ MUL a0, ALPHA, t0
|
|
+ ST t1, 21 * SIZE(X)
|
|
+ MUL a1, ALPHA, t1
|
|
+
|
|
+ ST t2, 22 * SIZE(X)
|
|
+ MUL a2, ALPHA, t2
|
|
+ ST t3, 23 * SIZE(X)
|
|
+ MUL a3, ALPHA, t3
|
|
+
|
|
+ LD a4, 28 * SIZE(X)
|
|
+ LD a5, 29 * SIZE(X)
|
|
+ LD a6, 30 * SIZE(X)
|
|
+ LD a7, 31 * SIZE(X)
|
|
+
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ ldi I, -1(I)
|
|
+ addl X, 16 * SIZE, X
|
|
+ bne I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ ST t0, 8 * SIZE(X)
|
|
+ MUL a4, ALPHA, t0
|
|
+ ST t1, 9 * SIZE(X)
|
|
+ MUL a5, ALPHA, t1
|
|
+
|
|
+ ST t2, 10 * SIZE(X)
|
|
+ MUL a6, ALPHA, t2
|
|
+ ST t3, 11 * SIZE(X)
|
|
+ MUL a7, ALPHA, t3
|
|
+
|
|
+ ST t0, 12 * SIZE(X)
|
|
+ ST t1, 13 * SIZE(X)
|
|
+ ST t2, 14 * SIZE(X)
|
|
+ ST t3, 15 * SIZE(X)
|
|
+ addl X, 16 * SIZE, X
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and N, 15, I
|
|
+
|
|
+#else
|
|
+
|
|
+ sra N, 3, I
|
|
+ ble I, $L15
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ LD a1, 1 * SIZE(X)
|
|
+ LD a2, 2 * SIZE(X)
|
|
+ LD a3, 3 * SIZE(X)
|
|
+
|
|
+ LD a4, 4 * SIZE(X)
|
|
+ MUL a0, ALPHA, t0
|
|
+ LD a5, 5 * SIZE(X)
|
|
+ MUL a1, ALPHA, t1
|
|
+
|
|
+ LD a6, 6 * SIZE(X)
|
|
+ MUL a2, ALPHA, t2
|
|
+ LD a7, 7 * SIZE(X)
|
|
+ MUL a3, ALPHA, t3
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ ST t0, 0 * SIZE(X)
|
|
+ MUL a4, ALPHA, t0
|
|
+ ST t1, 1 * SIZE(X)
|
|
+ MUL a5, ALPHA, t1
|
|
+
|
|
+ ST t2, 2 * SIZE(X)
|
|
+ MUL a6, ALPHA, t2
|
|
+ ST t3, 3 * SIZE(X)
|
|
+ MUL a7, ALPHA, t3
|
|
+
|
|
+ LD a0, 8 * SIZE(X)
|
|
+ ldi I, -1(I)
|
|
+ LD a1, 9 * SIZE(X)
|
|
+ addl X, 8 * SIZE, X
|
|
+
|
|
+ LD a2, 2 * SIZE(X)
|
|
+ LD a3, 3 * SIZE(X)
|
|
+
|
|
+ ST t0, -4 * SIZE(X)
|
|
+ MUL a0, ALPHA, t0
|
|
+ ST t1, -3 * SIZE(X)
|
|
+ MUL a1, ALPHA, t1
|
|
+
|
|
+ ST t2, -2 * SIZE(X)
|
|
+ MUL a2, ALPHA, t2
|
|
+ ST t3, -1 * SIZE(X)
|
|
+ MUL a3, ALPHA, t3
|
|
+
|
|
+ LD a4, 4 * SIZE(X)
|
|
+ LD a5, 5 * SIZE(X)
|
|
+
|
|
+ LD a6, 6 * SIZE(X)
|
|
+ LD a7, 7 * SIZE(X)
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ bne I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ ST t0, 0 * SIZE(X)
|
|
+ MUL a4, ALPHA, t0
|
|
+ ST t1, 1 * SIZE(X)
|
|
+ MUL a5, ALPHA, t1
|
|
+
|
|
+ ST t2, 2 * SIZE(X)
|
|
+ MUL a6, ALPHA, t2
|
|
+ ST t3, 3 * SIZE(X)
|
|
+ MUL a7, ALPHA, t3
|
|
+
|
|
+ ST t0, 4 * SIZE(X)
|
|
+ ST t1, 5 * SIZE(X)
|
|
+ ST t2, 6 * SIZE(X)
|
|
+ ST t3, 7 * SIZE(X)
|
|
+ addl X, 8 * SIZE, X
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and N, 7, I
|
|
+
|
|
+#endif
|
|
+
|
|
+ unop
|
|
+ unop
|
|
+ ble I, $L999
|
|
+ .align 4
|
|
+
|
|
+$L17:
|
|
+ LD a0, 0 * SIZE(X)
|
|
+
|
|
+ MUL a0, ALPHA, t0
|
|
+
|
|
+ ST t0, 0 * SIZE(X)
|
|
+
|
|
+ addl X, SIZE, X
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bne I, $L17
|
|
+ ret
|
|
+ .align 4
|
|
+
|
|
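+/* non-unit stride: load through X, store the scaled value back through XX, both advanced by SXADDQ */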
+$L20:
|
|
+ sra N, 3, I
|
|
+ ble I, $L25
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD a1, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD a3, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ MUL a0, ALPHA, t0
|
|
+ ldi I, -1(I)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD a5, 0 * SIZE(X)
|
|
+ MUL a1, ALPHA, t1
|
|
+ SXADDQ INCX, X, X
|
|
+ unop
|
|
+
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ MUL a2, ALPHA, t2
|
|
+ SXADDQ INCX, X, X
|
|
+ unop
|
|
+
|
|
+ LD a7, 0 * SIZE(X)
|
|
+ MUL a3, ALPHA, t3
|
|
+ SXADDQ INCX, X, X
|
|
+ ble I, $L23
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ ST t0, 0 * SIZE(XX)
|
|
+ MUL a4, ALPHA, t0
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ SXADDQ INCX, XX, XX
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ ldi I, -1(I)
|
|
+ unop
|
|
+
|
|
+ ST t1, 0 * SIZE(XX)
|
|
+ MUL a5, ALPHA, t1
|
|
+ SXADDQ INCX, XX, XX
|
|
+ unop
|
|
+
|
|
+ LD a1, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ST t2, 0 * SIZE(XX)
|
|
+ MUL a6, ALPHA, t2
|
|
+ SXADDQ INCX, XX, XX
|
|
+ unop
|
|
+
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ST t3, 0 * SIZE(XX)
|
|
+ MUL a7, ALPHA, t3
|
|
+ SXADDQ INCX, XX, XX
|
|
+ unop
|
|
+
|
|
+ LD a3, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ST t0, 0 * SIZE(XX)
|
|
+ MUL a0, ALPHA, t0
|
|
+ SXADDQ INCX, XX, XX
|
|
+ unop
|
|
+
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ST t1, 0 * SIZE(XX)
|
|
+ MUL a1, ALPHA, t1
|
|
+ SXADDQ INCX, XX, XX
|
|
+ unop
|
|
+
|
|
+ LD a5, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ST t2, 0 * SIZE(XX)
|
|
+ MUL a2, ALPHA, t2
|
|
+ SXADDQ INCX, XX, XX
|
|
+ unop
|
|
+
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ST t3, 0 * SIZE(XX)
|
|
+ MUL a3, ALPHA, t3
|
|
+ SXADDQ INCX, XX, XX
|
|
+ unop
|
|
+
|
|
+ LD a7, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ unop
|
|
+ bne I, $L22
|
|
+ .align 4
|
|
+
|
|
+$L23:
|
|
+ ST t0, 0 * SIZE(XX)
|
|
+ MUL a4, ALPHA, t0
|
|
+ SXADDQ INCX, XX, XX
|
|
+
|
|
+ ST t1, 0 * SIZE(XX)
|
|
+ MUL a5, ALPHA, t1
|
|
+ SXADDQ INCX, XX, XX
|
|
+
|
|
+ ST t2, 0 * SIZE(XX)
|
|
+ MUL a6, ALPHA, t2
|
|
+ SXADDQ INCX, XX, XX
|
|
+
|
|
+ ST t3, 0 * SIZE(XX)
|
|
+ MUL a7, ALPHA, t3
|
|
+ SXADDQ INCX, XX, XX
|
|
+
|
|
+ ST t0, 0 * SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST t1, 0 * SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST t2, 0 * SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST t3, 0 * SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ and N, 7, I
|
|
+ unop
|
|
+ unop
|
|
+ ble I, $L999
|
|
+ .align 4
|
|
+
|
|
+$L27:
|
|
+ LD a0, 0 * SIZE(X)
|
|
+
|
|
+ MUL a0, ALPHA, t0
|
|
+
|
|
+ ST t0, 0 * SIZE(XX)
|
|
+
|
|
+ SXADDQ INCX, X, X
|
|
+ SXADDQ INCX, XX, XX
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bne I, $L27
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ ret
|
|
+ EPILOGUE
|
|
diff --git a/kernel/sw_64/scal_simd.S b/kernel/sw_64/scal_simd.S
|
|
new file mode 100644
|
|
index 0000000..7462e99
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/scal_simd.S
|
|
@@ -0,0 +1,344 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCHSIZE 144
|
|
+
|
|
+#define N $16
|
|
+#define X $20
|
|
+#define INCX $21
|
|
+
|
|
+#define XX $18
|
|
+#define I $19
|
|
+
|
|
+#define ALPHA $f19
|
|
+
|
|
+#define s0 $f0
|
|
+#define s1 $f1
|
|
+#define s2 $f10
|
|
+#define s3 $f11
|
|
+
|
|
+#define a0 $f12
|
|
+#define a1 $f13
|
|
+#define a2 $f14
|
|
+#define a3 $f15
|
|
+#define a4 $f16
|
|
+#define a5 $f17
|
|
+#define a6 $f18
|
|
+#define a7 $f21
|
|
+
|
|
+#define t0 $f22
|
|
+#define t1 $f23
|
|
+#define t2 $f24
|
|
+#define t3 $f25
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+ .frame $sp, 0, $26, 0
|
|
+
|
|
+ mov X, XX
|
|
+ ble N, $L999
|
|
+
|
|
+ cmpeq INCX, 1, $0
|
|
+ beq $0, $L20
|
|
+
|
|
+/**
|
|
+ test the address of X
|
|
+**/
|
|
+ and X, (VEC_LEN*SIZE-1), $4
|
|
+ beq $4, $Align_X_Access
|
|
+
|
|
+ .align 5
|
|
+/**
|
|
+ handle the unaligned leading elements of X
|
|
+**/
|
|
+ sra N, 4, I
|
|
+ ble I, $Remain /* if N is smaller than the unroll size, skip the alignment fix-up and jump straight to the remainder loop */
|
|
+
|
|
+ sra $4, BASE_SHIFT, $4
|
|
+ ldi $3, VEC_LEN
|
|
+ subl $3, $4, $4
|
|
+ subl N, $4, N
|
|
+
|
|
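+/* scale single elements until X reaches a VEC_LEN*SIZE boundary */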
+$UnAlign_X_Loop:
|
|
+ LD a0, 0*SIZE(X)
|
|
+ MUL a0, ALPHA, t0
|
|
+ ST t0, 0*SIZE(X)
|
|
+ addl X, SIZE, X
|
|
+
|
|
+
|
|
+
|
|
+ subl $4, 1, $4
|
|
+ bgt $4, $UnAlign_X_Loop
|
|
+ .align 5
|
|
+
|
|
+$Align_X_Access:
|
|
+
|
|
+/*
|
|
+ Unroll by 16
|
|
+*/
|
|
+ sra N, 4, I
|
|
+ vcpyf ALPHA, ALPHA
|
|
+ ble I, $Remain
|
|
+
|
|
+ VLD a0, 0*VEC_LEN*SIZE(X)
|
|
+ VLD a1, 1*VEC_LEN*SIZE(X)
|
|
+ VLD a2, 2*VEC_LEN*SIZE(X)
|
|
+ VLD a3, 3*VEC_LEN*SIZE(X)
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $MainLoop_End
|
|
+ .align 5
|
|
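+/* SIMD main loop: multiply four VEC_LEN-wide vectors (16 elements) by alpha per pass while the next four are loaded */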
+$MainLoop:
|
|
+ VMUL a0, ALPHA, t0
|
|
+ VLD a0, 4*VEC_LEN*SIZE(X)
|
|
+ VMUL a1, ALPHA, t1
|
|
+ VLD a1, 5*VEC_LEN*SIZE(X)
|
|
+
|
|
+ VMUL a2, ALPHA, t2
|
|
+ VLD a2, 6*VEC_LEN*SIZE(X)
|
|
+ VMUL a3, ALPHA, t3
|
|
+ VLD a3, 7*VEC_LEN*SIZE(X)
|
|
+
|
|
+ VST t0, 0*VEC_LEN*SIZE(X)
|
|
+ VST t1, 1*VEC_LEN*SIZE(X)
|
|
+ VST t2, 2*VEC_LEN*SIZE(X)
|
|
+ VST t3, 3*VEC_LEN*SIZE(X)
|
|
+
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ ldi I, -1(I)
|
|
+ addl X, 16 * SIZE, X
|
|
+ bne I, $MainLoop
|
|
+ .align 5
|
|
+
|
|
+$MainLoop_End:
|
|
+ VMUL a0, ALPHA, t0
|
|
+ VST t0, 0*VEC_LEN*SIZE(X)
|
|
+ VMUL a1, ALPHA, t1
|
|
+ VST t1, 1*VEC_LEN*SIZE(X)
|
|
+
|
|
+ VMUL a2, ALPHA, t2
|
|
+ VST t2, 2*VEC_LEN*SIZE(X)
|
|
+ VMUL a3, ALPHA, t3
|
|
+ VST t3, 3*VEC_LEN*SIZE(X)
|
|
+
|
|
+ addl X, 16 * SIZE, X
|
|
+ .align 5
|
|
+
|
|
+$Remain:
|
|
+ and N, 15, I
|
|
+ unop
|
|
+ unop
|
|
+ ble I, $L999
|
|
+ .align 5
|
|
+
|
|
+$L17:
|
|
+ LD a0, 0 * SIZE(X)
|
|
+
|
|
+ MUL a0, ALPHA, t0
|
|
+
|
|
+ ST t0, 0 * SIZE(X)
|
|
+
|
|
+ addl X, SIZE, X
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bne I, $L17
|
|
+ ret
|
|
+ .align 5
|
|
+
|
|
+$L20:
|
|
+ sra N, 3, I
|
|
+ ble I, $L25
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD a1, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD a3, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ MUL a0, ALPHA, t0
|
|
+ ldi I, -1(I)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD a5, 0 * SIZE(X)
|
|
+ MUL a1, ALPHA, t1
|
|
+ SXADDQ INCX, X, X
|
|
+ unop
|
|
+
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ MUL a2, ALPHA, t2
|
|
+ SXADDQ INCX, X, X
|
|
+ unop
|
|
+
|
|
+ LD a7, 0 * SIZE(X)
|
|
+ MUL a3, ALPHA, t3
|
|
+ SXADDQ INCX, X, X
|
|
+ ble I, $L23
|
|
+ .align 5
|
|
+
|
|
+$L22:
|
|
+ ST t0, 0 * SIZE(XX)
|
|
+ MUL a4, ALPHA, t0
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ SXADDQ INCX, XX, XX
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ ldi I, -1(I)
|
|
+ unop
|
|
+
|
|
+ ST t1, 0 * SIZE(XX)
|
|
+ MUL a5, ALPHA, t1
|
|
+ SXADDQ INCX, XX, XX
|
|
+ unop
|
|
+
|
|
+ LD a1, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ST t2, 0 * SIZE(XX)
|
|
+ MUL a6, ALPHA, t2
|
|
+ SXADDQ INCX, XX, XX
|
|
+ unop
|
|
+
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ST t3, 0 * SIZE(XX)
|
|
+ MUL a7, ALPHA, t3
|
|
+ SXADDQ INCX, XX, XX
|
|
+ unop
|
|
+
|
|
+ LD a3, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ST t0, 0 * SIZE(XX)
|
|
+ MUL a0, ALPHA, t0
|
|
+ SXADDQ INCX, XX, XX
|
|
+ unop
|
|
+
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ST t1, 0 * SIZE(XX)
|
|
+ MUL a1, ALPHA, t1
|
|
+ SXADDQ INCX, XX, XX
|
|
+ unop
|
|
+
|
|
+ LD a5, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ST t2, 0 * SIZE(XX)
|
|
+ MUL a2, ALPHA, t2
|
|
+ SXADDQ INCX, XX, XX
|
|
+ unop
|
|
+
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ST t3, 0 * SIZE(XX)
|
|
+ MUL a3, ALPHA, t3
|
|
+ SXADDQ INCX, XX, XX
|
|
+ unop
|
|
+
|
|
+ LD a7, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ unop
|
|
+ bne I, $L22
|
|
+ .align 5
|
|
+
|
|
+$L23:
|
|
+ ST t0, 0 * SIZE(XX)
|
|
+ MUL a4, ALPHA, t0
|
|
+ SXADDQ INCX, XX, XX
|
|
+
|
|
+ ST t1, 0 * SIZE(XX)
|
|
+ MUL a5, ALPHA, t1
|
|
+ SXADDQ INCX, XX, XX
|
|
+
|
|
+ ST t2, 0 * SIZE(XX)
|
|
+ MUL a6, ALPHA, t2
|
|
+ SXADDQ INCX, XX, XX
|
|
+
|
|
+ ST t3, 0 * SIZE(XX)
|
|
+ MUL a7, ALPHA, t3
|
|
+ SXADDQ INCX, XX, XX
|
|
+
|
|
+ ST t0, 0 * SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST t1, 0 * SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST t2, 0 * SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST t3, 0 * SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ .align 5
|
|
+
|
|
+$L25:
|
|
+ and N, 7, I
|
|
+ unop
|
|
+ unop
|
|
+ ble I, $L999
|
|
+ .align 5
|
|
+
|
|
+$L27:
|
|
+ LD a0, 0 * SIZE(X)
|
|
+
|
|
+ MUL a0, ALPHA, t0
|
|
+
|
|
+ ST t0, 0 * SIZE(XX)
|
|
+
|
|
+ SXADDQ INCX, X, X
|
|
+ SXADDQ INCX, XX, XX
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bne I, $L27
|
|
+ .align 5
|
|
+
|
|
+$L999:
|
|
+ ret
|
|
+ EPILOGUE
|
|
diff --git a/kernel/sw_64/snrm2.S b/kernel/sw_64/snrm2.S
|
|
new file mode 100644
|
|
index 0000000..ff1ec57
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/snrm2.S
|
|
@@ -0,0 +1,491 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCH_SIZE 80
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+#define XX $19
|
|
+
|
|
+#define I $0
|
|
+
|
|
+#define a0 $f0
|
|
+#define a1 $f1
|
|
+#define a2 $f10
|
|
+#define a3 $f11
|
|
+#define t0 $f12
|
|
+#define t1 $f13
|
|
+#define t2 $f14
|
|
+#define t3 $f15
|
|
+
|
|
+#define x0 $f16
|
|
+#define x1 $f17
|
|
+#define x2 $f18
|
|
+#define x3 $f19
|
|
+#define x4 $f20
|
|
+#define x5 $f21
|
|
+#define x6 $f22
|
|
+#define x7 $f23
|
|
+#define x8 $f24
|
|
+
|
|
+ PROLOGUE
|
|
+
|
|
+#if defined(EV4) || defined(EV5)
|
|
+ .frame $30,16,$26,0
|
|
+ .mask 0x4000000,-16
|
|
+ ldih $29, 0($27) !gpdisp!1
|
|
+ ldi $29, 0($29) !gpdisp!1
|
|
+
|
|
+ ldi $sp, -16($sp)
|
|
+ ldl $27, sqrt($29) !literal!2
|
|
+ stl $26, 0($sp)
|
|
+
|
|
+ PROFCODE
|
|
+ .prologue 1
|
|
+#else
|
|
+ PROFCODE
|
|
+#endif
|
|
+
|
|
+ fclr a0
|
|
+ SXADDQ INCX, 0, INCX
|
|
+ fclr a1
|
|
+ ble N, $L999
|
|
+
|
|
+ fclr a2
|
|
+ cmpeq INCX, SIZE, $0
|
|
+ fclr a3
|
|
+ beq $0, $L20
|
|
+
|
|
+ fclr t0
|
|
+ sra N, 4, I
|
|
+ fclr t1
|
|
+ ble I, $L15
|
|
+
|
|
+ fclr t2
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ fclr t3
|
|
+ LD x1, 1 * SIZE(X)
|
|
+
|
|
+ LD x2, 2 * SIZE(X)
|
|
+ LD x3, 3 * SIZE(X)
|
|
+ LD x4, 4 * SIZE(X)
|
|
+ LD x5, 5 * SIZE(X)
|
|
+ LD x6, 6 * SIZE(X)
|
|
+ LD x7, 7 * SIZE(X)
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L12
|
|
+ .align 4
|
|
+
|
|
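+/* main loop (unit stride): accumulate x[i]*x[i] into the four partial sums a0..a3, 16 elements per pass; each faddd result passes through x8 before being copied back to its accumulator */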
+$L11:
|
|
+ faddd a0, t0, x8
|
|
+ fmov x8,a0
|
|
+ fillcs (PREFETCH_SIZE) * SIZE(X)
|
|
+ fmuld x0, x0, t0
|
|
+ LD x0, 8 * SIZE(X)
|
|
+
|
|
+ faddd a1, t1,x8
|
|
+ fmov x8,a1
|
|
+ mov X, XX
|
|
+ fmuld x1, x1, t1
|
|
+ LD x1, 9 * SIZE(X)
|
|
+
|
|
+ faddd a2, t2,x8
|
|
+ fmov x8,a2
|
|
+ #unop
|
|
+ fmuld x2, x2, t2
|
|
+ LD x2, 10 * SIZE(X)
|
|
+
|
|
+ faddd a3, t3,x8
|
|
+ fmov x8,a3
|
|
+ #unop
|
|
+ fmuld x3, x3, t3
|
|
+ LD x3, 11 * SIZE(X)
|
|
+
|
|
+ faddd a0, t0, x8
|
|
+ fmov x8,a0
|
|
+ #unop
|
|
+ fmuld x4, x4, t0
|
|
+ LD x4, 12 * SIZE(X)
|
|
+
|
|
+ faddd a1, t1, x8
|
|
+ fmov x8,a1
|
|
+ #unop
|
|
+ fmuld x5, x5, t1
|
|
+ LD x5, 13 * SIZE(X)
|
|
+
|
|
+ faddd a2, t2, x8
|
|
+ fmov x8,a2
|
|
+ #unop
|
|
+ fmuld x6, x6, t2
|
|
+ LD x6, 14 * SIZE(X)
|
|
+
|
|
+ faddd a3, t3, x8
|
|
+ fmov x8,a3
|
|
+ #unop
|
|
+ fmuld x7, x7, t3
|
|
+ LD x7, 15 * SIZE(X)
|
|
+
|
|
+ faddd a0, t0, x8
|
|
+ fmov x8,a0
|
|
+ #unop
|
|
+ fmuld x0, x0, t0
|
|
+ LD x0, 16 * SIZE(X)
|
|
+
|
|
+ faddd a1, t1,x8
|
|
+ fmov x8,a1
|
|
+ ldi X, 16 * SIZE(X)
|
|
+ fmuld x1, x1, t1
|
|
+ LD x1, 17 * SIZE(XX)
|
|
+
|
|
+ faddd a2, t2, x8
|
|
+ fmov x8,a2
|
|
+ #unop
|
|
+ fmuld x2, x2, t2
|
|
+ LD x2, 18 * SIZE(XX)
|
|
+
|
|
+ faddd a3, t3,x8
|
|
+ fmov x8,a3
|
|
+ #unop
|
|
+ fmuld x3, x3, t3
|
|
+ LD x3, 19 * SIZE(XX)
|
|
+
|
|
+ faddd a0, t0, x8
|
|
+ fmov x8,a0
|
|
+ #unop
|
|
+ fmuld x4, x4, t0
|
|
+ LD x4, 20 * SIZE(XX)
|
|
+
|
|
+ faddd a1, t1,x8
|
|
+ fmov x8,a1
|
|
+ ldi I, -1(I)
|
|
+ fmuld x5, x5, t1
|
|
+ LD x5, 21 * SIZE(XX)
|
|
+
|
|
+ faddd a2, t2, x8
|
|
+ fmov x8,a2
|
|
+ #unop
|
|
+ fmuld x6, x6, t2
|
|
+ LD x6, 22 * SIZE(XX)
|
|
+
|
|
+ faddd a3, t3,x8
|
|
+ fmov x8,a3
|
|
+ fmuld x7, x7, t3
|
|
+ LD x7, 23 * SIZE(XX)
|
|
+ bgt I, $L11
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ faddd a0, t0,x8
|
|
+ fmov x8,a0
|
|
+ mov X, XX
|
|
+ fmuld x0, x0, t0
|
|
+ LD x0, 8 * SIZE(X)
|
|
+
|
|
+ faddd a1, t1,x8
|
|
+ fmov x8,a1
|
|
+ #unop
|
|
+ fmuld x1, x1, t1
|
|
+ LD x1, 9 * SIZE(X)
|
|
+
|
|
+ faddd a2, t2,x8
|
|
+ fmov x8,a2
|
|
+ #unop
|
|
+ fmuld x2, x2, t2
|
|
+ LD x2, 10 * SIZE(X)
|
|
+
|
|
+ faddd a3, t3, x8
|
|
+ fmov x8,a3
|
|
+ #unop
|
|
+ fmuld x3, x3, t3
|
|
+ LD x3, 11 * SIZE(X)
|
|
+
|
|
+ faddd a0, t0, x8
|
|
+ fmov x8,a0
|
|
+ #unop
|
|
+ fmuld x4, x4, t0
|
|
+ LD x4, 12 * SIZE(XX)
|
|
+
|
|
+ faddd a1, t1, x8
|
|
+ fmov x8,a1
|
|
+ #unop
|
|
+ fmuld x5, x5, t1
|
|
+ LD x5, 13 * SIZE(XX)
|
|
+
|
|
+ faddd a2, t2, x8
|
|
+ fmov x8,a2
|
|
+ #unop
|
|
+ fmuld x6, x6, t2
|
|
+ LD x6, 14 * SIZE(XX)
|
|
+
|
|
+ faddd a3, t3,x8
|
|
+ fmov x8,a3
|
|
+ ldi X, 16 * SIZE(X)
|
|
+ fmuld x7, x7, t3
|
|
+ LD x7, 15 * SIZE(XX)
|
|
+
|
|
+ faddd a0, t0,x8
|
|
+ fmov x8,a0
|
|
+ fmuld x0, x0, t0
|
|
+ faddd a1, t1, x8
|
|
+ fmov x8,a1
|
|
+ fmuld x1, x1, t1
|
|
+
|
|
+ faddd a2, t2, x8
|
|
+ fmov x8,a2
|
|
+ fmuld x2, x2, t2
|
|
+ faddd a3, t3, x8
|
|
+ fmov x8,a3
|
|
+ fmuld x3, x3, t3
|
|
+
|
|
+ faddd a0, t0, x8
|
|
+ fmov x8,a0
|
|
+ fmuld x4, x4, t0
|
|
+ faddd a1, t1, x8
|
|
+ fmov x8,a1
|
|
+ fmuld x5, x5, t1
|
|
+
|
|
+ faddd a2, t2, x8
|
|
+ fmov x8,a2
|
|
+ fmuld x6, x6, t2
|
|
+ faddd a3, t3, x8
|
|
+ fmov x8,a3
|
|
+ fmuld x7, x7, t3
|
|
+
|
|
+ faddd a1, t1, x8
|
|
+ fmov x8,a1
|
|
+ faddd a2, t2, x8
|
|
+ fmov x8,a2
|
|
+ faddd a3, t3, x8
|
|
+ fmov x8,a3
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and N, 15, I
|
|
+ ble I, $L998
|
|
+ .align 4
|
|
+
|
|
+$L16:
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ ldi X, 1 * SIZE(X)
|
|
+
|
|
+ faddd a0, t0,x8
|
|
+ fmov x8,a0
|
|
+ fmuld x0, x0, t0
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L16
|
|
+ bsr $31, $L998
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ fclr t0
|
|
+ sra N, 3, I
|
|
+ fclr t1
|
|
+ ble I, $L25
|
|
+
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD x1, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD x2, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD x3, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD x4, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD x5, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD x6, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L22
|
|
+ .align 4
|
|
+
|
|
+$L21:
|
|
+ faddd a0, t0, x8
|
|
+ fmov x8,a0
|
|
+ LD x7, 0 * SIZE(X)
|
|
+ fmuld x0, x0, t0
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a1, t1,x8
|
|
+ fmov x8,a1
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ fmuld x1, x1, t1
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a2, t2,x8
|
|
+ fmov x8,a2
|
|
+ LD x1, 0 * SIZE(X)
|
|
+ fmuld x2, x2, t2
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a3, t3,x8
|
|
+ fmov x8,a3
|
|
+ LD x2, 0 * SIZE(X)
|
|
+ fmuld x3, x3, t3
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a0, t0,x8
|
|
+ fmov x8,a0
|
|
+ LD x3, 0 * SIZE(X)
|
|
+ fmuld x4, x4, t0
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a1, t1,x8
|
|
+ fmov x8,a1
|
|
+ LD x4, 0 * SIZE(X)
|
|
+ fmuld x5, x5, t1
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a2, t2,x8
|
|
+ fmov x8,a2
|
|
+ LD x5, 0 * SIZE(X)
|
|
+ fmuld x6, x6, t2
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a3, t3, x8
|
|
+ fmov x8,a3
|
|
+ LD x6, 0 * SIZE(X)
|
|
+ fmuld x7, x7, t3
|
|
+ addl X, INCX, X
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L21
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ faddd a0, t0,x8
|
|
+ fmov x8,a0
|
|
+ LD x7, 0 * SIZE(X)
|
|
+ fmuld x0, x0, t0
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a1, t1, x8
|
|
+ fmov x8,a1
|
|
+ unop
|
|
+ fmuld x1, x1, t1
|
|
+ unop
|
|
+
|
|
+ faddd a2, t2,x8
|
|
+ fmov x8,a2
|
|
+ fmuld x2, x2, t2
|
|
+ faddd a3, t3, x8
|
|
+ fmov x8,a3
|
|
+ fmuld x3, x3, t3
|
|
+
|
|
+ faddd a0, t0, x8
|
|
+ fmov x8,a0
|
|
+ fmuld x4, x4, t0
|
|
+ faddd a1, t1, x8
|
|
+ fmov x8,a1
|
|
+ fmuld x5, x5, t1
|
|
+
|
|
+ faddd a2, t2, x8
|
|
+ fmov x8,a2
|
|
+ fmuld x6, x6, t2
|
|
+ faddd a3, t3, x8
|
|
+ fmov x8,a3
|
|
+ fmuld x7, x7, t3
|
|
+
|
|
+ faddd a1, t1, x8
|
|
+ fmov x8,a1
|
|
+ faddd a2, t2, x8
|
|
+ fmov x8,a2
|
|
+ faddd a3, t3, x8
|
|
+ fmov x8,a3
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ and N, 7, I
|
|
+ ble I, $L998
|
|
+ .align 4
|
|
+
|
|
+$L26:
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a0, t0,x8
|
|
+ fmov x8,a0
|
|
+ fmuld x0, x0, t0
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L26
|
|
+ .align 4
|
|
+
|
|
+
|
|
+$L998:
|
|
+ faddd a0, t0,x8
|
|
+ fmov x8,a0
|
|
+
|
|
+ faddd a0, a1, x8
|
|
+	fmov x8,a0
|
|
+ faddd a2, a3, x8
|
|
+ fmov x8,a2
|
|
+
|
|
+#if defined(EV4) || defined(EV5)
|
|
+ faddd a0, a2, $f16
|
|
+ jsr $26, ($27), sqrt !lituse_jsr!2
|
|
+
|
|
+ ldih $29, 0($26) !gpdisp!3
|
|
+ ldi $29, 0($29) !gpdisp!3
|
|
+#else
|
|
+ faddd a0, a2,x8
|
|
+ fsqrtd x8, a0
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+#if defined(EV4) || defined(EV5)
|
|
+ ldl $26, 0($sp)
|
|
+ ldi $sp, 16($sp)
|
|
+#endif
|
|
+ ret
|
|
+ EPILOGUE
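For reference (not part of the patch): the snrm2 kernel above accumulates an unscaled sum of squares in four independent partial sums (a0..a3, fed by the t0..t3 products), then reduces them and takes a square root after $L998. A minimal C sketch of the same reduction, assuming a positive increment and double-precision arithmetic and ignoring the unrolling and prefetching details, is:

#include <math.h>
#include <stddef.h>

/* Sketch of the reduction in snrm2.S: unscaled sum of squares with four
 * partial accumulators, then sqrt.  The real BLAS nrm2 may rescale to
 * avoid overflow; this sketch does not. */
static double nrm2_sketch(size_t n, const double *x, ptrdiff_t incx)
{
    double s0 = 0.0, s1 = 0.0, s2 = 0.0, s3 = 0.0;
    size_t i = 0;
    if (incx == 1) {
        for (; i + 4 <= n; i += 4) {      /* plays the role of a0..a3 */
            s0 += x[i + 0] * x[i + 0];
            s1 += x[i + 1] * x[i + 1];
            s2 += x[i + 2] * x[i + 2];
            s3 += x[i + 3] * x[i + 3];
        }
    }
    for (; i < n; i++)                    /* remainder and strided case */
        s0 += x[i * incx] * x[i * incx];
    return sqrt((s0 + s1) + (s2 + s3));
}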
diff --git a/kernel/sw_64/snrm2.S.bak b/kernel/sw_64/snrm2.S.bak
|
|
new file mode 100644
|
|
index 0000000..753c90b
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/snrm2.S.bak
|
|
@@ -0,0 +1,431 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCH_SIZE 80
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+#define XX $19
|
|
+
|
|
+#define I $0
|
|
+
|
|
+#define a0 $f0
|
|
+#define a1 $f1
|
|
+#define a2 $f10
|
|
+#define a3 $f11
|
|
+#define t0 $f12
|
|
+#define t1 $f13
|
|
+#define t2 $f14
|
|
+#define t3 $f15
|
|
+
|
|
+#define x0 $f16
|
|
+#define x1 $f17
|
|
+#define x2 $f18
|
|
+#define x3 $f19
|
|
+#define x4 $f20
|
|
+#define x5 $f21
|
|
+#define x6 $f22
|
|
+#define x7 $f23
|
|
+
|
|
+ PROLOGUE
|
|
+
|
|
+#if defined(EV4) || defined(EV5)
|
|
+ .frame $30,16,$26,0
|
|
+ .mask 0x4000000,-16
|
|
+ ldih $29, 0($27) !gpdisp!1
|
|
+ ldi $29, 0($29) !gpdisp!1
|
|
+
|
|
+ ldi $sp, -16($sp)
|
|
+ ldl $27, sqrt($29) !literal!2
|
|
+ stq $26, 0($sp)
|
|
+
|
|
+ PROFCODE
|
|
+ .prologue 1
|
|
+#else
|
|
+ PROFCODE
|
|
+#endif
|
|
+
|
|
+ fclr a0
|
|
+ SXADDQ INCX, 0, INCX
|
|
+ fclr a1
|
|
+ ble N, $L999
|
|
+
|
|
+ fclr a2
|
|
+ cmpeq INCX, SIZE, $0
|
|
+ fclr a3
|
|
+ beq $0, $L20
|
|
+
|
|
+ fclr t0
|
|
+ sra N, 4, I
|
|
+ fclr t1
|
|
+ ble I, $L15
|
|
+
|
|
+ fclr t2
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ fclr t3
|
|
+ LD x1, 1 * SIZE(X)
|
|
+
|
|
+ LD x2, 2 * SIZE(X)
|
|
+ LD x3, 3 * SIZE(X)
|
|
+ LD x4, 4 * SIZE(X)
|
|
+ LD x5, 5 * SIZE(X)
|
|
+ LD x6, 6 * SIZE(X)
|
|
+ LD x7, 7 * SIZE(X)
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L11:
|
|
+ faddd a0, t0, a0
|
|
+ fillcs (PREFETCH_SIZE) * SIZE(X)
|
|
+ fmuld x0, x0, t0
|
|
+ LD x0, 8 * SIZE(X)
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ mov X, XX
|
|
+ fmuld x1, x1, t1
|
|
+ LD x1, 9 * SIZE(X)
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ unop
|
|
+ fmuld x2, x2, t2
|
|
+ LD x2, 10 * SIZE(X)
|
|
+
|
|
+ faddd a3, t3, a3
|
|
+ unop
|
|
+ fmuld x3, x3, t3
|
|
+ LD x3, 11 * SIZE(X)
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ unop
|
|
+ fmuld x4, x4, t0
|
|
+ LD x4, 12 * SIZE(X)
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ unop
|
|
+ fmuld x5, x5, t1
|
|
+ LD x5, 13 * SIZE(X)
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ unop
|
|
+ fmuld x6, x6, t2
|
|
+ LD x6, 14 * SIZE(X)
|
|
+
|
|
+ faddd a3, t3, a3
|
|
+ unop
|
|
+ fmuld x7, x7, t3
|
|
+ LD x7, 15 * SIZE(X)
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ unop
|
|
+ fmuld x0, x0, t0
|
|
+ LD x0, 16 * SIZE(X)
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ ldi X, 16 * SIZE(X)
|
|
+ fmuld x1, x1, t1
|
|
+ LD x1, 17 * SIZE(XX)
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ unop
|
|
+ fmuld x2, x2, t2
|
|
+ LD x2, 18 * SIZE(XX)
|
|
+
|
|
+ faddd a3, t3, a3
|
|
+ unop
|
|
+ fmuld x3, x3, t3
|
|
+ LD x3, 19 * SIZE(XX)
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ unop
|
|
+ fmuld x4, x4, t0
|
|
+ LD x4, 20 * SIZE(XX)
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ ldi I, -1(I)
|
|
+ fmuld x5, x5, t1
|
|
+ LD x5, 21 * SIZE(XX)
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ unop
|
|
+ fmuld x6, x6, t2
|
|
+ LD x6, 22 * SIZE(XX)
|
|
+
|
|
+ faddd a3, t3, a3
|
|
+ fmuld x7, x7, t3
|
|
+ LD x7, 23 * SIZE(XX)
|
|
+ bgt I, $L11
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ faddd a0, t0, a0
|
|
+ mov X, XX
|
|
+ fmuld x0, x0, t0
|
|
+ LD x0, 8 * SIZE(X)
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ unop
|
|
+ fmuld x1, x1, t1
|
|
+ LD x1, 9 * SIZE(X)
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ unop
|
|
+ fmuld x2, x2, t2
|
|
+ LD x2, 10 * SIZE(X)
|
|
+
|
|
+ faddd a3, t3, a3
|
|
+ unop
|
|
+ fmuld x3, x3, t3
|
|
+ LD x3, 11 * SIZE(X)
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ unop
|
|
+ fmuld x4, x4, t0
|
|
+ LD x4, 12 * SIZE(XX)
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ unop
|
|
+ fmuld x5, x5, t1
|
|
+ LD x5, 13 * SIZE(XX)
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ unop
|
|
+ fmuld x6, x6, t2
|
|
+ LD x6, 14 * SIZE(XX)
|
|
+
|
|
+ faddd a3, t3, a3
|
|
+ ldi X, 16 * SIZE(X)
|
|
+ fmuld x7, x7, t3
|
|
+ LD x7, 15 * SIZE(XX)
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ fmuld x0, x0, t0
|
|
+ faddd a1, t1, a1
|
|
+ fmuld x1, x1, t1
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ fmuld x2, x2, t2
|
|
+ faddd a3, t3, a3
|
|
+ fmuld x3, x3, t3
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ fmuld x4, x4, t0
|
|
+ faddd a1, t1, a1
|
|
+ fmuld x5, x5, t1
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ fmuld x6, x6, t2
|
|
+ faddd a3, t3, a3
|
|
+ fmuld x7, x7, t3
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ faddd a2, t2, a2
|
|
+ faddd a3, t3, a3
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and N, 15, I
|
|
+ ble I, $L998
|
|
+ .align 4
|
|
+
|
|
+$L16:
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ ldi X, 1 * SIZE(X)
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ fmuld x0, x0, t0
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L16
|
|
+ bsr $31, $L998
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ fclr t0
|
|
+ sra N, 3, I
|
|
+ fclr t1
|
|
+ ble I, $L25
|
|
+
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD x1, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD x2, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD x3, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD x4, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD x5, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD x6, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L22
|
|
+ .align 4
|
|
+
|
|
+$L21:
|
|
+ faddd a0, t0, a0
|
|
+ LD x7, 0 * SIZE(X)
|
|
+ fmuld x0, x0, t0
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ fmuld x1, x1, t1
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ LD x1, 0 * SIZE(X)
|
|
+ fmuld x2, x2, t2
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a3, t3, a3
|
|
+ LD x2, 0 * SIZE(X)
|
|
+ fmuld x3, x3, t3
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ LD x3, 0 * SIZE(X)
|
|
+ fmuld x4, x4, t0
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ LD x4, 0 * SIZE(X)
|
|
+ fmuld x5, x5, t1
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ LD x5, 0 * SIZE(X)
|
|
+ fmuld x6, x6, t2
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a3, t3, a3
|
|
+ LD x6, 0 * SIZE(X)
|
|
+ fmuld x7, x7, t3
|
|
+ addl X, INCX, X
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L21
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ faddd a0, t0, a0
|
|
+ LD x7, 0 * SIZE(X)
|
|
+ fmuld x0, x0, t0
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ unop
|
|
+ fmuld x1, x1, t1
|
|
+ unop
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ fmuld x2, x2, t2
|
|
+ faddd a3, t3, a3
|
|
+ fmuld x3, x3, t3
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ fmuld x4, x4, t0
|
|
+ faddd a1, t1, a1
|
|
+ fmuld x5, x5, t1
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ fmuld x6, x6, t2
|
|
+ faddd a3, t3, a3
|
|
+ fmuld x7, x7, t3
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ faddd a2, t2, a2
|
|
+ faddd a3, t3, a3
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ and N, 7, I
|
|
+ ble I, $L998
|
|
+ .align 4
|
|
+
|
|
+$L26:
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ fmuld x0, x0, t0
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L26
|
|
+ .align 4
|
|
+
|
|
+
|
|
+$L998:
|
|
+ faddd a0, t0, a0
|
|
+
|
|
+ faddd a0, a1, a0
|
|
+ faddd a2, a3, a2
|
|
+
|
|
+#if defined(EV4) || defined(EV5)
|
|
+ faddd a0, a2, $f16
|
|
+ jsr $26, ($27), sqrt !lituse_jsr!2
|
|
+
|
|
+ ldih $29, 0($26) !gpdisp!3
|
|
+ ldi $29, 0($29) !gpdisp!3
|
|
+#else
|
|
+ faddd a0, a2, a0
|
|
+ fsqrtd a0, a0
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+#if defined(EV4) || defined(EV5)
|
|
+ ldl $26, 0($sp)
|
|
+ ldi $sp, 16($sp)
|
|
+#endif
|
|
+ ret
|
|
+ EPILOGUE
|
|
diff --git a/kernel/sw_64/staticbuffer.S b/kernel/sw_64/staticbuffer.S
|
|
new file mode 100644
|
|
index 0000000..7bbd23d
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/staticbuffer.S
|
|
@@ -0,0 +1,45 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+
|
|
+#ifdef ALLOC_STATIC
|
|
+ .align 8
|
|
+ .comm alloc_area, (NUM_BUFFERS * BUFFER_SIZE), 16384
|
|
+#endif
|
|
diff --git a/kernel/sw_64/sum.S b/kernel/sw_64/sum.S
|
|
new file mode 100644
|
|
index 0000000..0be6d53
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/sum.S
|
|
@@ -0,0 +1,230 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCHSIZE 88
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+#define I $19
|
|
+
|
|
+#define s0 $f0
|
|
+#define s1 $f1
|
|
+#define s2 $f10
|
|
+#define s3 $f11
|
|
+
|
|
+#define a0 $f12
|
|
+#define a1 $f13
|
|
+#define a2 $f14
|
|
+#define a3 $f15
|
|
+#define a4 $f16
|
|
+#define a5 $f17
|
|
+#define a6 $f18
|
|
+#define a7 $f19
|
|
+
|
|
+#define t0 $f20
|
|
+#define t1 $f21
|
|
+#define t2 $f22
|
|
+#define t3 $f23
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+
|
|
+ fclr s0
|
|
+ unop
|
|
+ fclr t0
|
|
+ ble N, $L999
|
|
+
|
|
+ sra N, 3, I
|
|
+ fclr s1
|
|
+ fclr s2
|
|
+ ble I, $L15
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ fclr t1
|
|
+ SXADDQ INCX, X, X
|
|
+ fclr t2
|
|
+
|
|
+ LD a1, 0 * SIZE(X)
|
|
+ fclr t3
|
|
+ SXADDQ INCX, X, X
|
|
+ fclr s3
|
|
+
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD a3, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD a5, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ ADD s0, t0, $f24
|
|
+ fmov $f24,s0
|
|
+ ldw $31, PREFETCHSIZE * 2 * SIZE(X)
|
|
+ fmov a0, t0
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ADD s1, t1, $f24
|
|
+ fmov $f24,s1
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ fmov a1, t1
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s2, t2, $f24
|
|
+ fmov $f24,s2
|
|
+ LD a7, 0 * SIZE(X)
|
|
+ fmov a2, t2
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s3, t3, $f24
|
|
+ fmov $f24,s3
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ fmov a3, t3
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s0, t0, $f24
|
|
+ fmov $f24,s0
|
|
+ LD a1, 0 * SIZE(X)
|
|
+ fmov a4, t0
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s1, t1, $f24
|
|
+ fmov $f24,s1
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ fmov a5, t1
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s2, t2, $f24
|
|
+ fmov $f24,s2
|
|
+ LD a3, 0 * SIZE(X)
|
|
+ fmov a6, t2
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s3, t3, $f24
|
|
+ fmov $f24,s3
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ fmov a7, t3
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD a5, 0 * SIZE(X)
|
|
+ unop
|
|
+ SXADDQ INCX, X, X
|
|
+ bne I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ ADD s0, t0, $f24
|
|
+ fmov $f24,s0
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ fmov a0, t0
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s1, t1, $f24
|
|
+ fmov $f24,s1
|
|
+ LD a7, 0 * SIZE(X)
|
|
+ fmov a1, t1
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s2, t2, $f24
|
|
+ fmov $f24,s2
|
|
+ fmov a2, t2
|
|
+ ADD s3, t3, $f24
|
|
+ fmov $f24,s3
|
|
+ fmov a3, t3
|
|
+
|
|
+ ADD s0, t0, $f24
|
|
+ fmov $f24,s0
|
|
+ fmov a4, t0
|
|
+ ADD s1, t1, $f24
|
|
+ fmov $f24,s1
|
|
+ fmov a5, t1
|
|
+ ADD s2, t2, $f24
|
|
+ fmov $f24,s2
|
|
+ fmov a6, t2
|
|
+ ADD s3, t3, $f24
|
|
+ fmov $f24,s3
|
|
+ fmov a7, t3
|
|
+
|
|
+ ADD s1, t1, $f24
|
|
+ fmov $f24,s1
|
|
+ ADD s2, t2, $f24
|
|
+ fmov $f24,s2
|
|
+ ADD s3, t3, $f24
|
|
+ fmov $f24,s3
|
|
+
|
|
+ ADD s0, s1, $f24
|
|
+ fmov $f24,s0
|
|
+ ADD s2, s3, $f24
|
|
+ fmov $f24,s2
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and N, 7, I
|
|
+ ADD s0, s2, $f24
|
|
+ fmov $f24,s0
|
|
+ unop
|
|
+ ble I, $L999
|
|
+ .align 4
|
|
+
|
|
+$L17:
|
|
+ ADD s0, t0, $f24
|
|
+ fmov $f24,s0
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ fmov a0, t0
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bne I, $L17
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ ADD s0, t0, $f24
|
|
+ fmov $f24,s0
|
|
+ ret
|
|
+ EPILOGUE
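For reference (not part of the patch): sum.S computes the plain sum (no absolute values), unrolled by 8 with four partial sums s0..s3 rotated through the t0..t3 pipeline registers and combined pairwise at the end. A C sketch of the same scheme, assuming a positive increment given in elements, is:

#include <stddef.h>

/* Sketch of sum.S: four partial sums, unrolled by 8, then the pairwise
 * final combine as in $L15/$L999. */
static double sum_sketch(size_t n, const double *x, ptrdiff_t incx)
{
    double s0 = 0.0, s1 = 0.0, s2 = 0.0, s3 = 0.0;
    size_t i = 0;
    for (; i + 8 <= n; i += 8) {
        s0 += x[(i + 0) * incx] + x[(i + 4) * incx];
        s1 += x[(i + 1) * incx] + x[(i + 5) * incx];
        s2 += x[(i + 2) * incx] + x[(i + 6) * incx];
        s3 += x[(i + 3) * incx] + x[(i + 7) * incx];
    }
    for (; i < n; i++)
        s0 += x[i * incx];
    return (s0 + s1) + (s2 + s3);
}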
diff --git a/kernel/sw_64/sw_fpcr.S b/kernel/sw_64/sw_fpcr.S
|
|
new file mode 100644
|
|
index 0000000..5dee238
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/sw_fpcr.S
|
|
@@ -0,0 +1,39 @@
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+ .arch sw2b
|
|
+ .set noat
|
|
+ .set noreorder
|
|
+.text
|
|
+ .align 5
|
|
+ .globl read_fpcr
|
|
+ .ent read_fpcr
|
|
+read_fpcr:
|
|
+ .frame $sp, 0, $26, 0
|
|
+ RFPCR $f10
|
|
+ fstd $f10, 0($16)
|
|
+ ret
|
|
+ .end read_fpcr
|
|
+
|
|
+ .globl write_fpcr
|
|
+ .ent write_fpcr
|
|
+write_fpcr:
|
|
+ .frame $sp, 0, $26, 0
|
|
+ fldd $f10, 0($16)
|
|
+ WFPCR $f10
|
|
+ ret
|
|
+ .end write_fpcr
|
|
+/**
|
|
+ .globl fadd_test
|
|
+ .ent fadd_test
|
|
+
|
|
+fadd_test:
|
|
+ .frame $sp, 0, $26, 0
|
|
+ faddd $f16, $f17, $f16
|
|
+ fmov $f16, $f0
|
|
+ ret
|
|
+ .end fadd_test
|
|
+**/
|
|
+ .ident VERSION
|
|
+
|
|
diff --git a/kernel/sw_64/sw_fpcr_inline.c b/kernel/sw_64/sw_fpcr_inline.c
|
|
new file mode 100644
|
|
index 0000000..1943e3e
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/sw_fpcr_inline.c
|
|
@@ -0,0 +1,13 @@
|
|
+#include "common.h"
|
|
+
|
|
+void read_fpcr(long * test){
|
|
+
|
|
+ __asm__("rfpcr $f10 \n fstd $f10, %0":"=m"(*test):);
|
|
+ return;
|
|
+}
|
|
+
|
|
+void write_fpcr(long * test){
|
|
+
|
|
+ __asm__("fldd $f10, %0\nwfpcr $f10"::"m"(*test));
|
|
+ return;
|
|
+}
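A hypothetical caller of the two helpers above (not part of the patch). The bit layout of the FPCR is machine specific, so the example only reads the raw value and writes the same value back:

#include <stdio.h>

extern void read_fpcr(long *);
extern void write_fpcr(long *);

int main(void)
{
    long fpcr;
    read_fpcr(&fpcr);                       /* capture the current FPCR contents */
    printf("FPCR = 0x%lx\n", (unsigned long)fpcr);
    write_fpcr(&fpcr);                      /* restore the value that was read */
    return 0;
}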
diff --git a/kernel/sw_64/swap.S b/kernel/sw_64/swap.S
|
|
new file mode 100644
|
|
index 0000000..5c8b679
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/swap.S
|
|
@@ -0,0 +1,249 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+ .frame $sp, 0, $26, 0
|
|
+
|
|
+ mov $20, $17
|
|
+ mov $21, $18
|
|
+ ldl $19, 0($sp)
|
|
+ ldl $20, 8($sp)
|
|
+#ifndef PROFILE
|
|
+ .prologue 0
|
|
+#else
|
|
+ .prologue 1
|
|
+#endif
|
|
+
|
|
+ subl $18, 1, $1
|
|
+ subl $20, 1, $2
|
|
+	ble	$16, $SubEnd	# if n <= 0 goto $SubEnd
|
|
+ or $1, $2, $1
|
|
+
|
|
+ sra $16, 3, $21
|
|
+
|
|
+ and $16, 7, $22
|
|
+ bne $1, $Sub
|
|
+ ble $21, $MainRemain
|
|
+ .align 4
|
|
+
|
|
+$MainLoop:
|
|
+ LD $f10, 0*SIZE($19)
|
|
+ LD $f11, 1*SIZE($19)
|
|
+ LD $f12, 2*SIZE($19)
|
|
+ LD $f13, 3*SIZE($19)
|
|
+ LD $f14, 4*SIZE($19)
|
|
+ LD $f15, 5*SIZE($19)
|
|
+ LD $f16, 6*SIZE($19)
|
|
+ LD $f17, 7*SIZE($19)
|
|
+
|
|
+ LD $f20, 0*SIZE($17)
|
|
+ LD $f21, 1*SIZE($17)
|
|
+ LD $f22, 2*SIZE($17)
|
|
+ LD $f23, 3*SIZE($17)
|
|
+ LD $f24, 4*SIZE($17)
|
|
+ LD $f25, 5*SIZE($17)
|
|
+ LD $f26, 6*SIZE($17)
|
|
+ LD $f27, 7*SIZE($17)
|
|
+
|
|
+ fillcs 32*SIZE($17)
|
|
+ unop
|
|
+ fillcs 32*SIZE($19)
|
|
+ subl $21, 1, $21
|
|
+
|
|
+ ST $f10, 0*SIZE($17)
|
|
+ ST $f11, 1*SIZE($17)
|
|
+ ST $f12, 2*SIZE($17)
|
|
+ ST $f13, 3*SIZE($17)
|
|
+ ST $f14, 4*SIZE($17)
|
|
+ ST $f15, 5*SIZE($17)
|
|
+ ST $f16, 6*SIZE($17)
|
|
+ ST $f17, 7*SIZE($17)
|
|
+
|
|
+ ST $f20, 0*SIZE($19)
|
|
+ ST $f21, 1*SIZE($19)
|
|
+ ST $f22, 2*SIZE($19)
|
|
+ ST $f23, 3*SIZE($19)
|
|
+ ST $f24, 4*SIZE($19)
|
|
+ ST $f25, 5*SIZE($19)
|
|
+ ST $f26, 6*SIZE($19)
|
|
+ ST $f27, 7*SIZE($19)
|
|
+
|
|
+ ldi $17, 8*SIZE($17)
|
|
+ ldi $19, 8*SIZE($19)
|
|
+ bgt $21, $MainLoop
|
|
+ .align 4
|
|
+
|
|
+$MainRemain:
|
|
+ ble $22, $MainEnd
|
|
+ .align 4
|
|
+
|
|
+$MainRemainLoop:
|
|
+ LD $f10, 0*SIZE($19)
|
|
+ LD $f20, 0*SIZE($17)
|
|
+ ldi $17, 1*SIZE($17)
|
|
+ ldi $19, 1*SIZE($19)
|
|
+ subl $22, 1, $22
|
|
+ ST $f10, -1*SIZE($17)
|
|
+ ST $f20, -1*SIZE($19)
|
|
+ bgt $22, $MainRemainLoop
|
|
+ .align 4
|
|
+
|
|
+$MainEnd:
|
|
+ clr $0
|
|
+ ret
|
|
+ .align 4
|
|
+
|
|
+$Sub:
|
|
+ mov $17, $23
|
|
+ mov $19, $24
|
|
+
|
|
+ ble $21, $SubRemain
|
|
+ .align 4
|
|
+
|
|
+$SubLoop:
|
|
+ LD $f10, 0*SIZE($19)
|
|
+ SXADDQ $20, $19, $19
|
|
+ LD $f11, 0*SIZE($19)
|
|
+ SXADDQ $20, $19, $19
|
|
+
|
|
+ LD $f12, 0*SIZE($19)
|
|
+ SXADDQ $20, $19, $19
|
|
+ LD $f13, 0*SIZE($19)
|
|
+ SXADDQ $20, $19, $19
|
|
+
|
|
+ LD $f14, 0*SIZE($19)
|
|
+ SXADDQ $20, $19, $19
|
|
+ LD $f15, 0*SIZE($19)
|
|
+ SXADDQ $20, $19, $19
|
|
+
|
|
+ LD $f16, 0*SIZE($19)
|
|
+ SXADDQ $20, $19, $19
|
|
+ LD $f17, 0*SIZE($19)
|
|
+ SXADDQ $20, $19, $19
|
|
+
|
|
+ LD $f20, 0*SIZE($17)
|
|
+ SXADDQ $18, $17, $17
|
|
+ LD $f21, 0*SIZE($17)
|
|
+ SXADDQ $18, $17, $17
|
|
+
|
|
+ LD $f22, 0*SIZE($17)
|
|
+ SXADDQ $18, $17, $17
|
|
+ LD $f23, 0*SIZE($17)
|
|
+ SXADDQ $18, $17, $17
|
|
+
|
|
+ LD $f24, 0*SIZE($17)
|
|
+ SXADDQ $18, $17, $17
|
|
+ LD $f25, 0*SIZE($17)
|
|
+ SXADDQ $18, $17, $17
|
|
+
|
|
+ LD $f26, 0*SIZE($17)
|
|
+ SXADDQ $18, $17, $17
|
|
+ LD $f27, 0*SIZE($17)
|
|
+ SXADDQ $18, $17, $17
|
|
+
|
|
+ ST $f10, 0*SIZE($23)
|
|
+ SXADDQ $18, $23, $23
|
|
+ ST $f11, 0*SIZE($23)
|
|
+ SXADDQ $18, $23, $23
|
|
+
|
|
+ ST $f12, 0*SIZE($23)
|
|
+ SXADDQ $18, $23, $23
|
|
+ ST $f13, 0*SIZE($23)
|
|
+ SXADDQ $18, $23, $23
|
|
+
|
|
+ ST $f14, 0*SIZE($23)
|
|
+ SXADDQ $18, $23, $23
|
|
+ ST $f15, 0*SIZE($23)
|
|
+ SXADDQ $18, $23, $23
|
|
+
|
|
+ ST $f16, 0*SIZE($23)
|
|
+ SXADDQ $18, $23, $23
|
|
+ ST $f17, 0*SIZE($23)
|
|
+ SXADDQ $18, $23, $23
|
|
+
|
|
+ ST $f20, 0*SIZE($24)
|
|
+ SXADDQ $20, $24, $24
|
|
+ ST $f21, 0*SIZE($24)
|
|
+ SXADDQ $20, $24, $24
|
|
+
|
|
+ ST $f22, 0*SIZE($24)
|
|
+ SXADDQ $20, $24, $24
|
|
+ ST $f23, 0*SIZE($24)
|
|
+ SXADDQ $20, $24, $24
|
|
+
|
|
+ ST $f24, 0*SIZE($24)
|
|
+ SXADDQ $20, $24, $24
|
|
+ ST $f25, 0*SIZE($24)
|
|
+ SXADDQ $20, $24, $24
|
|
+
|
|
+ ST $f26, 0*SIZE($24)
|
|
+ SXADDQ $20, $24, $24
|
|
+ ST $f27, 0*SIZE($24)
|
|
+ SXADDQ $20, $24, $24
|
|
+
|
|
+ subl $21, 1, $21
|
|
+ bgt $21, $SubLoop
|
|
+ .align 4
|
|
+
|
|
+$SubRemain:
|
|
+ ble $22, $SubEnd
|
|
+ .align 4
|
|
+
|
|
+$SubRemainLoop:
|
|
+ LD $f10, 0*SIZE($19)
|
|
+ LD $f20, 0*SIZE($17)
|
|
+
|
|
+ subl $22, 1, $22
|
|
+
|
|
+ ST $f10, 0*SIZE($17)
|
|
+ ST $f20, 0*SIZE($19)
|
|
+
|
|
+ SXADDQ $18, $17, $17
|
|
+ SXADDQ $20, $19, $19
|
|
+ bgt $22, $SubRemainLoop
|
|
+ .align 4
|
|
+
|
|
+$SubEnd:
|
|
+ clr $0
|
|
+ ret
|
|
+ EPILOGUE
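For reference (not part of the patch): swap.S exchanges x and y, taking the 8-element block path ($MainLoop) when both increments are 1 and the element-wise strided path ($SubLoop/$SubRemainLoop) otherwise. A C sketch, assuming positive increments, is:

#include <stddef.h>

/* Sketch of swap.S: fast contiguous path vs. generic strided path. */
static void swap_sketch(size_t n, double *x, ptrdiff_t incx,
                        double *y, ptrdiff_t incy)
{
    if (incx == 1 && incy == 1) {
        for (size_t i = 0; i < n; i++) {    /* the kernel does 8 per iteration */
            double t = x[i];
            x[i] = y[i];
            y[i] = t;
        }
    } else {
        for (size_t i = 0; i < n; i++) {
            double t = x[i * incx];
            x[i * incx] = y[i * incy];
            y[i * incy] = t;
        }
    }
}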
diff --git a/kernel/sw_64/swap_simd.S b/kernel/sw_64/swap_simd.S
|
|
new file mode 100644
|
|
index 0000000..8a6141d
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/swap_simd.S
|
|
@@ -0,0 +1,327 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCHSIZE 64
|
|
+#define X $17
|
|
+#define Y $19
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+ .frame $sp, 0, $26, 0
|
|
+
|
|
+ mov $20, $17
|
|
+ mov $21, $18
|
|
+ ldl $19, 0($sp)
|
|
+ ldl $20, 8($sp)
|
|
+#ifndef PROFILE
|
|
+ .prologue 0
|
|
+#else
|
|
+ .prologue 1
|
|
+#endif
|
|
+
|
|
+ subl $18, 1, $1
|
|
+ subl $20, 1, $2
|
|
+	ble	$16, $SubEnd	# if n <= 0 goto $SubEnd
|
|
+ or $1, $2, $1
|
|
+
|
|
+/*
|
|
+	Unroll by 16
|
|
+*/
|
|
+ sra $16, 4, $21
|
|
+ and $16, 15, $22
|
|
+ bne $1, $Sub
|
|
+ ble $21, $MainRemain
|
|
+ .align 4
|
|
+
|
|
+/*
|
|
+ test the address of Y & X
|
|
+	test whether the addresses of Y & X are vector aligned
|
|
+ and Y, (VEC_LEN*SIZE-1), $4
|
|
+ and X, (VEC_LEN*SIZE-1), $3
|
|
+ or $3, $4, $4
|
|
+ bne $4, $UnAlign_ACCESS
|
|
+
|
|
+/* aligned access */
|
|
+
|
|
+$MainLoop:
|
|
+ VLD $f10, 0*VEC_LEN*SIZE(Y)
|
|
+ VLD $f11, 1*VEC_LEN*SIZE(Y)
|
|
+ VLD $f12, 2*VEC_LEN*SIZE(Y)
|
|
+ VLD $f13, 3*VEC_LEN*SIZE(Y)
|
|
+
|
|
+
|
|
+ VLD $f20, 0*VEC_LEN*SIZE(X)
|
|
+ VLD $f21, 1*VEC_LEN*SIZE(X)
|
|
+ VLD $f22, 2*VEC_LEN*SIZE(X)
|
|
+ VLD $f23, 3*VEC_LEN*SIZE(X)
|
|
+
|
|
+
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ unop
|
|
+ fillcs PREFETCHSIZE * SIZE(Y)
|
|
+ subl $21, 1, $21
|
|
+
|
|
+ VST $f10, 0*VEC_LEN*SIZE(X)
|
|
+ VST $f11, 1*VEC_LEN*SIZE(X)
|
|
+ VST $f12, 2*VEC_LEN*SIZE(X)
|
|
+ VST $f13, 3*VEC_LEN*SIZE(X)
|
|
+
|
|
+ VST $f20, 0*VEC_LEN*SIZE(Y)
|
|
+ VST $f21, 1*VEC_LEN*SIZE(Y)
|
|
+ VST $f22, 2*VEC_LEN*SIZE(Y)
|
|
+ VST $f23, 3*VEC_LEN*SIZE(Y)
|
|
+
|
|
+ ldi $17, 16*SIZE(X)
|
|
+ ldi $19, 16*SIZE(Y)
|
|
+ bgt $21, $MainLoop
|
|
+ .align 4
|
|
+
|
|
+$MainRemain:
|
|
+ ble $22, $MainEnd
|
|
+ .align 4
|
|
+
|
|
+$MainRemainLoop:
|
|
+ LD $f10, 0*SIZE($19)
|
|
+ LD $f20, 0*SIZE($17)
|
|
+ ldi $17, 1*SIZE($17)
|
|
+ ldi $19, 1*SIZE($19)
|
|
+ subl $22, 1, $22
|
|
+ ST $f10, -1*SIZE($17)
|
|
+ ST $f20, -1*SIZE($19)
|
|
+ bgt $22, $MainRemainLoop
|
|
+ .align 4
|
|
+
|
|
+$MainEnd:
|
|
+ clr $0
|
|
+ ret
|
|
+ .align 4
|
|
+
|
|
+$UnAlign_ACCESS:
|
|
+ sra $16, 3, $21
|
|
+ and $16, 7, $22
|
|
+ nop
|
|
+ ble $21, $UnAlign_ACCESS_MainRemain
|
|
+ .align 4
|
|
+$UnAlign_ACCESS_MainLoop:
|
|
+ LD $f10, 0*SIZE(Y)
|
|
+ LD $f11, 1*SIZE(Y)
|
|
+ LD $f12, 2*SIZE(Y)
|
|
+ LD $f13, 3*SIZE(Y)
|
|
+ LD $f14, 4*SIZE(Y)
|
|
+ LD $f15, 5*SIZE(Y)
|
|
+ LD $f16, 6*SIZE(Y)
|
|
+ LD $f17, 7*SIZE(Y)
|
|
+
|
|
+ LD $f20, 0*SIZE(X)
|
|
+ LD $f21, 1*SIZE(X)
|
|
+ LD $f22, 2*SIZE(X)
|
|
+ LD $f23, 3*SIZE(X)
|
|
+ LD $f24, 4*SIZE(X)
|
|
+ LD $f25, 5*SIZE(X)
|
|
+ LD $f26, 6*SIZE(X)
|
|
+ LD $f27, 7*SIZE(X)
|
|
+
|
|
+
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ unop
|
|
+ fillcs PREFETCHSIZE * SIZE(Y)
|
|
+ subl $21, 1, $21
|
|
+
|
|
+ ST $f10, 0*SIZE(X)
|
|
+ ST $f11, 1*SIZE(X)
|
|
+ ST $f12, 2*SIZE(X)
|
|
+ ST $f13, 3*SIZE(X)
|
|
+ ST $f14, 4*SIZE(X)
|
|
+ ST $f15, 5*SIZE(X)
|
|
+ ST $f16, 6*SIZE(X)
|
|
+ ST $f17, 7*SIZE(X)
|
|
+
|
|
+ ST $f20, 0*SIZE(Y)
|
|
+ ST $f21, 1*SIZE(Y)
|
|
+ ST $f22, 2*SIZE(Y)
|
|
+ ST $f23, 3*SIZE(Y)
|
|
+ ST $f24, 4*SIZE(Y)
|
|
+ ST $f25, 5*SIZE(Y)
|
|
+ ST $f26, 6*SIZE(Y)
|
|
+ ST $f27, 7*SIZE(Y)
|
|
+
|
|
+ ldi X, 8*SIZE(X)
|
|
+ ldi Y, 8*SIZE(Y)
|
|
+ bgt $21, $UnAlign_ACCESS_MainLoop
|
|
+ .align 4
|
|
+
|
|
+$UnAlign_ACCESS_MainRemain:
|
|
+ ble $22, $UnAlign_ACCESS_MainEnd
|
|
+ .align 4
|
|
+
|
|
+$UnAlign_ACCESS_MainRemainLoop:
|
|
+ LD $f10, 0*SIZE(Y)
|
|
+ LD $f20, 0*SIZE(X)
|
|
+ ldi X, 1*SIZE(X)
|
|
+ ldi Y, 1*SIZE(Y)
|
|
+ subl $22, 1, $22
|
|
+ ST $f10, -1*SIZE(X)
|
|
+ ST $f20, -1*SIZE(Y)
|
|
+ bgt $22, $UnAlign_ACCESS_MainRemainLoop
|
|
+ .align 4
|
|
+
|
|
+$UnAlign_ACCESS_MainEnd:
|
|
+ clr $0
|
|
+ ret
|
|
+ .align 4
|
|
+
|
|
+$Sub:
|
|
+ sra $16, 3, $21
|
|
+ and $16, 7, $22
|
|
+ mov $17, $23
|
|
+ mov $19, $24
|
|
+
|
|
+ ble $21, $SubRemain
|
|
+ .align 4
|
|
+
|
|
+$SubLoop:
|
|
+ LD $f10, 0*SIZE($19)
|
|
+ SXADDQ $20, $19, $19
|
|
+ LD $f11, 0*SIZE($19)
|
|
+ SXADDQ $20, $19, $19
|
|
+
|
|
+ LD $f12, 0*SIZE($19)
|
|
+ SXADDQ $20, $19, $19
|
|
+ LD $f13, 0*SIZE($19)
|
|
+ SXADDQ $20, $19, $19
|
|
+
|
|
+ LD $f14, 0*SIZE($19)
|
|
+ SXADDQ $20, $19, $19
|
|
+ LD $f15, 0*SIZE($19)
|
|
+ SXADDQ $20, $19, $19
|
|
+
|
|
+ LD $f16, 0*SIZE($19)
|
|
+ SXADDQ $20, $19, $19
|
|
+ LD $f17, 0*SIZE($19)
|
|
+ SXADDQ $20, $19, $19
|
|
+
|
|
+ LD $f20, 0*SIZE($17)
|
|
+ SXADDQ $18, $17, $17
|
|
+ LD $f21, 0*SIZE($17)
|
|
+ SXADDQ $18, $17, $17
|
|
+
|
|
+ LD $f22, 0*SIZE($17)
|
|
+ SXADDQ $18, $17, $17
|
|
+ LD $f23, 0*SIZE($17)
|
|
+ SXADDQ $18, $17, $17
|
|
+
|
|
+ LD $f24, 0*SIZE($17)
|
|
+ SXADDQ $18, $17, $17
|
|
+ LD $f25, 0*SIZE($17)
|
|
+ SXADDQ $18, $17, $17
|
|
+
|
|
+ LD $f26, 0*SIZE($17)
|
|
+ SXADDQ $18, $17, $17
|
|
+ LD $f27, 0*SIZE($17)
|
|
+ SXADDQ $18, $17, $17
|
|
+
|
|
+ ST $f10, 0*SIZE($23)
|
|
+ SXADDQ $18, $23, $23
|
|
+ ST $f11, 0*SIZE($23)
|
|
+ SXADDQ $18, $23, $23
|
|
+
|
|
+ ST $f12, 0*SIZE($23)
|
|
+ SXADDQ $18, $23, $23
|
|
+ ST $f13, 0*SIZE($23)
|
|
+ SXADDQ $18, $23, $23
|
|
+
|
|
+ ST $f14, 0*SIZE($23)
|
|
+ SXADDQ $18, $23, $23
|
|
+ ST $f15, 0*SIZE($23)
|
|
+ SXADDQ $18, $23, $23
|
|
+
|
|
+ ST $f16, 0*SIZE($23)
|
|
+ SXADDQ $18, $23, $23
|
|
+ ST $f17, 0*SIZE($23)
|
|
+ SXADDQ $18, $23, $23
|
|
+
|
|
+ ST $f20, 0*SIZE($24)
|
|
+ SXADDQ $20, $24, $24
|
|
+ ST $f21, 0*SIZE($24)
|
|
+ SXADDQ $20, $24, $24
|
|
+
|
|
+ ST $f22, 0*SIZE($24)
|
|
+ SXADDQ $20, $24, $24
|
|
+ ST $f23, 0*SIZE($24)
|
|
+ SXADDQ $20, $24, $24
|
|
+
|
|
+ ST $f24, 0*SIZE($24)
|
|
+ SXADDQ $20, $24, $24
|
|
+ ST $f25, 0*SIZE($24)
|
|
+ SXADDQ $20, $24, $24
|
|
+
|
|
+ ST $f26, 0*SIZE($24)
|
|
+ SXADDQ $20, $24, $24
|
|
+ ST $f27, 0*SIZE($24)
|
|
+ SXADDQ $20, $24, $24
|
|
+
|
|
+ subl $21, 1, $21
|
|
+ bgt $21, $SubLoop
|
|
+ .align 4
|
|
+
|
|
+$SubRemain:
|
|
+ ble $22, $SubEnd
|
|
+ .align 4
|
|
+
|
|
+$SubRemainLoop:
|
|
+ LD $f10, 0*SIZE($19)
|
|
+ LD $f20, 0*SIZE($17)
|
|
+
|
|
+ subl $22, 1, $22
|
|
+
|
|
+ ST $f10, 0*SIZE($17)
|
|
+ ST $f20, 0*SIZE($19)
|
|
+
|
|
+ SXADDQ $18, $17, $17
|
|
+ SXADDQ $20, $19, $19
|
|
+ bgt $22, $SubRemainLoop
|
|
+ .align 4
|
|
+
|
|
+$SubEnd:
|
|
+ clr $0
|
|
+ ret
|
|
+ EPILOGUE
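The SIMD variant above additionally requires both addresses to be vector aligned before it takes the VLD/VST path; the dispatch ORs the low address bits of X and Y and falls back to $UnAlign_ACCESS when any bit is set. A C sketch of that check (VEC_BYTES is a hypothetical stand-in for VEC_LEN*SIZE from common.h):

#include <stdint.h>

#define VEC_BYTES 32   /* hypothetical; the kernel uses VEC_LEN * SIZE */

/* Sketch of the dispatch in swap_simd.S: take the vector path only for
 * unit strides with both operands vector aligned. */
static int can_use_vector_path(const void *x, const void *y,
                               long incx, long incy)
{
    uintptr_t mask = (uintptr_t)VEC_BYTES - 1;
    if (incx != 1 || incy != 1)
        return 0;                           /* strided operands: $Sub path */
    return (((uintptr_t)x | (uintptr_t)y) & mask) == 0;
}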
diff --git a/kernel/sw_64/trsm_kernel_4x4_LN.S b/kernel/sw_64/trsm_kernel_4x4_LN.S
|
|
new file mode 100644
|
|
index 0000000..109c471
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/trsm_kernel_4x4_LN.S
|
|
@@ -0,0 +1,5144 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#if !defined(EV4) && !defined(EV5) && !defined(SW6)
|
|
+#error "Architecture is not specified."
|
|
+#endif
|
|
+
|
|
+#ifdef SW6
|
|
+#define PREFETCHSIZE 56
|
|
+#define UNOP unop
|
|
+#endif
|
|
+
|
|
+#ifdef EV5
|
|
+#define PREFETCHSIZE 56
|
|
+#define UNOP
|
|
+#endif
|
|
+
|
|
+#ifdef EV4
|
|
+#define UNOP
|
|
+#endif
|
|
+
|
|
+#define STACKSIZE 80
|
|
+
|
|
+#define M $16
|
|
+#define N $17
|
|
+#define K $18
|
|
+#define A $20
|
|
+#define B $21
|
|
+#define C $22
|
|
+#define LDC $23
|
|
+
|
|
+#define C1 $19
|
|
+#define C2 $24
|
|
+#define C3 $25
|
|
+#define C4 $27
|
|
+
|
|
+#define AO $at
|
|
+#define BO $5
|
|
+#define I $6
|
|
+#define J $7
|
|
+#define L $8
|
|
+
|
|
+#define a1 $f16
|
|
+#define a2 $f17
|
|
+#define a3 $f18
|
|
+#define a4 $f19
|
|
+
|
|
+#define b1 $f20
|
|
+#define b2 $f21
|
|
+#define b3 $f22
|
|
+#define b4 $f23
|
|
+
|
|
+#define t1 $f24
|
|
+#define t2 $f25
|
|
+#define t3 $f26
|
|
+#define t4 $f27
|
|
+
|
|
+#define a5 $f28
|
|
+#define a6 $f30
|
|
+#define b5 $f29
|
|
+
|
|
+#define alpha $f30
|
|
+
|
|
+#define c01 $f0
|
|
+#define c02 $f1
|
|
+#define c03 $f2
|
|
+#define c04 $f3
|
|
+
|
|
+#define c05 $f4
|
|
+#define c06 $f5
|
|
+#define c07 $f6
|
|
+#define c08 $f7
|
|
+
|
|
+#define c09 $f8
|
|
+#define c10 $f9
|
|
+#define c11 $f10
|
|
+#define c12 $f11
|
|
+
|
|
+#define c13 $f12
|
|
+#define c14 $f13
|
|
+#define c15 $f14
|
|
+#define c16 $f15
|
|
+
|
|
+#define TMP1 $0
|
|
+#define TMP2 $1
|
|
+#define KK $2
|
|
+#define AORIG $3
|
|
+#define OFFSET $4
|
|
+#define tmp $9
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+ .frame $sp, STACKSIZE, $26, 0
|
|
+
|
|
+ ldi $sp, -STACKSIZE($sp)
|
|
+
|
|
+ ldl C, 0 + STACKSIZE($sp)
|
|
+ ldl LDC, 8 + STACKSIZE($sp)
|
|
+ ldl OFFSET, 16 + STACKSIZE($sp)
|
|
+
|
|
+ SXADDQ LDC, 0, LDC
|
|
+
|
|
+ fstd $f2, 0($sp)
|
|
+ fstd $f3, 8($sp)
|
|
+ fstd $f4, 16($sp)
|
|
+ fstd $f5, 24($sp)
|
|
+ fstd $f6, 32($sp)
|
|
+ fstd $f7, 40($sp)
|
|
+ fstd $f8, 48($sp)
|
|
+ fstd $f9, 56($sp)
|
|
+ stl tmp, 64($sp)
|
|
+ cmple M, 0, $0
|
|
+ cmple N, 0, $1
|
|
+ cmple K, 0, $2
|
|
+
|
|
+ or $0, $1, $0
|
|
+ or $0, $2, $0
|
|
+ bne $0, $L999
|
|
+
|
|
+#ifdef LN
|
|
+ mull M, K, TMP1
|
|
+ SXADDQ TMP1, A, A
|
|
+ SXADDQ M, C, C
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ negq OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ mulq N, K, TMP1
|
|
+ SXADDQ TMP1, B, B
|
|
+
|
|
+ mulq N, LDC, TMP1
|
|
+ addl TMP1, C, C
|
|
+
|
|
+ subl N, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+ sra N, 2, J
|
|
+ ble J, $L40
|
|
+ .align 4
|
|
+
|
|
+$L01:
|
|
+#ifdef RT
|
|
+ sll K, 2 + BASE_SHIFT, TMP1
|
|
+ subl B, TMP1, B
|
|
+
|
|
+ s4addl LDC, 0, TMP1
|
|
+ subl C, TMP1, C
|
|
+#endif
|
|
+
|
|
+ mov C, C1
|
|
+ addl C, LDC, C2
|
|
+ addl C2, LDC, C3
|
|
+#ifndef RT
|
|
+ s4addl LDC, C, C
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ addl C3, LDC, C4
|
|
+ fclr t2
|
|
+
|
|
+#ifdef LN
|
|
+ addl M, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+ mov A, AORIG
|
|
+#else
|
|
+ mov A, AO
|
|
+#endif
|
|
+
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+ and M, 1, I
|
|
+ ble I, $L20
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ ldi L, -2(KK)
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c09
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c13
|
|
+
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+ ble KK, $L38
|
|
+
|
|
+ ble L, $L35
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 0, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 2, TMP2
|
|
+ addl B, TMP2, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ ldi L, -2(TMP1)
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c09
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c13
|
|
+
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ ble TMP1, $L38
|
|
+
|
|
+ ble L, $L35
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L32:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ldi L, -2(L)
|
|
+ MUL a1, b1, t1
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a1, b2, t2
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ LD b5, 3 * SIZE(BO)
|
|
+ FIMOVD b5, tmp
|
|
+ MUL a1, b3, t3
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ MUL a1, b4, t4
|
|
+ LD a1, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a2, b1, t1
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ MUL a2, b2, t2
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+ MUL a2, b3, t3
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ IFMOVD tmp, b5
|
|
+ MUL a2, b5, t4
|
|
+ LD a2, 0 * SIZE(AO)
|
|
+ bgt L, $L32
|
|
+ .align 4
|
|
+
|
|
+$L35:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L37
|
|
+#else
|
|
+ blbs TMP1, $L37
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t2
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, b3, b5
|
|
+ fmov b5, t3
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ MUL a1, b4, b5
|
|
+ fmov b5, t4
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L37:
|
|
+ ADD c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t2
|
|
+ ADD c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, b3, b5
|
|
+ fmov b5, t3
|
|
+
|
|
+ ADD c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ MUL a1, b4, b5
|
|
+ fmov b5, t4
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ ADD c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ ADD c13, t4, b5
|
|
+ fmov b5, c13
|
|
+
|
|
+$L38:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 1, TMP1
|
|
+#else
|
|
+ subl KK, 4, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -1 * SIZE(AO)
|
|
+ ldi BO, -4 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c05, b5
|
|
+ fmov b5, c05
|
|
+ SUB a3, c09, b5
|
|
+ fmov b5, c09
|
|
+ SUB a4, c13, b5
|
|
+ fmov b5, c13
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c05, b5
|
|
+ fmov b5, c05
|
|
+ SUB a3, c09, b5
|
|
+ fmov b5, c09
|
|
+ SUB a4, c13, b5
|
|
+ fmov b5, c13
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, c13, b5
|
|
+ fmov b5, c13
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a2, c01, b5
|
|
+ fmov b5, t1
|
|
+ SUB c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, t1
|
|
+ SUB c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ MUL a4, c01, b5
|
|
+ fmov b5, t1
|
|
+ SUB c13, t1, b5
|
|
+ fmov b5, c13
|
|
+
|
|
+ LD b1, 5 * SIZE(BO)
|
|
+ LD b2, 6 * SIZE(BO)
|
|
+ LD b3, 7 * SIZE(BO)
|
|
+
|
|
+ MUL b1, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL b2, c05, b5
|
|
+ fmov b5, t1
|
|
+ SUB c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ MUL b3, c05, b5
|
|
+ fmov b5, t1
|
|
+ SUB c13, t1, b5
|
|
+ fmov b5, c13
|
|
+
|
|
+ LD a1, 10 * SIZE(BO)
|
|
+ LD a2, 11 * SIZE(BO)
|
|
+ LD a3, 15 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a2, c09, b5
|
|
+ fmov b5, t1
|
|
+ SUB c13, t1, b5
|
|
+ fmov b5, c13
|
|
+ MUL a3, c13, b5
|
|
+ fmov b5, c13
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 15 * SIZE(BO)
|
|
+ LD a2, 14 * SIZE(BO)
|
|
+ LD a3, 13 * SIZE(BO)
|
|
+ LD a4, 12 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c13, b5
|
|
+ fmov b5, c13
|
|
+ MUL a2, c13, b5
|
|
+ fmov b5, t1
|
|
+ SUB c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ MUL a3, c13, b5
|
|
+ fmov b5, t1
|
|
+ SUB c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ MUL a4, c13, b5
|
|
+ fmov b5, t1
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+
|
|
+ LD b1, 10 * SIZE(BO)
|
|
+ LD b2, 9 * SIZE(BO)
|
|
+ LD b3, 8 * SIZE(BO)
|
|
+
|
|
+ MUL b1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL b2, c09, b5
|
|
+ fmov b5, t1
|
|
+ SUB c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ MUL b3, c09, b5
|
|
+ fmov b5, t1
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+
|
|
+ LD a1, 5 * SIZE(BO)
|
|
+ LD a2, 4 * SIZE(BO)
|
|
+ LD a3, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a2, c05, b5
|
|
+ fmov b5, t1
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, c01
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c05, 1 * SIZE(BO)
|
|
+ ST c09, 2 * SIZE(BO)
|
|
+ ST c13, 3 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c05, 1 * SIZE(AO)
|
|
+ ST c09, 2 * SIZE(AO)
|
|
+ ST c13, 3 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -1 * SIZE(C1)
|
|
+ ldi C2, -1 * SIZE(C2)
|
|
+ ldi C3, -1 * SIZE(C3)
|
|
+ ldi C4, -1 * SIZE(C4)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+ ST c09, 0 * SIZE(C3)
|
|
+ ST c13, 0 * SIZE(C4)
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 0 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ and M, 2, I
|
|
+ ble I, $L30
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c10
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c14
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ ldi L, -2(KK)
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c05
|
|
+
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+ fclr c02
|
|
+ fclr c06
|
|
+ ble KK, $L28
|
|
+
|
|
+ ble L, $L25
|
|
+
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 1, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 2, TMP2
|
|
+ addl B, TMP2, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c10
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c14
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ ldi L, -2(TMP1)
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c05
|
|
+
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ fclr c02
|
|
+ fclr c06
|
|
+ ble TMP1, $L28
|
|
+
|
|
+ ble L, $L25
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ ADD c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t3, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL a1, b2, t3
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+
|
|
+ ADD c14, t4, b5
|
|
+ fmov b5, c14
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD b2, -7 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ unop
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ LD b3, -6 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL a1, b4, t3
|
|
+ LD a1, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ MUL a2, b4, t4
|
|
+ LD b5, -5 * SIZE(BO)
|
|
+ FIMOVD b5, tmp
|
|
+
|
|
+ ADD c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ unop
|
|
+ MUL a3, b1, t1
|
|
+ LD a2, 3 * SIZE(AO)
|
|
+
|
|
+ ADD c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL a4, b1, t2
|
|
+ LD b1, -4 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t3, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c14, t4, b5
|
|
+ fmov b5, c14
|
|
+ MUL a4, b2, t4
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ldi L, -2(L)
|
|
+ MUL a3, b3, t1
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a4, b3, t2
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ IFMOVD tmp, b5
|
|
+ MUL a3, b5, t3
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ IFMOVD tmp, b5
|
|
+ MUL a4, b5, t4
|
|
+ LD a4, 1 * SIZE(AO)
|
|
+ bgt L, $L22
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ ADD c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L27
|
|
+#else
|
|
+ blbs TMP1, $L27
|
|
+#endif
|
|
+
|
|
+ ADD c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL a2, b1, b5
|
|
+ fmov b5, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t3, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t3
|
|
+ unop
|
|
+
|
|
+ ADD c14, t4, b5
|
|
+ fmov b5, c14
|
|
+ unop
|
|
+ MUL a2, b2, b5
|
|
+ fmov b5, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b3, b5
|
|
+ fmov b5, t1
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b3, b5
|
|
+ fmov b5, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL a1, b4, b5
|
|
+ fmov b5, t3
|
|
+ LD a1, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL a2, b4, b5
|
|
+ fmov b5, t4
|
|
+ LD a2, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L27:
|
|
+ ADD c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ MUL a2, b1, b5
|
|
+ fmov b5, t2
|
|
+ ADD c13, t3, b5
|
|
+ fmov b5, c13
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t3
|
|
+
|
|
+ ADD c14, t4, b5
|
|
+ fmov b5, c14
|
|
+ MUL a2, b2, b5
|
|
+ fmov b5, t4
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b3, b5
|
|
+ fmov b5, t1
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, b3, b5
|
|
+ fmov b5, t2
|
|
+ ADD c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, b4, b5
|
|
+ fmov b5, t3
|
|
+
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a2, b4, b5
|
|
+ fmov b5, t4
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ ADD c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ ADD c13, t3, b5
|
|
+ fmov b5, c13
|
|
+ ADD c14, t4, b5
|
|
+ fmov b5, c14
|
|
+ .align 4
|
|
+
|
|
+$L28:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 2, TMP1
|
|
+#else
|
|
+ subl KK, 4, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -2 * SIZE(AO)
|
|
+ ldi BO, -4 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+ LD b4, 7 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c05, b5
|
|
+ fmov b5, c05
|
|
+ SUB a3, c09, b5
|
|
+ fmov b5, c09
|
|
+ SUB a4, c13, b5
|
|
+ fmov b5, c13
|
|
+
|
|
+ SUB b1, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB b2, c06, b5
|
|
+ fmov b5, c06
|
|
+ SUB b3, c10, b5
|
|
+ fmov b5, c10
|
|
+ SUB b4, c14, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ LD b1, 4 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(AO)
|
|
+ LD b4, 7 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a3, c05, b5
|
|
+ fmov b5, c05
|
|
+ SUB a4, c06, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ SUB b1, c09, b5
|
|
+ fmov b5, c09
|
|
+ SUB b2, c10, b5
|
|
+ fmov b5, c10
|
|
+ SUB b3, c13, b5
|
|
+ fmov b5, c13
|
|
+ SUB b4, c14, b5
|
|
+ fmov b5, c14
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 3 * SIZE(AO)
|
|
+ LD a2, 2 * SIZE(AO)
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a1, c06, b5
|
|
+ fmov b5, c06
|
|
+ MUL a1, c10, b5
|
|
+ fmov b5, c10
|
|
+ MUL a1, c14, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ MUL a2, c02, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c06, b5
|
|
+ fmov b5, t2
|
|
+ MUL a2, c10, b5
|
|
+ fmov b5, t3
|
|
+ MUL a2, c14, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ SUB c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ SUB c13, t4, b5
|
|
+ fmov b5, c13
|
|
+
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a3, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a3, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a3, c13, b5
|
|
+ fmov b5, c13
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, c13, b5
|
|
+ fmov b5, c13
|
|
+
|
|
+ MUL a2, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c05, b5
|
|
+ fmov b5, t2
|
|
+ MUL a2, c09, b5
|
|
+ fmov b5, t3
|
|
+ MUL a2, c13, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c02, t1, b5
|
|
+ fmov b5, c02
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ SUB c10, t3, b5
|
|
+ fmov b5, c10
|
|
+ SUB c14, t4, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ MUL a3, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a3, c06, b5
|
|
+ fmov b5, c06
|
|
+ MUL a3, c10, b5
|
|
+ fmov b5, c10
|
|
+ MUL a3, c14, b5
|
|
+ fmov b5, c14
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ MUL a2, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c02, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a3, c02, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ SUB c10, t2, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ MUL a4, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a4, c02, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c13, t1, b5
|
|
+ fmov b5, c13
|
|
+ SUB c14, t2, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ LD b1, 5 * SIZE(BO)
|
|
+ LD b2, 6 * SIZE(BO)
|
|
+ LD b3, 7 * SIZE(BO)
|
|
+
|
|
+ MUL b1, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL b1, c06, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL b2, c05, b5
|
|
+ fmov b5, t1
|
|
+ MUL b2, c06, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ SUB c10, t2, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ MUL b3, c05, b5
|
|
+ fmov b5, t1
|
|
+ MUL b3, c06, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c13, t1, b5
|
|
+ fmov b5, c13
|
|
+ SUB c14, t2, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ LD a1, 10 * SIZE(BO)
|
|
+ LD a2, 11 * SIZE(BO)
|
|
+ LD a3, 15 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, c10, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ MUL a2, c09, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c10, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c13, t1, b5
|
|
+ fmov b5, c13
|
|
+ SUB c14, t2, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ MUL a3, c13, b5
|
|
+ fmov b5, c13
|
|
+ MUL a3, c14, b5
|
|
+ fmov b5, c14
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 15 * SIZE(BO)
|
|
+ LD a2, 14 * SIZE(BO)
|
|
+ LD a3, 13 * SIZE(BO)
|
|
+ LD a4, 12 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c13, b5
|
|
+ fmov b5, c13
|
|
+ MUL a1, c14, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ MUL a2, c13, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c14, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ SUB c10, t2, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ MUL a3, c13, b5
|
|
+ fmov b5, t1
|
|
+ MUL a3, c14, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL a4, c13, b5
|
|
+ fmov b5, t1
|
|
+ MUL a4, c14, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c02, t2, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ LD b1, 10 * SIZE(BO)
|
|
+ LD b2, 9 * SIZE(BO)
|
|
+ LD b3, 8 * SIZE(BO)
|
|
+
|
|
+ MUL b1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL b1, c10, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ MUL b2, c09, b5
|
|
+ fmov b5, t1
|
|
+ MUL b2, c10, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL b3, c09, b5
|
|
+ fmov b5, t1
|
|
+ MUL b3, c10, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c02, t2, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ LD a1, 5 * SIZE(BO)
|
|
+ LD a2, 4 * SIZE(BO)
|
|
+ LD a3, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, c06, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL a2, c05, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c06, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c02, t2, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a3, c02, b5
|
|
+ fmov b5, c02
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c05, 1 * SIZE(BO)
|
|
+ ST c09, 2 * SIZE(BO)
|
|
+ ST c13, 3 * SIZE(BO)
|
|
+
|
|
+ ST c02, 4 * SIZE(BO)
|
|
+ ST c06, 5 * SIZE(BO)
|
|
+ ST c10, 6 * SIZE(BO)
|
|
+ ST c14, 7 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c05, 2 * SIZE(AO)
|
|
+ ST c06, 3 * SIZE(AO)
|
|
+
|
|
+ ST c09, 4 * SIZE(AO)
|
|
+ ST c10, 5 * SIZE(AO)
|
|
+ ST c13, 6 * SIZE(AO)
|
|
+ ST c14, 7 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -2 * SIZE(C1)
|
|
+ ldi C2, -2 * SIZE(C2)
|
|
+ ldi C3, -2 * SIZE(C3)
|
|
+ ldi C4, -2 * SIZE(C4)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+ ST c06, 1 * SIZE(C2)
|
|
+
|
|
+ ST c09, 0 * SIZE(C3)
|
|
+ ST c10, 1 * SIZE(C3)
|
|
+ ST c13, 0 * SIZE(C4)
|
|
+ ST c14, 1 * SIZE(C4)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 2 * SIZE(C1)
|
|
+ ldi C2, 2 * SIZE(C2)
|
|
+ ldi C3, 2 * SIZE(C3)
|
|
+ ldi C4, 2 * SIZE(C4)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 1 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L30:
|
|
+ sra M, 2, I
|
|
+ ble I, $L39
|
|
+ .align 4
|
|
+
|
|
+$L11:
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c11
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c12
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c16
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c15
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c02
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c06
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c05
|
|
+
|
|
+ fillcs 4 * SIZE(C1)
|
|
+ fclr c03
|
|
+ ldi L, -2(KK)
|
|
+ fclr c04
|
|
+
|
|
+ fillcs 7 * SIZE(C2)
|
|
+ fclr c08
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+ fclr c13
|
|
+
|
|
+ fillcs 4 * SIZE(C3)
|
|
+ fclr c09
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c10
|
|
+
|
|
+ fillcs 7 * SIZE(C4)
|
|
+ fclr c14
|
|
+ fclr c07
|
|
+ ble KK, $L18
|
|
+#else
|
|
+
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 2, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 2, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c11
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c12
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c16
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c15
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c02
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c06
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c05
|
|
+
|
|
+ fillcs 4 * SIZE(C1)
|
|
+ fclr c03
|
|
+ ldi L, -2(TMP1)
|
|
+ fclr c04
|
|
+
|
|
+ fillcs 7 * SIZE(C2)
|
|
+ fclr c08
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ fclr c13
|
|
+
|
|
+ fillcs 4 * SIZE(C3)
|
|
+ fclr c09
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c10
|
|
+
|
|
+ fillcs 7 * SIZE(C4)
|
|
+ fclr c14
|
|
+ fclr c07
|
|
+ ble TMP1, $L18
|
|
+#endif
|
|
+
|
|
+ ble L, $L15
|
|
+ .align 5
|
|
+
|
|
+$L12:
|
|
+/* 1 */
|
|
+ ADD c11, t1, b5
|
|
+ fmov b5, c11
|
|
+#ifndef EV4
|
|
+ fillcs PREFETCHSIZE * SIZE(AO)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+ MUL b1, a1, t1
|
|
+#ifndef EV4
|
|
+ fillcs PREFETCHSIZE * SIZE(BO)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c12, t2, b5
|
|
+ fmov b5, c12
|
|
+ unop
|
|
+ MUL b1, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD c16, t3, b5
|
|
+ fmov b5, c16
|
|
+ unop
|
|
+ MUL b2, a2, t3
|
|
+ LD a5, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c15, t4, b5
|
|
+ fmov b5, c15
|
|
+ unop
|
|
+ MUL b2, a1, t4
|
|
+ LD b5, 0 * SIZE(BO)
|
|
+ FIMOVD b5, tmp
|
|
+
|
|
+/* 2 */
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ UNOP
|
|
+ MUL b1, a3, t1
|
|
+ UNOP
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ UNOP
|
|
+ MUL b1, a4, t2
|
|
+ UNOP
|
|
+
|
|
+ ADD c06, t3, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL b2, a4, t3
|
|
+ unop
|
|
+
|
|
+ ADD c05, t4, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL b4, a1, t4
|
|
+ unop
|
|
+
|
|
+/* 3 */
|
|
+ ADD c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL b3, a1, t1
|
|
+ unop
|
|
+
|
|
+ ADD c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ unop
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD c08, t3, b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL b2, a3, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+/* 4 */
|
|
+ ADD c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ unop
|
|
+ MUL b3, a3, t1
|
|
+ LD a6, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c14, t3, b5
|
|
+ fmov b5, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ ADD c07, t4, b5
|
|
+ fmov b5, c07
|
|
+ unop
|
|
+ MUL b4, a3, t4
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+
|
|
+/* 5 */
|
|
+ ADD c11, t1, b5
|
|
+ fmov b5, c11
|
|
+ unop
|
|
+ IFMOVD tmp, b5
|
|
+ MUL b5, a5, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c12, t2, b5
|
|
+ fmov b5, c12
|
|
+ ldi L, -2(L)
|
|
+ IFMOVD tmp, b5
|
|
+ MUL b5, a2, t2
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c16, t3, b5
|
|
+ fmov b5, c16
|
|
+ unop
|
|
+ MUL b2, a2, t3
|
|
+ unop
|
|
+
|
|
+ ADD c15, t4, b5
|
|
+ fmov b5, c15
|
|
+ unop
|
|
+ MUL b2, a5, t4
|
|
+ unop
|
|
+
|
|
+/* 6 */
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ IFMOVD tmp, b5
|
|
+ MUL b5, a6, t1
|
|
+ unop
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ IFMOVD tmp, b5
|
|
+ MUL b5, a4, t2
|
|
+ unop
|
|
+
|
|
+ ADD c06, t3, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL b2, a4, t3
|
|
+ unop
|
|
+
|
|
+ ADD c05, t4, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL b4, a5, t4
|
|
+ unop
|
|
+
|
|
+/* 7 */
|
|
+ ADD c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+ MUL b3, a5, t1
|
|
+ unop
|
|
+
|
|
+ ADD c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD c08, t3, b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, -3 * SIZE(AO)
|
|
+
|
|
+ ADD c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL b2, a6, t4
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+/* 8 */
|
|
+ ADD c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ unop
|
|
+ MUL b3, a6, t1
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD c14, t3, b5
|
|
+ fmov b5, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c07, t4, b5
|
|
+ fmov b5, c07
|
|
+ MUL b4, a6, t4
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+ bgt L, $L12
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ ADD c11, t1, b5
|
|
+ fmov b5, c11
|
|
+ MUL b1, a1, b5
|
|
+ fmov b5, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L17
|
|
+#else
|
|
+ blbs TMP1, $L17
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c12, t2, b5
|
|
+ fmov b5, c12
|
|
+ MUL b1, a2, b5
|
|
+ fmov b5, t2
|
|
+ ADD c16, t3, b5
|
|
+ fmov b5, c16
|
|
+ MUL b2, a2, b5
|
|
+ fmov b5, t3
|
|
+
|
|
+ ADD c15, t4, b5
|
|
+ fmov b5, c15
|
|
+ MUL b2, a1, b5
|
|
+ fmov b5, t4
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL b1, a3, b5
|
|
+ fmov b5, t1
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL b1, a4, b5
|
|
+ fmov b5, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c06, t3, b5
|
|
+ fmov b5, c06
|
|
+ MUL b2, a4, b5
|
|
+ fmov b5, t3
|
|
+ ADD c05, t4, b5
|
|
+ fmov b5, c05
|
|
+ MUL b4, a1, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ ADD c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL b3, a1, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ unop
|
|
+ MUL b3, a2, b5
|
|
+ fmov b5, t2
|
|
+ unop
|
|
+
|
|
+ ADD c08, t3, b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL b4, a2, b5
|
|
+ fmov b5, t3
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL b2, a3, b5
|
|
+ fmov b5, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ unop
|
|
+ MUL b3, a3, b5
|
|
+ fmov b5, t1
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL b3, a4, b5
|
|
+ fmov b5, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c14, t3, b5
|
|
+ fmov b5, c14
|
|
+ unop
|
|
+ MUL b4, a4, b5
|
|
+ fmov b5, t3
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c07, t4, b5
|
|
+ fmov b5, c07
|
|
+ unop
|
|
+ MUL b4, a3, b5
|
|
+ fmov b5, t4
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c11, t1, b5
|
|
+ fmov b5, c11
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL b1, a1, b5
|
|
+ fmov b5, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L17:
|
|
+ ADD c12, t2, b5
|
|
+ fmov b5, c12
|
|
+ MUL b1, a2, b5
|
|
+ fmov b5, t2
|
|
+ ADD c16, t3, b5
|
|
+ fmov b5, c16
|
|
+ MUL b2, a2, b5
|
|
+ fmov b5, t3
|
|
+
|
|
+ ADD c15, t4, b5
|
|
+ fmov b5, c15
|
|
+ MUL b2, a1, b5
|
|
+ fmov b5, t4
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL b1, a3, b5
|
|
+ fmov b5, t1
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL b1, a4, b5
|
|
+ fmov b5, t2
|
|
+ ADD c06, t3, b5
|
|
+ fmov b5, c06
|
|
+ MUL b2, a4, b5
|
|
+ fmov b5, t3
|
|
+
|
|
+ ADD c05, t4, b5
|
|
+ fmov b5, c05
|
|
+ MUL b4, a1, b5
|
|
+ fmov b5, t4
|
|
+ ADD c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ MUL b3, a1, b5
|
|
+ fmov b5, t1
|
|
+
|
|
+ ADD c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ MUL b3, a2, b5
|
|
+ fmov b5, t2
|
|
+ ADD c08, t3, b5
|
|
+ fmov b5, c08
|
|
+ MUL b4, a2, b5
|
|
+ fmov b5, t3
|
|
+
|
|
+ ADD c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ MUL b2, a3, b5
|
|
+ fmov b5, t4
|
|
+ ADD c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ MUL b3, a3, b5
|
|
+ fmov b5, t1
|
|
+
|
|
+ ADD c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ MUL b3, a4, b5
|
|
+ fmov b5, t2
|
|
+ ADD c14, t3, b5
|
|
+ fmov b5, c14
|
|
+ MUL b4, a4, b5
|
|
+ fmov b5, t3
|
|
+
|
|
+ ADD c07, t4, b5
|
|
+ fmov b5, c07
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL b4, a3, b5
|
|
+ fmov b5, t4
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c11, t1, b5
|
|
+ fmov b5, c11
|
|
+ ADD c12, t2, b5
|
|
+ fmov b5, c12
|
|
+ ADD c16, t3, b5
|
|
+ fmov b5, c16
|
|
+ ADD c15, t4, b5
|
|
+ fmov b5, c15
|
|
+ .align 4
|
|
+
|
|
+$L18:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 4, TMP1
|
|
+#else
|
|
+ subl KK, 4, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -4 * SIZE(AO)
|
|
+ ldi BO, -4 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+ LD b4, 7 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c05, b5
|
|
+ fmov b5, c05
|
|
+ SUB a3, c09, b5
|
|
+ fmov b5, c09
|
|
+ SUB a4, c13, b5
|
|
+ fmov b5, c13
|
|
+
|
|
+ SUB b1, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB b2, c06, b5
|
|
+ fmov b5, c06
|
|
+ SUB b3, c10, b5
|
|
+ fmov b5, c10
|
|
+ SUB b4, c14, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ LD a1, 8 * SIZE(BO)
|
|
+ LD a2, 9 * SIZE(BO)
|
|
+ LD a3, 10 * SIZE(BO)
|
|
+ LD a4, 11 * SIZE(BO)
|
|
+
|
|
+ LD b1, 12 * SIZE(BO)
|
|
+ LD b2, 13 * SIZE(BO)
|
|
+ LD b3, 14 * SIZE(BO)
|
|
+ LD b4, 15 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c03, b5
|
|
+ fmov b5, c03
|
|
+ SUB a2, c07, b5
|
|
+ fmov b5, c07
|
|
+ SUB a3, c11, b5
|
|
+ fmov b5, c11
|
|
+ SUB a4, c15, b5
|
|
+ fmov b5, c15
|
|
+
|
|
+ SUB b1, c04, b5
|
|
+ fmov b5, c04
|
|
+ SUB b2, c08, b5
|
|
+ fmov b5, c08
|
|
+ SUB b3, c12, b5
|
|
+ fmov b5, c12
|
|
+ SUB b4, c16, b5
|
|
+ fmov b5, c16
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ LD b1, 4 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(AO)
|
|
+ LD b4, 7 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a3, c03, b5
|
|
+ fmov b5, c03
|
|
+ SUB a4, c04, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ SUB b1, c05, b5
|
|
+ fmov b5, c05
|
|
+ SUB b2, c06, b5
|
|
+ fmov b5, c06
|
|
+ SUB b3, c07, b5
|
|
+ fmov b5, c07
|
|
+ SUB b4, c08, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ LD a1, 8 * SIZE(AO)
|
|
+ LD a2, 9 * SIZE(AO)
|
|
+ LD a3, 10 * SIZE(AO)
|
|
+ LD a4, 11 * SIZE(AO)
|
|
+
|
|
+ LD b1, 12 * SIZE(AO)
|
|
+ LD b2, 13 * SIZE(AO)
|
|
+ LD b3, 14 * SIZE(AO)
|
|
+ LD b4, 15 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c09, b5
|
|
+ fmov b5, c09
|
|
+ SUB a2, c10, b5
|
|
+ fmov b5, c10
|
|
+ SUB a3, c11, b5
|
|
+ fmov b5, c11
|
|
+ SUB a4, c12, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ SUB b1, c13, b5
|
|
+ fmov b5, c13
|
|
+ SUB b2, c14, b5
|
|
+ fmov b5, c14
|
|
+ SUB b3, c15, b5
|
|
+ fmov b5, c15
|
|
+ SUB b4, c16, b5
|
|
+ fmov b5, c16
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 15 * SIZE(AO)
|
|
+ LD a2, 14 * SIZE(AO)
|
|
+ LD a3, 13 * SIZE(AO)
|
|
+ LD a4, 12 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c04, b5
|
|
+ fmov b5, c04
|
|
+ MUL a1, c08, b5
|
|
+ fmov b5, c08
|
|
+ MUL a1, c12, b5
|
|
+ fmov b5, c12
|
|
+ MUL a1, c16, b5
|
|
+ fmov b5, c16
|
|
+
|
|
+ MUL a2, c04, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c08, b5
|
|
+ fmov b5, t2
|
|
+ MUL a2, c12, b5
|
|
+ fmov b5, t3
|
|
+ MUL a2, c16, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ SUB c07, t2, b5
|
|
+ fmov b5, c07
|
|
+ SUB c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ SUB c15, t4, b5
|
|
+ fmov b5, c15
|
|
+
|
|
+ MUL a3, c04, b5
|
|
+ fmov b5, t1
|
|
+ MUL a3, c08, b5
|
|
+ fmov b5, t2
|
|
+ MUL a3, c12, b5
|
|
+ fmov b5, t3
|
|
+ MUL a3, c16, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c02, t1, b5
|
|
+ fmov b5, c02
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ SUB c10, t3, b5
|
|
+ fmov b5, c10
|
|
+ SUB c14, t4, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ MUL a4, c04, b5
|
|
+ fmov b5, t1
|
|
+ MUL a4, c08, b5
|
|
+ fmov b5, t2
|
|
+ MUL a4, c12, b5
|
|
+ fmov b5, t3
|
|
+ MUL a4, c16, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ SUB c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ SUB c13, t4, b5
|
|
+ fmov b5, c13
|
|
+
|
|
+ LD b1, 10 * SIZE(AO)
|
|
+ LD b2, 9 * SIZE(AO)
|
|
+ LD b3, 8 * SIZE(AO)
|
|
+
|
|
+ MUL b1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL b1, c07, b5
|
|
+ fmov b5, c07
|
|
+ MUL b1, c11, b5
|
|
+ fmov b5, c11
|
|
+ MUL b1, c15, b5
|
|
+ fmov b5, c15
|
|
+
|
|
+ MUL b2, c03, b5
|
|
+ fmov b5, t1
|
|
+ MUL b2, c07, b5
|
|
+ fmov b5, t2
|
|
+ MUL b2, c11, b5
|
|
+ fmov b5, t3
|
|
+ MUL b2, c15, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c02, t1, b5
|
|
+ fmov b5, c02
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ SUB c10, t3, b5
|
|
+ fmov b5, c10
|
|
+ SUB c14, t4, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ MUL b3, c03, b5
|
|
+ fmov b5, t1
|
|
+ MUL b3, c07, b5
|
|
+ fmov b5, t2
|
|
+ MUL b3, c11, b5
|
|
+ fmov b5, t3
|
|
+ MUL b3, c15, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ SUB c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ SUB c13, t4, b5
|
|
+ fmov b5, c13
|
|
+
|
|
+ LD a1, 5 * SIZE(AO)
|
|
+ LD a2, 4 * SIZE(AO)
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a1, c06, b5
|
|
+ fmov b5, c06
|
|
+ MUL a1, c10, b5
|
|
+ fmov b5, c10
|
|
+ MUL a1, c14, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ MUL a2, c02, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c06, b5
|
|
+ fmov b5, t2
|
|
+ MUL a2, c10, b5
|
|
+ fmov b5, t3
|
|
+ MUL a2, c14, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ SUB c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ SUB c13, t4, b5
|
|
+ fmov b5, c13
|
|
+
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a3, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a3, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a3, c13, b5
|
|
+ fmov b5, c13
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, c13, b5
|
|
+ fmov b5, c13
|
|
+
|
|
+ MUL a2, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c05, b5
|
|
+ fmov b5, t2
|
|
+ MUL a2, c09, b5
|
|
+ fmov b5, t3
|
|
+ MUL a2, c13, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c02, t1, b5
|
|
+ fmov b5, c02
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ SUB c10, t3, b5
|
|
+ fmov b5, c10
|
|
+ SUB c14, t4, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a3, c05, b5
|
|
+ fmov b5, t2
|
|
+ MUL a3, c09, b5
|
|
+ fmov b5, t3
|
|
+ MUL a3, c13, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ SUB c07, t2, b5
|
|
+ fmov b5, c07
|
|
+ SUB c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ SUB c15, t4, b5
|
|
+ fmov b5, c15
|
|
+
|
|
+ MUL a4, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a4, c05, b5
|
|
+ fmov b5, t2
|
|
+ MUL a4, c09, b5
|
|
+ fmov b5, t3
|
|
+ MUL a4, c13, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c04, t1, b5
|
|
+ fmov b5, c04
|
|
+ SUB c08, t2, b5
|
|
+ fmov b5, c08
|
|
+ SUB c12, t3, b5
|
|
+ fmov b5, c12
|
|
+ SUB c16, t4, b5
|
|
+ fmov b5, c16
|
|
+
|
|
+ LD b1, 5 * SIZE(AO)
|
|
+ LD b2, 6 * SIZE(AO)
|
|
+ LD b3, 7 * SIZE(AO)
|
|
+
|
|
+ MUL b1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL b1, c06, b5
|
|
+ fmov b5, c06
|
|
+ MUL b1, c10, b5
|
|
+ fmov b5, c10
|
|
+ MUL b1, c14, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ MUL b2, c02, b5
|
|
+ fmov b5, t1
|
|
+ MUL b2, c06, b5
|
|
+ fmov b5, t2
|
|
+ MUL b2, c10, b5
|
|
+ fmov b5, t3
|
|
+ MUL b2, c14, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ SUB c07, t2, b5
|
|
+ fmov b5, c07
|
|
+ SUB c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ SUB c15, t4, b5
|
|
+ fmov b5, c15
|
|
+
|
|
+ MUL b3, c02, b5
|
|
+ fmov b5, t1
|
|
+ MUL b3, c06, b5
|
|
+ fmov b5, t2
|
|
+ MUL b3, c10, b5
|
|
+ fmov b5, t3
|
|
+ MUL b3, c14, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c04, t1, b5
|
|
+ fmov b5, c04
|
|
+ SUB c08, t2, b5
|
|
+ fmov b5, c08
|
|
+ SUB c12, t3, b5
|
|
+ fmov b5, c12
|
|
+ SUB c16, t4, b5
|
|
+ fmov b5, c16
|
|
+
|
|
+ LD a1, 10 * SIZE(AO)
|
|
+ LD a2, 11 * SIZE(AO)
|
|
+ LD a3, 15 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a1, c07, b5
|
|
+ fmov b5, c07
|
|
+ MUL a1, c11, b5
|
|
+ fmov b5, c11
|
|
+ MUL a1, c15, b5
|
|
+ fmov b5, c15
|
|
+
|
|
+ MUL a2, c03, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c07, b5
|
|
+ fmov b5, t2
|
|
+ MUL a2, c11, b5
|
|
+ fmov b5, t3
|
|
+ MUL a2, c15, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c04, t1, b5
|
|
+ fmov b5, c04
|
|
+ SUB c08, t2, b5
|
|
+ fmov b5, c08
|
|
+ SUB c12, t3, b5
|
|
+ fmov b5, c12
|
|
+ SUB c16, t4, b5
|
|
+ fmov b5, c16
|
|
+
|
|
+ MUL a3, c04, b5
|
|
+ fmov b5, c04
|
|
+ MUL a3, c08, b5
|
|
+ fmov b5, c08
|
|
+ MUL a3, c12, b5
|
|
+ fmov b5, c12
|
|
+ MUL a3, c16, b5
|
|
+ fmov b5, c16
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a1, c04, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ MUL a2, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c02, b5
|
|
+ fmov b5, t2
|
|
+ MUL a2, c03, b5
|
|
+ fmov b5, t3
|
|
+ MUL a2, c04, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ SUB c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ SUB c08, t4, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a3, c02, b5
|
|
+ fmov b5, t2
|
|
+ MUL a3, c03, b5
|
|
+ fmov b5, t3
|
|
+ MUL a3, c04, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ SUB c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ SUB c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ SUB c12, t4, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ MUL a4, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a4, c02, b5
|
|
+ fmov b5, t2
|
|
+ MUL a4, c03, b5
|
|
+ fmov b5, t3
|
|
+ MUL a4, c04, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c13, t1, b5
|
|
+ fmov b5, c13
|
|
+ SUB c14, t2, b5
|
|
+ fmov b5, c14
|
|
+ SUB c15, t3, b5
|
|
+ fmov b5, c15
|
|
+ SUB c16, t4, b5
|
|
+ fmov b5, c16
|
|
+
|
|
+ LD b1, 5 * SIZE(BO)
|
|
+ LD b2, 6 * SIZE(BO)
|
|
+ LD b3, 7 * SIZE(BO)
|
|
+
|
|
+ MUL b1, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL b1, c06, b5
|
|
+ fmov b5, c06
|
|
+ MUL b1, c07, b5
|
|
+ fmov b5, c07
|
|
+ MUL b1, c08, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ MUL b2, c05, b5
|
|
+ fmov b5, t1
|
|
+ MUL b2, c06, b5
|
|
+ fmov b5, t2
|
|
+ MUL b2, c07, b5
|
|
+ fmov b5, t3
|
|
+ MUL b2, c08, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ SUB c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ SUB c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ SUB c12, t4, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ MUL b3, c05, b5
|
|
+ fmov b5, t1
|
|
+ MUL b3, c06, b5
|
|
+ fmov b5, t2
|
|
+ MUL b3, c07, b5
|
|
+ fmov b5, t3
|
|
+ MUL b3, c08, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c13, t1, b5
|
|
+ fmov b5, c13
|
|
+ SUB c14, t2, b5
|
|
+ fmov b5, c14
|
|
+ SUB c15, t3, b5
|
|
+ fmov b5, c15
|
|
+ SUB c16, t4, b5
|
|
+ fmov b5, c16
|
|
+
|
|
+ LD a1, 10 * SIZE(BO)
|
|
+ LD a2, 11 * SIZE(BO)
|
|
+ LD a3, 15 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, c10, b5
|
|
+ fmov b5, c10
|
|
+ MUL a1, c11, b5
|
|
+ fmov b5, c11
|
|
+ MUL a1, c12, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ MUL a2, c09, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c10, b5
|
|
+ fmov b5, t2
|
|
+ MUL a2, c11, b5
|
|
+ fmov b5, t3
|
|
+ MUL a2, c12, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c13, t1, b5
|
|
+ fmov b5, c13
|
|
+ SUB c14, t2, b5
|
|
+ fmov b5, c14
|
|
+ SUB c15, t3, b5
|
|
+ fmov b5, c15
|
|
+ SUB c16, t4, b5
|
|
+ fmov b5, c16
|
|
+
|
|
+ MUL a3, c13, b5
|
|
+ fmov b5, c13
|
|
+ MUL a3, c14, b5
|
|
+ fmov b5, c14
|
|
+ MUL a3, c15, b5
|
|
+ fmov b5, c15
|
|
+ MUL a3, c16, b5
|
|
+ fmov b5, c16
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 15 * SIZE(BO)
|
|
+ LD a2, 14 * SIZE(BO)
|
|
+ LD a3, 13 * SIZE(BO)
|
|
+ LD a4, 12 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c13, b5
|
|
+ fmov b5, c13
|
|
+ MUL a1, c14, b5
|
|
+ fmov b5, c14
|
|
+ MUL a1, c15, b5
|
|
+ fmov b5, c15
|
|
+ MUL a1, c16, b5
|
|
+ fmov b5, c16
|
|
+
|
|
+ MUL a2, c13, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c14, b5
|
|
+ fmov b5, t2
|
|
+ MUL a2, c15, b5
|
|
+ fmov b5, t3
|
|
+ MUL a2, c16, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ SUB c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ SUB c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ SUB c12, t4, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ MUL a3, c13, b5
|
|
+ fmov b5, t1
|
|
+ MUL a3, c14, b5
|
|
+ fmov b5, t2
|
|
+ MUL a3, c15, b5
|
|
+ fmov b5, t3
|
|
+ MUL a3, c16, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ SUB c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ SUB c08, t4, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ MUL a4, c13, b5
|
|
+ fmov b5, t1
|
|
+ MUL a4, c14, b5
|
|
+ fmov b5, t2
|
|
+ MUL a4, c15, b5
|
|
+ fmov b5, t3
|
|
+ MUL a4, c16, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ SUB c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ SUB c04, t4, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ LD b1, 10 * SIZE(BO)
|
|
+ LD b2, 9 * SIZE(BO)
|
|
+ LD b3, 8 * SIZE(BO)
|
|
+
|
|
+ MUL b1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL b1, c10, b5
|
|
+ fmov b5, c10
|
|
+ MUL b1, c11, b5
|
|
+ fmov b5, c11
|
|
+ MUL b1, c12, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ MUL b2, c09, b5
|
|
+ fmov b5, t1
|
|
+ MUL b2, c10, b5
|
|
+ fmov b5, t2
|
|
+ MUL b2, c11, b5
|
|
+ fmov b5, t3
|
|
+ MUL b2, c12, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ SUB c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ SUB c08, t4, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ MUL b3, c09, b5
|
|
+ fmov b5, t1
|
|
+ MUL b3, c10, b5
|
|
+ fmov b5, t2
|
|
+ MUL b3, c11, b5
|
|
+ fmov b5, t3
|
|
+ MUL b3, c12, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ SUB c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ SUB c04, t4, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ LD a1, 5 * SIZE(BO)
|
|
+ LD a2, 4 * SIZE(BO)
|
|
+ LD a3, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, c06, b5
|
|
+ fmov b5, c06
|
|
+ MUL a1, c07, b5
|
|
+ fmov b5, c07
|
|
+ MUL a1, c08, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ MUL a2, c05, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c06, b5
|
|
+ fmov b5, t2
|
|
+ MUL a2, c07, b5
|
|
+ fmov b5, t3
|
|
+ MUL a2, c08, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ SUB c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ SUB c04, t4, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a3, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a3, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a3, c04, b5
|
|
+ fmov b5, c04
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c05, 1 * SIZE(BO)
|
|
+ ST c09, 2 * SIZE(BO)
|
|
+ ST c13, 3 * SIZE(BO)
|
|
+
|
|
+ ST c02, 4 * SIZE(BO)
|
|
+ ST c06, 5 * SIZE(BO)
|
|
+ ST c10, 6 * SIZE(BO)
|
|
+ ST c14, 7 * SIZE(BO)
|
|
+
|
|
+ ST c03, 8 * SIZE(BO)
|
|
+ ST c07, 9 * SIZE(BO)
|
|
+ ST c11, 10 * SIZE(BO)
|
|
+ ST c15, 11 * SIZE(BO)
|
|
+
|
|
+ ST c04, 12 * SIZE(BO)
|
|
+ ST c08, 13 * SIZE(BO)
|
|
+ ST c12, 14 * SIZE(BO)
|
|
+ ST c16, 15 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c03, 2 * SIZE(AO)
|
|
+ ST c04, 3 * SIZE(AO)
|
|
+
|
|
+ ST c05, 4 * SIZE(AO)
|
|
+ ST c06, 5 * SIZE(AO)
|
|
+ ST c07, 6 * SIZE(AO)
|
|
+ ST c08, 7 * SIZE(AO)
|
|
+
|
|
+ ST c09, 8 * SIZE(AO)
|
|
+ ST c10, 9 * SIZE(AO)
|
|
+ ST c11, 10 * SIZE(AO)
|
|
+ ST c12, 11 * SIZE(AO)
|
|
+
|
|
+ ST c13, 12 * SIZE(AO)
|
|
+ ST c14, 13 * SIZE(AO)
|
|
+ ST c15, 14 * SIZE(AO)
|
|
+ ST c16, 15 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -4 * SIZE(C1)
|
|
+ ldi C2, -4 * SIZE(C2)
|
|
+ ldi C3, -4 * SIZE(C3)
|
|
+ ldi C4, -4 * SIZE(C4)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c03, 2 * SIZE(C1)
|
|
+ ST c04, 3 * SIZE(C1)
|
|
+
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+ ST c06, 1 * SIZE(C2)
|
|
+ ST c07, 2 * SIZE(C2)
|
|
+ ST c08, 3 * SIZE(C2)
|
|
+
|
|
+ ST c09, 0 * SIZE(C3)
|
|
+ ST c10, 1 * SIZE(C3)
|
|
+ ST c11, 2 * SIZE(C3)
|
|
+ ST c12, 3 * SIZE(C3)
|
|
+
|
|
+ ST c13, 0 * SIZE(C4)
|
|
+ ST c14, 1 * SIZE(C4)
|
|
+ ST c15, 2 * SIZE(C4)
|
|
+ ST c16, 3 * SIZE(C4)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+ ldi C2, 4 * SIZE(C2)
|
|
+ ldi C3, 4 * SIZE(C3)
|
|
+ ldi C4, 4 * SIZE(C4)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 2 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ addl BO, TMP1, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ bgt I, $L11
|
|
+ .align 4
|
|
+
|
|
+$L39:
|
|
+#ifdef LN
|
|
+ sll K, 2 + BASE_SHIFT, TMP1
|
|
+ addl B, TMP1, B
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ mov BO, B
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ addl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ subl KK, 4, KK
|
|
+#endif
|
|
+ ldi J, -1(J)
|
|
+ bgt J, $L01
|
|
+ .align 4
|
|
+
|
|
+$L40:
|
|
+ and N, 2, J
|
|
+ ble J, $L80
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 1 + BASE_SHIFT, TMP1
|
|
+ subl B, TMP1, B
|
|
+
|
|
+ addl LDC, LDC, TMP1
|
|
+ subl C, TMP1, C
|
|
+#endif
|
|
+
|
|
+ mov C, C1
|
|
+ addl C, LDC, C2
|
|
+ fclr t1
|
|
+#ifndef RT
|
|
+ addl C2, LDC, C
|
|
+#endif
|
|
+ fclr t2
|
|
+
|
|
+#ifdef LN
|
|
+ addl M, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+ mov A, AORIG
|
|
+#else
|
|
+ mov A, AO
|
|
+#endif
|
|
+
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+ and M, 1, I
|
|
+ ble I, $L60
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c06
|
|
+
|
|
+ ldi L, -2(KK)
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+
|
|
+ ble KK, $L78
|
|
+
|
|
+ ble L, $L75
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 0, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c06
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ble TMP1, $L78
|
|
+
|
|
+ ble L, $L75
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L72:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ldi L, -2(L)
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t2
|
|
+ LD a1, 1 * SIZE(AO)
|
|
+ LD b2, 3 * SIZE(BO)
|
|
+
|
|
+ ADD c02, t3, b5
|
|
+ fmov b5, c02
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a2, b3, b5
|
|
+ fmov b5, t3
|
|
+ LD b3, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ MUL a2, b4, b5
|
|
+ fmov b5, t4
|
|
+ LD a2, 0 * SIZE(AO)
|
|
+ LD b4, 5 * SIZE(BO)
|
|
+
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ unop
|
|
+ unop
|
|
+ bgt L, $L72
|
|
+ .align 4
|
|
+
|
|
+$L75:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L77
|
|
+#else
|
|
+ blbs TMP1, $L77
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t2
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L77:
|
|
+ ADD c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t2
|
|
+ ADD c02, t3, b5
|
|
+ fmov b5, c02
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ ADD c01, c02, b5
|
|
+ fmov b5, c01
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ ADD c05, c06, b5
|
|
+ fmov b5, c05
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD c05, t2, b5
|
|
+ fmov b5, c05
|
|
+
|
|
+ .align 4
|
|
+
|
|
+$L78:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 1, TMP1
|
|
+#else
|
|
+ subl KK, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -1 * SIZE(AO)
|
|
+ ldi BO, -2 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c05, b5
|
|
+ fmov b5, c05
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c05, b5
|
|
+ fmov b5, c05
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c05, b5
|
|
+ fmov b5, c05
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a2, c01, b5
|
|
+ fmov b5, t1
|
|
+ SUB c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ MUL a3, c05, b5
|
|
+ fmov b5, c05
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 3 * SIZE(BO)
|
|
+ LD a2, 2 * SIZE(BO)
|
|
+ LD a3, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a2, c05, b5
|
|
+ fmov b5, t1
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, c01
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c05, 1 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c05, 1 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -1 * SIZE(C1)
|
|
+ ldi C2, -1 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 0 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L60:
|
|
+ and M, 2, I
|
|
+ ble I, $L70
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ ldi L, -2(KK)
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+
|
|
+ ble KK, $L68
|
|
+
|
|
+ ble L, $L65
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 1, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ ldi L, -2(TMP1)
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ble TMP1, $L68
|
|
+
|
|
+ ble L, $L65
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L62:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ unop
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL a2, b1, b5
|
|
+ fmov b5, t2
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ ldi L, -2(L)
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t3
|
|
+ LD a1, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL a2, b2, b5
|
|
+ fmov b5, t4
|
|
+ LD a2, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ LD b2, 3 * SIZE(BO)
|
|
+ MUL a3, b3, b5
|
|
+ fmov b5, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a4, b3, b5
|
|
+ fmov b5, t2
|
|
+ LD b3, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL a3, b4, b5
|
|
+ fmov b5, t3
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ MUL a4, b4, b5
|
|
+ fmov b5, t4
|
|
+ LD b4, 1 * SIZE(BO)
|
|
+ unop
|
|
+
|
|
+ LD a4, 1 * SIZE(AO)
|
|
+ unop
|
|
+ unop
|
|
+ bgt L, $L62
|
|
+ .align 4
|
|
+
|
|
+$L65:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L67
|
|
+#else
|
|
+ blbs TMP1, $L67
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b1, b5
|
|
+ fmov b5, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t3
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL a2, b2, b5
|
|
+ fmov b5, t4
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ LD b2, -1 * SIZE(BO)
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ .align 4
|
|
+
|
|
+$L67:
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, b1, b5
|
|
+ fmov b5, t2
|
|
+ ADD c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t3
|
|
+
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a2, b2, b5
|
|
+ fmov b5, t4
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ .align 4
|
|
+
|
|
+$L68:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 2, TMP1
|
|
+#else
|
|
+ subl KK, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -2 * SIZE(AO)
|
|
+ ldi BO, -2 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c05, b5
|
|
+ fmov b5, c05
|
|
+ SUB a3, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a4, c06, b5
|
|
+ fmov b5, c06
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a3, c05, b5
|
|
+ fmov b5, c05
|
|
+ SUB a4, c06, b5
|
|
+ fmov b5, c06
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 3 * SIZE(AO)
|
|
+ LD a2, 2 * SIZE(AO)
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a1, c06, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL a2, c02, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c06, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c05, t2, b5
|
|
+ fmov b5, c05
|
|
+
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a3, c05, b5
|
|
+ fmov b5, c05
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c05, b5
|
|
+ fmov b5, c05
|
|
+
|
|
+ MUL a2, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c05, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c02, t1, b5
|
|
+ fmov b5, c02
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL a3, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a3, c06, b5
|
|
+ fmov b5, c06
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ MUL a2, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c02, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL a3, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a3, c06, b5
|
|
+ fmov b5, c06
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 3 * SIZE(BO)
|
|
+ LD a2, 2 * SIZE(BO)
|
|
+ LD a3, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, c06, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL a2, c05, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c06, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c02, t2, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a3, c02, b5
|
|
+ fmov b5, c02
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c05, 1 * SIZE(BO)
|
|
+ ST c02, 2 * SIZE(BO)
|
|
+ ST c06, 3 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c05, 2 * SIZE(AO)
|
|
+ ST c06, 3 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -2 * SIZE(C1)
|
|
+ ldi C2, -2 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+ ST c06, 1 * SIZE(C2)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 2 * SIZE(C1)
|
|
+ ldi C2, 2 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 1 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L70:
|
|
+ sra M, 2, I
|
|
+ ble I, $L79
|
|
+ .align 4
|
|
+
|
|
+$L51:
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c03
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c07
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c04
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c08
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c05
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c06
|
|
+
|
|
+ ldi L, -2(KK)
|
|
+
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ble KK, $L58
|
|
+
|
|
+ ble L, $L55
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 2, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 2, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c03
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c07
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c04
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c08
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c05
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c06
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ble TMP1, $L58
|
|
+
|
|
+ ble L, $L55
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L52:
|
|
+ ADD c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ unop
|
|
+
|
|
+ ADD c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ ldi L, -2(L)
|
|
+ MUL a2, b1, b5
|
|
+ fmov b5, t2
|
|
+ unop
|
|
+
|
|
+ ADD c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ unop
|
|
+ MUL a3, b1, b5
|
|
+ fmov b5, t3
|
|
+ unop
|
|
+
|
|
+ ADD c08, t4, b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL a4, b1, b5
|
|
+ fmov b5, t4
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ MUL a2, b2, b5
|
|
+ fmov b5, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL a3, b2, b5
|
|
+ fmov b5, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ unop
|
|
+ MUL a4, b2, b5
|
|
+ fmov b5, t4
|
|
+ LD a5, 3 * SIZE(AO)
|
|
+
|
|
+ ADD c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL a1, b3, b5
|
|
+ fmov b5, t1
|
|
+ LD b2, -1 * SIZE(BO)
|
|
+
|
|
+ ADD c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL a2, b3, b5
|
|
+ fmov b5, t2
|
|
+ unop
|
|
+
|
|
+ ADD c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ unop
|
|
+ MUL a3, b3, b5
|
|
+ fmov b5, t3
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+
|
|
+ ADD c08, t4, b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL a5, b3, b5
|
|
+ fmov b5, t4
|
|
+ LD b3, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b4, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, -4 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b4, b5
|
|
+ fmov b5, t2
|
|
+ LD a2, -3 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+ MUL a3, b4, b5
|
|
+ fmov b5, t3
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a5, b4, b5
|
|
+ fmov b5, t4
|
|
+ LD b4, 1 * SIZE(BO)
|
|
+ bgt L, $L52
|
|
+ .align 4
|
|
+
|
|
+$L55:
|
|
+ ADD c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L57
|
|
+#else
|
|
+ blbs TMP1, $L57
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ MUL a2, b1, b5
|
|
+ fmov b5, t2
|
|
+ ADD c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ MUL a3, b1, b5
|
|
+ fmov b5, t3
|
|
+
|
|
+ ADD c08, t4, b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL a4, b1, b5
|
|
+ fmov b5, t4
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b2, b5
|
|
+ fmov b5, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL a3, b2, b5
|
|
+ fmov b5, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a4, b2, b5
|
|
+ fmov b5, t4
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L57:
|
|
+ ADD c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ MUL a2, b1, b5
|
|
+ fmov b5, t2
|
|
+ ADD c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ MUL a3, b1, b5
|
|
+ fmov b5, t3
|
|
+
|
|
+ ADD c08, t4, b5
|
|
+ fmov b5, c08
|
|
+ MUL a4, b1, b5
|
|
+ fmov b5, t4
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t1
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, b2, b5
|
|
+ fmov b5, t2
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ MUL a3, b2, b5
|
|
+ fmov b5, t3
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL a4, b2, b5
|
|
+ fmov b5, t4
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ ADD c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ ADD c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ ADD c08, t4, b5
|
|
+ fmov b5, c08
|
|
+ .align 4
|
|
+
|
|
+$L58:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 4, TMP1
|
|
+#else
|
|
+ subl KK, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -4 * SIZE(AO)
|
|
+ ldi BO, -2 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+ LD b4, 7 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c05, b5
|
|
+ fmov b5, c05
|
|
+ SUB a3, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a4, c06, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ SUB b1, c03, b5
|
|
+ fmov b5, c03
|
|
+ SUB b2, c07, b5
|
|
+ fmov b5, c07
|
|
+ SUB b3, c04, b5
|
|
+ fmov b5, c04
|
|
+ SUB b4, c08, b5
|
|
+ fmov b5, c08
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ LD b1, 4 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(AO)
|
|
+ LD b4, 7 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a3, c03, b5
|
|
+ fmov b5, c03
|
|
+ SUB a4, c04, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ SUB b1, c05, b5
|
|
+ fmov b5, c05
|
|
+ SUB b2, c06, b5
|
|
+ fmov b5, c06
|
|
+ SUB b3, c07, b5
|
|
+ fmov b5, c07
|
|
+ SUB b4, c08, b5
|
|
+ fmov b5, c08
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 15 * SIZE(AO)
|
|
+ LD a2, 14 * SIZE(AO)
|
|
+ LD a3, 13 * SIZE(AO)
|
|
+ LD a4, 12 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c04, b5
|
|
+ fmov b5, c04
|
|
+ MUL a1, c08, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ MUL a2, c04, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c08, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ SUB c07, t2, b5
|
|
+ fmov b5, c07
|
|
+
|
|
+ MUL a3, c04, b5
|
|
+ fmov b5, t1
|
|
+ MUL a3, c08, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c02, t1, b5
|
|
+ fmov b5, c02
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL a4, c04, b5
|
|
+ fmov b5, t1
|
|
+ MUL a4, c08, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c05, t2, b5
|
|
+ fmov b5, c05
|
|
+
|
|
+ LD b1, 10 * SIZE(AO)
|
|
+ LD b2, 9 * SIZE(AO)
|
|
+ LD b3, 8 * SIZE(AO)
|
|
+
|
|
+ MUL b1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL b1, c07, b5
|
|
+ fmov b5, c07
|
|
+
|
|
+ MUL b2, c03, b5
|
|
+ fmov b5, t1
|
|
+ MUL b2, c07, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c02, t1, b5
|
|
+ fmov b5, c02
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL b3, c03, b5
|
|
+ fmov b5, t1
|
|
+ MUL b3, c07, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c05, t2, b5
|
|
+ fmov b5, c05
|
|
+
|
|
+ LD a1, 5 * SIZE(AO)
|
|
+ LD a2, 4 * SIZE(AO)
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a1, c06, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL a2, c02, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c06, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c05, t2, b5
|
|
+ fmov b5, c05
|
|
+
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a3, c05, b5
|
|
+ fmov b5, c05
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c05, b5
|
|
+ fmov b5, c05
|
|
+
|
|
+ MUL a2, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c05, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c02, t1, b5
|
|
+ fmov b5, c02
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a3, c05, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ SUB c07, t2, b5
|
|
+ fmov b5, c07
|
|
+
|
|
+ MUL a4, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a4, c05, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c04, t1, b5
|
|
+ fmov b5, c04
|
|
+ SUB c08, t2, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ LD b1, 5 * SIZE(AO)
|
|
+ LD b2, 6 * SIZE(AO)
|
|
+ LD b3, 7 * SIZE(AO)
|
|
+
|
|
+ MUL b1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL b1, c06, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL b2, c02, b5
|
|
+ fmov b5, t1
|
|
+ MUL b2, c06, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ SUB c07, t2, b5
|
|
+ fmov b5, c07
|
|
+
|
|
+ MUL b3, c02, b5
|
|
+ fmov b5, t1
|
|
+ MUL b3, c06, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c04, t1, b5
|
|
+ fmov b5, c04
|
|
+ SUB c08, t2, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ LD a1, 10 * SIZE(AO)
|
|
+ LD a2, 11 * SIZE(AO)
|
|
+ LD a3, 15 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a1, c07, b5
|
|
+ fmov b5, c07
|
|
+
|
|
+ MUL a2, c03, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c07, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c04, t1, b5
|
|
+ fmov b5, c04
|
|
+ SUB c08, t2, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ MUL a3, c04, b5
|
|
+ fmov b5, c04
|
|
+ MUL a3, c08, b5
|
|
+ fmov b5, c08
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a1, c04, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ MUL a2, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c02, b5
|
|
+ fmov b5, t2
|
|
+ MUL a2, c03, b5
|
|
+ fmov b5, t3
|
|
+ MUL a2, c04, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ SUB c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ SUB c08, t4, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ MUL a3, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a3, c06, b5
|
|
+ fmov b5, c06
|
|
+ MUL a3, c07, b5
|
|
+ fmov b5, c07
|
|
+ MUL a3, c08, b5
|
|
+ fmov b5, c08
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 3 * SIZE(BO)
|
|
+ LD a2, 2 * SIZE(BO)
|
|
+ LD a3, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, c06, b5
|
|
+ fmov b5, c06
|
|
+ MUL a1, c07, b5
|
|
+ fmov b5, c07
|
|
+ MUL a1, c08, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ MUL a2, c05, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c06, b5
|
|
+ fmov b5, t2
|
|
+ MUL a2, c07, b5
|
|
+ fmov b5, t3
|
|
+ MUL a2, c08, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ SUB c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ SUB c04, t4, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a3, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a3, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a3, c04, b5
|
|
+ fmov b5, c04
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c05, 1 * SIZE(BO)
|
|
+ ST c02, 2 * SIZE(BO)
|
|
+ ST c06, 3 * SIZE(BO)
|
|
+
|
|
+ ST c03, 4 * SIZE(BO)
|
|
+ ST c07, 5 * SIZE(BO)
|
|
+ ST c04, 6 * SIZE(BO)
|
|
+ ST c08, 7 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c03, 2 * SIZE(AO)
|
|
+ ST c04, 3 * SIZE(AO)
|
|
+
|
|
+ ST c05, 4 * SIZE(AO)
|
|
+ ST c06, 5 * SIZE(AO)
|
|
+ ST c07, 6 * SIZE(AO)
|
|
+ ST c08, 7 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -4 * SIZE(C1)
|
|
+ ldi C2, -4 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c03, 2 * SIZE(C1)
|
|
+ ST c04, 3 * SIZE(C1)
|
|
+
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+ ST c06, 1 * SIZE(C2)
|
|
+ ST c07, 2 * SIZE(C2)
|
|
+ ST c08, 3 * SIZE(C2)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+ ldi C2, 4 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 2 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ bgt I, $L51
|
|
+ .align 4
|
|
+
|
|
+$L79:
|
|
+#ifdef LN
|
|
+ sll K, 1 + BASE_SHIFT, TMP1
|
|
+ addl B, TMP1, B
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ mov BO, B
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L80:
|
|
+ and N, 1, J
|
|
+ ble J, $L999
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, BASE_SHIFT, TMP1
|
|
+ subl B, TMP1, B
|
|
+
|
|
+ subl C, LDC, C
|
|
+#endif
|
|
+
|
|
+ mov C, C1
|
|
+#ifndef RT
|
|
+ addl C, LDC, C
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ addl M, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+ mov A, AORIG
|
|
+#else
|
|
+ mov A, AO
|
|
+#endif
|
|
+
|
|
+ and M, 1, I
|
|
+ ble I, $L100
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c04
|
|
+
|
|
+ sra KK, 2, L
|
|
+ mov B, BO
|
|
+ unop
|
|
+ ble L, $L115
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 0, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c04
|
|
+
|
|
+ sra TMP1, 2, L
|
|
+ unop
|
|
+ ble L, $L115
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L112:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, b2, b5
|
|
+ fmov b5, t2
|
|
+ LD a2, 5 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ MUL a3, b3, b5
|
|
+ fmov b5, t3
|
|
+ LD a3, 6 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a4, b4, b5
|
|
+ fmov b5, t4
|
|
+ LD a4, 7 * SIZE(AO)
|
|
+ LD b4, 7 * SIZE(BO)
|
|
+
|
|
+ ldi L, -1(L)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ bgt L, $L112
|
|
+ .align 4
|
|
+
|
|
+$L115:
|
|
+#if defined(LT) || defined(RN)
|
|
+ and KK, 3, L
|
|
+#else
|
|
+ and TMP1, 3, L
|
|
+#endif
|
|
+ ble L, $L118
|
|
+ .align 4
|
|
+
|
|
+$L116:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 1 * SIZE(AO)
|
|
+ LD b1, 1 * SIZE(BO)
|
|
+
|
|
+ ldi L, -1(L)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ ldi BO, 1 * SIZE(BO)
|
|
+ bgt L, $L116
|
|
+ .align 4
|
|
+
|
|
+$L118:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ ADD c01, c02, b5
|
|
+ fmov b5, c01
|
|
+ ADD c03, c04, b5
|
|
+ fmov b5, c03
|
|
+ ADD c01, c03, b5
|
|
+ fmov b5, c01
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+ subl KK, 1, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ addl B, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+#endif
|
|
+
|
|
+#if defined(RN) || defined(RT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -1 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 1 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ SXADDQ K, AORIG, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L100:
|
|
+ and M, 2, I
|
|
+ ble I, $L110
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c04
|
|
+
|
|
+ sra KK, 2, L
|
|
+ mov B, BO
|
|
+ ble L, $L105
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 1, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c04
|
|
+
|
|
+ sra TMP1, 2, L
|
|
+ ble L, $L105
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L102:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ldi L, -1(L)
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, b1, b5
|
|
+ fmov b5, t2
|
|
+ LD a2, 5 * SIZE(AO)
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ MUL a3, b2, b5
|
|
+ fmov b5, t3
|
|
+ LD a3, 6 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a4, b2, b5
|
|
+ fmov b5, t4
|
|
+ LD a5, 7 * SIZE(AO)
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b3, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 8 * SIZE(AO)
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, b3, b5
|
|
+ fmov b5, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ MUL a3, b4, b5
|
|
+ fmov b5, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a5, b4, b5
|
|
+ fmov b5, t4
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ bgt L, $L102
|
|
+ .align 4
|
|
+
|
|
+$L105:
|
|
+#if defined(LT) || defined(RN)
|
|
+ and KK, 3, L
|
|
+#else
|
|
+ and TMP1, 3, L
|
|
+#endif
|
|
+ ble L, $L108
|
|
+ .align 4
|
|
+
|
|
+$L106:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ldi L, -1(L)
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, b1, b5
|
|
+ fmov b5, t2
|
|
+ LD a2, 3 * SIZE(AO)
|
|
+ LD b1, 1 * SIZE(BO)
|
|
+
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ unop
|
|
+ ldi BO, 1 * SIZE(BO)
|
|
+ bgt L, $L106
|
|
+ .align 4
|
|
+
|
|
+$L108:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ ADD c01, c03, b5
|
|
+ fmov b5, c01
|
|
+ ADD c02, c04, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 2, TMP1
|
|
+#else
|
|
+ subl KK, 1, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 3 * SIZE(AO)
|
|
+ LD a2, 2 * SIZE(AO)
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, c02, b5
|
|
+ fmov b5, t1
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, c01
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a2, c01, b5
|
|
+ fmov b5, t1
|
|
+ SUB c02, t1, b5
|
|
+ fmov b5, c02
|
|
+ MUL a3, c02, b5
|
|
+ fmov b5, c02
|
|
+#endif
|
|
+
|
|
+#if defined(RN) || defined(RT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c02, 1 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -2 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 2 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 1 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L110:
|
|
+ sra M, 2, I
|
|
+ ble I, $L119
|
|
+ .align 4
|
|
+
|
|
+$L91:
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c04
|
|
+
|
|
+ sra KK, 2, L
|
|
+ mov B, BO
|
|
+ ble L, $L95
|
|
+
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 2, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 2, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c04
|
|
+
|
|
+ sra TMP1, 2, L
|
|
+ unop
|
|
+ ble L, $L95
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L92:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ldi L, -1(L)
|
|
+ MUL a2, b1, b5
|
|
+ fmov b5, t2
|
|
+ LD a2, 5 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL a3, b1, b5
|
|
+ fmov b5, t3
|
|
+ LD a3, 6 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a4, b1, b5
|
|
+ fmov b5, t4
|
|
+ LD a4, 7 * SIZE(AO)
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 8 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b2, b5
|
|
+ fmov b5, t2
|
|
+ LD a2, 9 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL a3, b2, b5
|
|
+ fmov b5, t3
|
|
+ LD a3, 10 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a4, b2, b5
|
|
+ fmov b5, t4
|
|
+ LD a4, 11 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b3, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 12 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b3, b5
|
|
+ fmov b5, t2
|
|
+ LD a2, 13 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL a3, b3, b5
|
|
+ fmov b5, t3
|
|
+ LD a3, 14 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a4, b3, b5
|
|
+ fmov b5, t4
|
|
+ LD a5, 15 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b4, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 16 * SIZE(AO)
|
|
+ ldi AO, 16 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ MUL a2, b4, b5
|
|
+ fmov b5, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ MUL a3, b4, b5
|
|
+ fmov b5, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a5, b4, b5
|
|
+ fmov b5, t4
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ bgt L, $L92
|
|
+ .align 4
|
|
+
|
|
+$L95:
|
|
+#if defined(LT) || defined(RN)
|
|
+ and KK, 3, L
|
|
+#else
|
|
+ and TMP1, 3, L
|
|
+#endif
|
|
+ unop
|
|
+ ble L, $L98
|
|
+ .align 4
|
|
+
|
|
+$L96:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ldi L, -1(L)
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ldi BO, 1 * SIZE(BO)
|
|
+ MUL a2, b1, b5
|
|
+ fmov b5, t2
|
|
+ LD a2, 5 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL a3, b1, b5
|
|
+ fmov b5, t3
|
|
+ LD a3, 6 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a4, b1, b5
|
|
+ fmov b5, t4
|
|
+ LD a4, 7 * SIZE(AO)
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ bgt L, $L96
|
|
+ .align 4
|
|
+
|
|
+$L98:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 4, TMP1
|
|
+#else
|
|
+ subl KK, 1, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a3, c03, b5
|
|
+ fmov b5, c03
|
|
+ SUB a4, c04, b5
|
|
+ fmov b5, c04
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a3, c03, b5
|
|
+ fmov b5, c03
|
|
+ SUB a4, c04, b5
|
|
+ fmov b5, c04
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 15 * SIZE(AO)
|
|
+ LD a2, 14 * SIZE(AO)
|
|
+ LD a3, 13 * SIZE(AO)
|
|
+ LD a4, 12 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c04, b5
|
|
+ fmov b5, c04
|
|
+ MUL a2, c04, b5
|
|
+ fmov b5, t1
|
|
+ SUB c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ MUL a3, c04, b5
|
|
+ fmov b5, t1
|
|
+ SUB c02, t1, b5
|
|
+ fmov b5, c02
|
|
+ MUL a4, c04, b5
|
|
+ fmov b5, t1
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+
|
|
+ LD b1, 10 * SIZE(AO)
|
|
+ LD b2, 9 * SIZE(AO)
|
|
+ LD b3, 8 * SIZE(AO)
|
|
+
|
|
+ MUL b1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL b2, c03, b5
|
|
+ fmov b5, t1
|
|
+ SUB c02, t1, b5
|
|
+ fmov b5, c02
|
|
+ MUL b3, c03, b5
|
|
+ fmov b5, t1
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+
|
|
+ LD a1, 5 * SIZE(AO)
|
|
+ LD a2, 4 * SIZE(AO)
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, c02, b5
|
|
+ fmov b5, t1
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, c01
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a2, c01, b5
|
|
+ fmov b5, t1
|
|
+ SUB c02, t1, b5
|
|
+ fmov b5, c02
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, t1
|
|
+ SUB c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ MUL a4, c01, b5
|
|
+ fmov b5, t1
|
|
+ SUB c04, t1, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ LD b1, 5 * SIZE(AO)
|
|
+ LD b2, 6 * SIZE(AO)
|
|
+ LD b3, 7 * SIZE(AO)
|
|
+
|
|
+ MUL b1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL b2, c02, b5
|
|
+ fmov b5, t1
|
|
+ SUB c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ MUL b3, c02, b5
|
|
+ fmov b5, t1
|
|
+ SUB c04, t1, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ LD a1, 10 * SIZE(AO)
|
|
+ LD a2, 11 * SIZE(AO)
|
|
+ LD a3, 15 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a2, c03, b5
|
|
+ fmov b5, t1
|
|
+ SUB c04, t1, b5
|
|
+ fmov b5, c04
|
|
+ MUL a3, c04, b5
|
|
+ fmov b5, c04
|
|
+#endif
|
|
+
|
|
+#if defined(RN) || defined(RT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a1, c04, b5
|
|
+ fmov b5, c04
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c02, 1 * SIZE(BO)
|
|
+ ST c03, 2 * SIZE(BO)
|
|
+ ST c04, 3 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c03, 2 * SIZE(AO)
|
|
+ ST c04, 3 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -4 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c03, 2 * SIZE(C1)
|
|
+ ST c04, 3 * SIZE(C1)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 2 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L91
|
|
+ .align 4
|
|
+
|
|
+$L119:
|
|
+#ifdef LN
|
|
+ SXADDQ K, B, B
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ mov BO, B
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L999:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+ ldl tmp, 64($sp)
+ clr $0
+ ldi $sp, STACKSIZE($sp)
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/trsm_kernel_4x4_LN.S.bak b/kernel/sw_64/trsm_kernel_4x4_LN.S.bak
new file mode 100644
index 0000000..8405570
--- /dev/null
+++ b/kernel/sw_64/trsm_kernel_4x4_LN.S.bak
@@ -0,0 +1,4073 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#if !defined(SW2B)
+#error "Architecture is not specified."
+#endif
+
+#ifdef SW2B
+#define PREFETCHSIZE 56
+#define UNOP nop
+#endif
|
|
+
|
|
+#ifdef EV6
|
|
+#define PREFETCHSIZE 56
|
|
+#define UNOP unop
|
|
+#endif
|
|
+
|
|
+#ifdef EV5
|
|
+#define PREFETCHSIZE 56
|
|
+#define UNOP
|
|
+#endif
|
|
+
|
|
+#ifdef EV4
|
|
+#define UNOP
|
|
+#endif
|
|
+
|
|
+#define STACKSIZE 80
|
|
+
|
|
+#define M $16
|
|
+#define N $17
|
|
+#define K $18
|
|
+#define A $20
|
|
+#define B $21
|
|
+#define C $22
|
|
+#define LDC $23
|
|
+
|
|
+#define C1 $19
|
|
+#define C2 $24
|
|
+#define C3 $25
|
|
+#define C4 $27
|
|
+
|
|
+#define AO $at
|
|
+#define BO $5
|
|
+#define I $6
|
|
+#define J $7
|
|
+#define L $8
|
|
+
|
|
+#define a1 $f16
|
|
+#define a2 $f17
|
|
+#define a3 $f18
|
|
+#define a4 $f19
|
|
+
|
|
+#define b1 $f20
|
|
+#define b2 $f21
|
|
+#define b3 $f22
|
|
+#define b4 $f23
|
|
+
|
|
+#define t1 $f24
|
|
+#define t2 $f25
|
|
+#define t3 $f26
|
|
+#define t4 $f27
|
|
+
|
|
+#define a5 $f28
|
|
+#define a6 $f30
|
|
+#define b5 $f29
|
|
+
|
|
+#define alpha $f30
|
|
+
|
|
+#define c01 $f0
|
|
+#define c02 $f1
|
|
+#define c03 $f2
|
|
+#define c04 $f3
|
|
+
|
|
+#define c05 $f4
|
|
+#define c06 $f5
|
|
+#define c07 $f6
|
|
+#define c08 $f7
|
|
+
|
|
+#define c09 $f8
|
|
+#define c10 $f9
|
|
+#define c11 $f10
|
|
+#define c12 $f11
|
|
+
|
|
+#define c13 $f12
|
|
+#define c14 $f13
|
|
+#define c15 $f14
|
|
+#define c16 $f15
|
|
+
|
|
+#define TMP1 $0
|
|
+#define TMP2 $1
|
|
+#define KK $2
|
|
+#define AORIG $3
|
|
+#define OFFSET $4
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+ .frame $sp, STACKSIZE, $26, 0
|
|
+
|
|
+ ldi $sp, -STACKSIZE($sp)
|
|
+
|
|
+ ldl C, 0 + STACKSIZE($sp)
|
|
+ ldl LDC, 8 + STACKSIZE($sp)
|
|
+ ldl OFFSET, 16 + STACKSIZE($sp)
|
|
+
|
|
+ SXADDQ LDC, 0, LDC
|
|
+
|
|
+ fstd $f2, 0($sp)
|
|
+ fstd $f3, 8($sp)
|
|
+ fstd $f4, 16($sp)
|
|
+ fstd $f5, 24($sp)
|
|
+ fstd $f6, 32($sp)
|
|
+ fstd $f7, 40($sp)
|
|
+ fstd $f8, 48($sp)
|
|
+ fstd $f9, 56($sp)
|
|
+
|
|
+ cmple M, 0, $0
|
|
+ cmple N, 0, $1
|
|
+ cmple K, 0, $2
|
|
+
|
|
+ or $0, $1, $0
|
|
+ or $0, $2, $0
|
|
+ bne $0, $L999
|
|
+
|
|
+#ifdef LN
|
|
+ mull M, K, TMP1
|
|
+ SXADDQ TMP1, A, A
|
|
+ SXADDQ M, C, C
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ negq OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ mull N, K, TMP1
|
|
+ SXADDQ TMP1, B, B
|
|
+
|
|
+ mull N, LDC, TMP1
|
|
+ addl TMP1, C, C
|
|
+
|
|
+ subl N, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+ sra N, 2, J
|
|
+ ble J, $L40
|
|
+ .align 4
|
|
+
|
|
+$L01:
|
|
+#ifdef RT
|
|
+ sll K, 2 + BASE_SHIFT, TMP1
|
|
+ subl B, TMP1, B
|
|
+
|
|
+ s4addl LDC, 0, TMP1
|
|
+ subl C, TMP1, C
|
|
+#endif
|
|
+
|
|
+ mov C, C1
|
|
+ addl C, LDC, C2
|
|
+ addl C2, LDC, C3
|
|
+#ifndef RT
|
|
+ s4addl LDC, C, C
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ addl C3, LDC, C4
|
|
+ fclr t2
|
|
+
|
|
+#ifdef LN
|
|
+ addl M, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+ mov A, AORIG
|
|
+#else
|
|
+ mov A, AO
|
|
+#endif
|
|
+
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+ and M, 1, I
|
|
+ ble I, $L20
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ ldi L, -2(KK)
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c09
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c13
|
|
+
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+ ble KK, $L38
|
|
+
|
|
+ ble L, $L35
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 0, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 2, TMP2
|
|
+ addl B, TMP2, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ ldi L, -2(TMP1)
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c09
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c13
|
|
+
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ ble TMP1, $L38
|
|
+
|
|
+ ble L, $L35
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L32:
|
|
+ ADD c01, t1, c01
|
|
+ ldi L, -2(L)
|
|
+ MUL a1, b1, t1
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t2, c05
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a1, b2, t2
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t3, c09
|
|
+ LD b5, 3 * SIZE(BO)
|
|
+ MUL a1, b3, t3
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t4, c13
|
|
+ MUL a1, b4, t4
|
|
+ LD a1, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ MUL a2, b1, t1
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t2, c05
|
|
+ MUL a2, b2, t2
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t3, c09
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+ MUL a2, b3, t3
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t4, c13
|
|
+ MUL a2, b5, t4
|
|
+ LD a2, 0 * SIZE(AO)
|
|
+ bgt L, $L32
|
|
+ .align 4
|
|
+
|
|
+$L35:
|
|
+ ADD c01, t1, c01
|
|
+ MUL a1, b1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L37
|
|
+#else
|
|
+ blbs TMP1, $L37
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c05, t2, c05
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ MUL a1, b2, t2
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t3, c09
|
|
+ MUL a1, b3, t3
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t4, c13
|
|
+ MUL a1, b4, t4
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L37:
|
|
+ ADD c05, t2, c05
|
|
+ MUL a1, b2, t2
|
|
+ ADD c09, t3, c09
|
|
+ MUL a1, b3, t3
|
|
+
|
|
+ ADD c13, t4, c13
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ MUL a1, b4, t4
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ ADD c05, t2, c05
|
|
+ ADD c09, t3, c09
|
|
+ ADD c13, t4, c13
|
|
+
|
|
+$L38:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 1, TMP1
|
|
+#else
|
|
+ subl KK, 4, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -1 * SIZE(AO)
|
|
+ ldi BO, -4 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c05, c05
|
|
+ SUB a3, c09, c09
|
|
+ SUB a4, c13, c13
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c05, c05
|
|
+ SUB a3, c09, c09
|
|
+ SUB a4, c13, c13
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c05, c05
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c13, c13
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a2, c01, t1
|
|
+ SUB c05, t1, c05
|
|
+ MUL a3, c01, t1
|
|
+ SUB c09, t1, c09
|
|
+ MUL a4, c01, t1
|
|
+ SUB c13, t1, c13
|
|
+
|
|
+ LD b1, 5 * SIZE(BO)
|
|
+ LD b2, 6 * SIZE(BO)
|
|
+ LD b3, 7 * SIZE(BO)
|
|
+
|
|
+ MUL b1, c05, c05
|
|
+ MUL b2, c05, t1
|
|
+ SUB c09, t1, c09
|
|
+ MUL b3, c05, t1
|
|
+ SUB c13, t1, c13
|
|
+
|
|
+ LD a1, 10 * SIZE(BO)
|
|
+ LD a2, 11 * SIZE(BO)
|
|
+ LD a3, 15 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c09, c09
|
|
+ MUL a2, c09, t1
|
|
+ SUB c13, t1, c13
|
|
+ MUL a3, c13, c13
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 15 * SIZE(BO)
|
|
+ LD a2, 14 * SIZE(BO)
|
|
+ LD a3, 13 * SIZE(BO)
|
|
+ LD a4, 12 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c13, c13
|
|
+ MUL a2, c13, t1
|
|
+ SUB c09, t1, c09
|
|
+ MUL a3, c13, t1
|
|
+ SUB c05, t1, c05
|
|
+ MUL a4, c13, t1
|
|
+ SUB c01, t1, c01
|
|
+
|
|
+ LD b1, 10 * SIZE(BO)
|
|
+ LD b2, 9 * SIZE(BO)
|
|
+ LD b3, 8 * SIZE(BO)
|
|
+
|
|
+ MUL b1, c09, c09
|
|
+ MUL b2, c09, t1
|
|
+ SUB c05, t1, c05
|
|
+ MUL b3, c09, t1
|
|
+ SUB c01, t1, c01
|
|
+
|
|
+ LD a1, 5 * SIZE(BO)
|
|
+ LD a2, 4 * SIZE(BO)
|
|
+ LD a3, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c05, c05
|
|
+ MUL a2, c05, t1
|
|
+ SUB c01, t1, c01
|
|
+ MUL a3, c01, c01
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c05, 1 * SIZE(BO)
|
|
+ ST c09, 2 * SIZE(BO)
|
|
+ ST c13, 3 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c05, 1 * SIZE(AO)
|
|
+ ST c09, 2 * SIZE(AO)
|
|
+ ST c13, 3 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -1 * SIZE(C1)
|
|
+ ldi C2, -1 * SIZE(C2)
|
|
+ ldi C3, -1 * SIZE(C3)
|
|
+ ldi C4, -1 * SIZE(C4)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+ ST c09, 0 * SIZE(C3)
|
|
+ ST c13, 0 * SIZE(C4)
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 0 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ and M, 2, I
|
|
+ ble I, $L30
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c10
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c14
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ ldi L, -2(KK)
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c05
|
|
+
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+ fclr c02
|
|
+ fclr c06
|
|
+ ble KK, $L28
|
|
+
|
|
+ ble L, $L25
|
|
+
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 1, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 2, TMP2
|
|
+ addl B, TMP2, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c10
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c14
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ ldi L, -2(TMP1)
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c05
|
|
+
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ fclr c02
|
|
+ fclr c06
|
|
+ ble TMP1, $L28
|
|
+
|
|
+ ble L, $L25
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ ADD c09, t1, c09
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD c10, t2, c10
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t3, c13
|
|
+ unop
|
|
+ MUL a1, b2, t3
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+
|
|
+ ADD c14, t4, c14
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD b2, -7 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ unop
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ LD b3, -6 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, c05
|
|
+ unop
|
|
+ MUL a1, b4, t3
|
|
+ LD a1, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, c06
|
|
+ MUL a2, b4, t4
|
|
+ LD b5, -5 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t1, c09
|
|
+ unop
|
|
+ MUL a3, b1, t1
|
|
+ LD a2, 3 * SIZE(AO)
|
|
+
|
|
+ ADD c10, t2, c10
|
|
+ unop
|
|
+ MUL a4, b1, t2
|
|
+ LD b1, -4 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t3, c13
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c14, t4, c14
|
|
+ MUL a4, b2, t4
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ ldi L, -2(L)
|
|
+ MUL a3, b3, t1
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a4, b3, t2
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, c05
|
|
+ unop
|
|
+ MUL a3, b5, t3
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, c06
|
|
+ MUL a4, b5, t4
|
|
+ LD a4, 1 * SIZE(AO)
|
|
+ bgt L, $L22
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ ADD c09, t1, c09
|
|
+ MUL a1, b1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L27
|
|
+#else
|
|
+ blbs TMP1, $L27
|
|
+#endif
|
|
+
|
|
+ ADD c10, t2, c10
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t3, c13
|
|
+ unop
|
|
+ MUL a1, b2, t3
|
|
+ unop
|
|
+
|
|
+ ADD c14, t4, c14
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, c05
|
|
+ unop
|
|
+ MUL a1, b4, t3
|
|
+ LD a1, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, c06
|
|
+ unop
|
|
+ MUL a2, b4, t4
|
|
+ LD a2, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c09, t1, c09
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L27:
|
|
+ ADD c10, t2, c10
|
|
+ MUL a2, b1, t2
|
|
+ ADD c13, t3, c13
|
|
+ MUL a1, b2, t3
|
|
+
|
|
+ ADD c14, t4, c14
|
|
+ MUL a2, b2, t4
|
|
+ ADD c01, t1, c01
|
|
+ MUL a1, b3, t1
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ MUL a2, b3, t2
|
|
+ ADD c05, t3, c05
|
|
+ MUL a1, b4, t3
|
|
+
|
|
+ ADD c06, t4, c06
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a2, b4, t4
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t1, c09
|
|
+ ADD c10, t2, c10
|
|
+ ADD c13, t3, c13
|
|
+ ADD c14, t4, c14
|
|
+ .align 4
|
|
+
|
|
+$L28:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 2, TMP1
|
|
+#else
|
|
+ subl KK, 4, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -2 * SIZE(AO)
|
|
+ ldi BO, -4 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+ LD b4, 7 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c05, c05
|
|
+ SUB a3, c09, c09
|
|
+ SUB a4, c13, c13
|
|
+
|
|
+ SUB b1, c02, c02
|
|
+ SUB b2, c06, c06
|
|
+ SUB b3, c10, c10
|
|
+ SUB b4, c14, c14
|
|
+
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ LD b1, 4 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(AO)
|
|
+ LD b4, 7 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+ SUB a3, c05, c05
|
|
+ SUB a4, c06, c06
|
|
+
|
|
+ SUB b1, c09, c09
|
|
+ SUB b2, c10, c10
|
|
+ SUB b3, c13, c13
|
|
+ SUB b4, c14, c14
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 3 * SIZE(AO)
|
|
+ LD a2, 2 * SIZE(AO)
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c06, c06
|
|
+ MUL a1, c10, c10
|
|
+ MUL a1, c14, c14
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c06, t2
|
|
+ MUL a2, c10, t3
|
|
+ MUL a2, c14, t4
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c05, t2, c05
|
|
+ SUB c09, t3, c09
|
|
+ SUB c13, t4, c13
|
|
+
|
|
+ MUL a3, c01, c01
|
|
+ MUL a3, c05, c05
|
|
+ MUL a3, c09, c09
|
|
+ MUL a3, c13, c13
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c05, c05
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c13, c13
|
|
+
|
|
+ MUL a2, c01, t1
|
|
+ MUL a2, c05, t2
|
|
+ MUL a2, c09, t3
|
|
+ MUL a2, c13, t4
|
|
+
|
|
+ SUB c02, t1, c02
|
|
+ SUB c06, t2, c06
|
|
+ SUB c10, t3, c10
|
|
+ SUB c14, t4, c14
|
|
+
|
|
+ MUL a3, c02, c02
|
|
+ MUL a3, c06, c06
|
|
+ MUL a3, c10, c10
|
|
+ MUL a3, c14, c14
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+
|
|
+ MUL a2, c01, t1
|
|
+ MUL a2, c02, t2
|
|
+
|
|
+ SUB c05, t1, c05
|
|
+ SUB c06, t2, c06
|
|
+
|
|
+ MUL a3, c01, t1
|
|
+ MUL a3, c02, t2
|
|
+
|
|
+ SUB c09, t1, c09
|
|
+ SUB c10, t2, c10
|
|
+
|
|
+ MUL a4, c01, t1
|
|
+ MUL a4, c02, t2
|
|
+
|
|
+ SUB c13, t1, c13
|
|
+ SUB c14, t2, c14
|
|
+
|
|
+ LD b1, 5 * SIZE(BO)
|
|
+ LD b2, 6 * SIZE(BO)
|
|
+ LD b3, 7 * SIZE(BO)
|
|
+
|
|
+ MUL b1, c05, c05
|
|
+ MUL b1, c06, c06
|
|
+
|
|
+ MUL b2, c05, t1
|
|
+ MUL b2, c06, t2
|
|
+
|
|
+ SUB c09, t1, c09
|
|
+ SUB c10, t2, c10
|
|
+
|
|
+ MUL b3, c05, t1
|
|
+ MUL b3, c06, t2
|
|
+
|
|
+ SUB c13, t1, c13
|
|
+ SUB c14, t2, c14
|
|
+
|
|
+ LD a1, 10 * SIZE(BO)
|
|
+ LD a2, 11 * SIZE(BO)
|
|
+ LD a3, 15 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c10, c10
|
|
+
|
|
+ MUL a2, c09, t1
|
|
+ MUL a2, c10, t2
|
|
+
|
|
+ SUB c13, t1, c13
|
|
+ SUB c14, t2, c14
|
|
+
|
|
+ MUL a3, c13, c13
|
|
+ MUL a3, c14, c14
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 15 * SIZE(BO)
|
|
+ LD a2, 14 * SIZE(BO)
|
|
+ LD a3, 13 * SIZE(BO)
|
|
+ LD a4, 12 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c13, c13
|
|
+ MUL a1, c14, c14
|
|
+
|
|
+ MUL a2, c13, t1
|
|
+ MUL a2, c14, t2
|
|
+
|
|
+ SUB c09, t1, c09
|
|
+ SUB c10, t2, c10
|
|
+
|
|
+ MUL a3, c13, t1
|
|
+ MUL a3, c14, t2
|
|
+
|
|
+ SUB c05, t1, c05
|
|
+ SUB c06, t2, c06
|
|
+
|
|
+ MUL a4, c13, t1
|
|
+ MUL a4, c14, t2
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c02, t2, c02
|
|
+
|
|
+ LD b1, 10 * SIZE(BO)
|
|
+ LD b2, 9 * SIZE(BO)
|
|
+ LD b3, 8 * SIZE(BO)
|
|
+
|
|
+ MUL b1, c09, c09
|
|
+ MUL b1, c10, c10
|
|
+
|
|
+ MUL b2, c09, t1
|
|
+ MUL b2, c10, t2
|
|
+
|
|
+ SUB c05, t1, c05
|
|
+ SUB c06, t2, c06
|
|
+
|
|
+ MUL b3, c09, t1
|
|
+ MUL b3, c10, t2
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c02, t2, c02
|
|
+
|
|
+ LD a1, 5 * SIZE(BO)
|
|
+ LD a2, 4 * SIZE(BO)
|
|
+ LD a3, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c05, c05
|
|
+ MUL a1, c06, c06
|
|
+
|
|
+ MUL a2, c05, t1
|
|
+ MUL a2, c06, t2
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c02, t2, c02
|
|
+
|
|
+ MUL a3, c01, c01
|
|
+ MUL a3, c02, c02
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c05, 1 * SIZE(BO)
|
|
+ ST c09, 2 * SIZE(BO)
|
|
+ ST c13, 3 * SIZE(BO)
|
|
+
|
|
+ ST c02, 4 * SIZE(BO)
|
|
+ ST c06, 5 * SIZE(BO)
|
|
+ ST c10, 6 * SIZE(BO)
|
|
+ ST c14, 7 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c05, 2 * SIZE(AO)
|
|
+ ST c06, 3 * SIZE(AO)
|
|
+
|
|
+ ST c09, 4 * SIZE(AO)
|
|
+ ST c10, 5 * SIZE(AO)
|
|
+ ST c13, 6 * SIZE(AO)
|
|
+ ST c14, 7 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -2 * SIZE(C1)
|
|
+ ldi C2, -2 * SIZE(C2)
|
|
+ ldi C3, -2 * SIZE(C3)
|
|
+ ldi C4, -2 * SIZE(C4)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+ ST c06, 1 * SIZE(C2)
|
|
+
|
|
+ ST c09, 0 * SIZE(C3)
|
|
+ ST c10, 1 * SIZE(C3)
|
|
+ ST c13, 0 * SIZE(C4)
|
|
+ ST c14, 1 * SIZE(C4)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 2 * SIZE(C1)
|
|
+ ldi C2, 2 * SIZE(C2)
|
|
+ ldi C3, 2 * SIZE(C3)
|
|
+ ldi C4, 2 * SIZE(C4)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 1 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L30:
|
|
+ sra M, 2, I
|
|
+ ble I, $L39
|
|
+ .align 4
|
|
+
|
|
+$L11:
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c11
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c12
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c16
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c15
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c02
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c06
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c05
|
|
+
|
|
+ fillcs 4 * SIZE(C1)
|
|
+ fclr c03
|
|
+ ldi L, -2(KK)
|
|
+ fclr c04
|
|
+
|
|
+ fillcs 7 * SIZE(C2)
|
|
+ fclr c08
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+ fclr c13
|
|
+
|
|
+ fillcs 4 * SIZE(C3)
|
|
+ fclr c09
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c10
|
|
+
|
|
+ fillcs 7 * SIZE(C4)
|
|
+ fclr c14
|
|
+ fclr c07
|
|
+ ble KK, $L18
|
|
+#else
|
|
+
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 2, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 2, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c11
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c12
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c16
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c15
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c02
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c06
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c05
|
|
+
|
|
+ fillcs 4 * SIZE(C1)
|
|
+ fclr c03
|
|
+ ldi L, -2(TMP1)
|
|
+ fclr c04
|
|
+
|
|
+ fillcs 7 * SIZE(C2)
|
|
+ fclr c08
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ fclr c13
|
|
+
|
|
+ fillcs 4 * SIZE(C3)
|
|
+ fclr c09
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c10
|
|
+
|
|
+ fillcs 7 * SIZE(C4)
|
|
+ fclr c14
|
|
+ fclr c07
|
|
+ ble TMP1, $L18
|
|
+#endif
|
|
+
|
|
+ ble L, $L15
|
|
+ .align 5
|
|
+
|
|
+$L12:
|
|
+/* 1 */
|
|
+ ADD c11, t1, c11
|
|
+#ifndef EV4
|
|
+ fillcs PREFETCHSIZE * SIZE(AO)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+ MUL b1, a1, t1
|
|
+#ifndef EV4
|
|
+ fillcs PREFETCHSIZE * SIZE(BO)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c12, t2, c12
|
|
+ unop
|
|
+ MUL b1, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD c16, t3, c16
|
|
+ unop
|
|
+ MUL b2, a2, t3
|
|
+ LD a5, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c15, t4, c15
|
|
+ unop
|
|
+ MUL b2, a1, t4
|
|
+ LD b5, 0 * SIZE(BO)
|
|
+
|
|
+/* 2 */
|
|
+ ADD c01, t1, c01
|
|
+ UNOP
|
|
+ MUL b1, a3, t1
|
|
+ UNOP
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ UNOP
|
|
+ MUL b1, a4, t2
|
|
+ UNOP
|
|
+
|
|
+ ADD c06, t3, c06
|
|
+ unop
|
|
+ MUL b2, a4, t3
|
|
+ unop
|
|
+
|
|
+ ADD c05, t4, c05
|
|
+ unop
|
|
+ MUL b4, a1, t4
|
|
+ unop
|
|
+
|
|
+/* 3 */
|
|
+ ADD c03, t1, c03
|
|
+ unop
|
|
+ MUL b3, a1, t1
|
|
+ unop
|
|
+
|
|
+ ADD c04, t2, c04
|
|
+ unop
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD c08, t3, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c13, t4, c13
|
|
+ unop
|
|
+ MUL b2, a3, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+/* 4 */
|
|
+ ADD c09, t1, c09
|
|
+ unop
|
|
+ MUL b3, a3, t1
|
|
+ LD a6, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c10, t2, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c14, t3, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ ADD c07, t4, c07
|
|
+ unop
|
|
+ MUL b4, a3, t4
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+
|
|
+/* 5 */
|
|
+ ADD c11, t1, c11
|
|
+ unop
|
|
+ MUL b5, a5, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c12, t2, c12
|
|
+ ldi L, -2(L)
|
|
+ MUL b5, a2, t2
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c16, t3, c16
|
|
+ unop
|
|
+ MUL b2, a2, t3
|
|
+ unop
|
|
+
|
|
+ ADD c15, t4, c15
|
|
+ unop
|
|
+ MUL b2, a5, t4
|
|
+ unop
|
|
+
|
|
+/* 6 */
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL b5, a6, t1
|
|
+ unop
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL b5, a4, t2
|
|
+ unop
|
|
+
|
|
+ ADD c06, t3, c06
|
|
+ unop
|
|
+ MUL b2, a4, t3
|
|
+ unop
|
|
+
|
|
+ ADD c05, t4, c05
|
|
+ unop
|
|
+ MUL b4, a5, t4
|
|
+ unop
|
|
+
|
|
+/* 7 */
|
|
+ ADD c03, t1, c03
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+ MUL b3, a5, t1
|
|
+ unop
|
|
+
|
|
+ ADD c04, t2, c04
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD c08, t3, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, -3 * SIZE(AO)
|
|
+
|
|
+ ADD c13, t4, c13
|
|
+ unop
|
|
+ MUL b2, a6, t4
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+/* 8 */
|
|
+ ADD c09, t1, c09
|
|
+ unop
|
|
+ MUL b3, a6, t1
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c10, t2, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD c14, t3, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c07, t4, c07
|
|
+ MUL b4, a6, t4
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+ bgt L, $L12
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ ADD c11, t1, c11
|
|
+ MUL b1, a1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L17
|
|
+#else
|
|
+ blbs TMP1, $L17
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c12, t2, c12
|
|
+ MUL b1, a2, t2
|
|
+ ADD c16, t3, c16
|
|
+ MUL b2, a2, t3
|
|
+
|
|
+ ADD c15, t4, c15
|
|
+ MUL b2, a1, t4
|
|
+ ADD c01, t1, c01
|
|
+ MUL b1, a3, t1
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL b1, a4, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c06, t3, c06
|
|
+ MUL b2, a4, t3
|
|
+ ADD c05, t4, c05
|
|
+ MUL b4, a1, t4
|
|
+
|
|
+ ADD c03, t1, c03
|
|
+ unop
|
|
+ MUL b3, a1, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t2, c04
|
|
+ unop
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD c08, t3, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c13, t4, c13
|
|
+ unop
|
|
+ MUL b2, a3, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t1, c09
|
|
+ unop
|
|
+ MUL b3, a3, t1
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c10, t2, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c14, t3, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c07, t4, c07
|
|
+ unop
|
|
+ MUL b4, a3, t4
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c11, t1, c11
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL b1, a1, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L17:
|
|
+ ADD c12, t2, c12
|
|
+ MUL b1, a2, t2
|
|
+ ADD c16, t3, c16
|
|
+ MUL b2, a2, t3
|
|
+
|
|
+ ADD c15, t4, c15
|
|
+ MUL b2, a1, t4
|
|
+ ADD c01, t1, c01
|
|
+ MUL b1, a3, t1
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ MUL b1, a4, t2
|
|
+ ADD c06, t3, c06
|
|
+ MUL b2, a4, t3
|
|
+
|
|
+ ADD c05, t4, c05
|
|
+ MUL b4, a1, t4
|
|
+ ADD c03, t1, c03
|
|
+ MUL b3, a1, t1
|
|
+
|
|
+ ADD c04, t2, c04
|
|
+ MUL b3, a2, t2
|
|
+ ADD c08, t3, c08
|
|
+ MUL b4, a2, t3
|
|
+
|
|
+ ADD c13, t4, c13
|
|
+ MUL b2, a3, t4
|
|
+ ADD c09, t1, c09
|
|
+ MUL b3, a3, t1
|
|
+
|
|
+ ADD c10, t2, c10
|
|
+ MUL b3, a4, t2
|
|
+ ADD c14, t3, c14
|
|
+ MUL b4, a4, t3
|
|
+
|
|
+ ADD c07, t4, c07
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL b4, a3, t4
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c11, t1, c11
|
|
+ ADD c12, t2, c12
|
|
+ ADD c16, t3, c16
|
|
+ ADD c15, t4, c15
|
|
+ .align 4
|
|
+
|
|
+$L18:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 4, TMP1
|
|
+#else
|
|
+ subl KK, 4, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -4 * SIZE(AO)
|
|
+ ldi BO, -4 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+ LD b4, 7 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c05, c05
|
|
+ SUB a3, c09, c09
|
|
+ SUB a4, c13, c13
|
|
+
|
|
+ SUB b1, c02, c02
|
|
+ SUB b2, c06, c06
|
|
+ SUB b3, c10, c10
|
|
+ SUB b4, c14, c14
|
|
+
|
|
+ LD a1, 8 * SIZE(BO)
|
|
+ LD a2, 9 * SIZE(BO)
|
|
+ LD a3, 10 * SIZE(BO)
|
|
+ LD a4, 11 * SIZE(BO)
|
|
+
|
|
+ LD b1, 12 * SIZE(BO)
|
|
+ LD b2, 13 * SIZE(BO)
|
|
+ LD b3, 14 * SIZE(BO)
|
|
+ LD b4, 15 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c03, c03
|
|
+ SUB a2, c07, c07
|
|
+ SUB a3, c11, c11
|
|
+ SUB a4, c15, c15
|
|
+
|
|
+ SUB b1, c04, c04
|
|
+ SUB b2, c08, c08
|
|
+ SUB b3, c12, c12
|
|
+ SUB b4, c16, c16
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ LD b1, 4 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(AO)
|
|
+ LD b4, 7 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+ SUB a3, c03, c03
|
|
+ SUB a4, c04, c04
|
|
+
|
|
+ SUB b1, c05, c05
|
|
+ SUB b2, c06, c06
|
|
+ SUB b3, c07, c07
|
|
+ SUB b4, c08, c08
|
|
+
|
|
+ LD a1, 8 * SIZE(AO)
|
|
+ LD a2, 9 * SIZE(AO)
|
|
+ LD a3, 10 * SIZE(AO)
|
|
+ LD a4, 11 * SIZE(AO)
|
|
+
|
|
+ LD b1, 12 * SIZE(AO)
|
|
+ LD b2, 13 * SIZE(AO)
|
|
+ LD b3, 14 * SIZE(AO)
|
|
+ LD b4, 15 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c09, c09
|
|
+ SUB a2, c10, c10
|
|
+ SUB a3, c11, c11
|
|
+ SUB a4, c12, c12
|
|
+
|
|
+ SUB b1, c13, c13
|
|
+ SUB b2, c14, c14
|
|
+ SUB b3, c15, c15
|
|
+ SUB b4, c16, c16
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 15 * SIZE(AO)
|
|
+ LD a2, 14 * SIZE(AO)
|
|
+ LD a3, 13 * SIZE(AO)
|
|
+ LD a4, 12 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c04, c04
|
|
+ MUL a1, c08, c08
|
|
+ MUL a1, c12, c12
|
|
+ MUL a1, c16, c16
|
|
+
|
|
+ MUL a2, c04, t1
|
|
+ MUL a2, c08, t2
|
|
+ MUL a2, c12, t3
|
|
+ MUL a2, c16, t4
|
|
+
|
|
+ SUB c03, t1, c03
|
|
+ SUB c07, t2, c07
|
|
+ SUB c11, t3, c11
|
|
+ SUB c15, t4, c15
|
|
+
|
|
+ MUL a3, c04, t1
|
|
+ MUL a3, c08, t2
|
|
+ MUL a3, c12, t3
|
|
+ MUL a3, c16, t4
|
|
+
|
|
+ SUB c02, t1, c02
|
|
+ SUB c06, t2, c06
|
|
+ SUB c10, t3, c10
|
|
+ SUB c14, t4, c14
|
|
+
|
|
+ MUL a4, c04, t1
|
|
+ MUL a4, c08, t2
|
|
+ MUL a4, c12, t3
|
|
+ MUL a4, c16, t4
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c05, t2, c05
|
|
+ SUB c09, t3, c09
|
|
+ SUB c13, t4, c13
|
|
+
|
|
+ LD b1, 10 * SIZE(AO)
|
|
+ LD b2, 9 * SIZE(AO)
|
|
+ LD b3, 8 * SIZE(AO)
|
|
+
|
|
+ MUL b1, c03, c03
|
|
+ MUL b1, c07, c07
|
|
+ MUL b1, c11, c11
|
|
+ MUL b1, c15, c15
|
|
+
|
|
+ MUL b2, c03, t1
|
|
+ MUL b2, c07, t2
|
|
+ MUL b2, c11, t3
|
|
+ MUL b2, c15, t4
|
|
+
|
|
+ SUB c02, t1, c02
|
|
+ SUB c06, t2, c06
|
|
+ SUB c10, t3, c10
|
|
+ SUB c14, t4, c14
|
|
+
|
|
+ MUL b3, c03, t1
|
|
+ MUL b3, c07, t2
|
|
+ MUL b3, c11, t3
|
|
+ MUL b3, c15, t4
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c05, t2, c05
|
|
+ SUB c09, t3, c09
|
|
+ SUB c13, t4, c13
|
|
+
|
|
+ LD a1, 5 * SIZE(AO)
|
|
+ LD a2, 4 * SIZE(AO)
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c06, c06
|
|
+ MUL a1, c10, c10
|
|
+ MUL a1, c14, c14
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c06, t2
|
|
+ MUL a2, c10, t3
|
|
+ MUL a2, c14, t4
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c05, t2, c05
|
|
+ SUB c09, t3, c09
|
|
+ SUB c13, t4, c13
|
|
+
|
|
+ MUL a3, c01, c01
|
|
+ MUL a3, c05, c05
|
|
+ MUL a3, c09, c09
|
|
+ MUL a3, c13, c13
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c05, c05
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c13, c13
|
|
+
|
|
+ MUL a2, c01, t1
|
|
+ MUL a2, c05, t2
|
|
+ MUL a2, c09, t3
|
|
+ MUL a2, c13, t4
|
|
+
|
|
+ SUB c02, t1, c02
|
|
+ SUB c06, t2, c06
|
|
+ SUB c10, t3, c10
|
|
+ SUB c14, t4, c14
|
|
+
|
|
+ MUL a3, c01, t1
|
|
+ MUL a3, c05, t2
|
|
+ MUL a3, c09, t3
|
|
+ MUL a3, c13, t4
|
|
+
|
|
+ SUB c03, t1, c03
|
|
+ SUB c07, t2, c07
|
|
+ SUB c11, t3, c11
|
|
+ SUB c15, t4, c15
|
|
+
|
|
+ MUL a4, c01, t1
|
|
+ MUL a4, c05, t2
|
|
+ MUL a4, c09, t3
|
|
+ MUL a4, c13, t4
|
|
+
|
|
+ SUB c04, t1, c04
|
|
+ SUB c08, t2, c08
|
|
+ SUB c12, t3, c12
|
|
+ SUB c16, t4, c16
|
|
+
|
|
+ LD b1, 5 * SIZE(AO)
|
|
+ LD b2, 6 * SIZE(AO)
|
|
+ LD b3, 7 * SIZE(AO)
|
|
+
|
|
+ MUL b1, c02, c02
|
|
+ MUL b1, c06, c06
|
|
+ MUL b1, c10, c10
|
|
+ MUL b1, c14, c14
|
|
+
|
|
+ MUL b2, c02, t1
|
|
+ MUL b2, c06, t2
|
|
+ MUL b2, c10, t3
|
|
+ MUL b2, c14, t4
|
|
+
|
|
+ SUB c03, t1, c03
|
|
+ SUB c07, t2, c07
|
|
+ SUB c11, t3, c11
|
|
+ SUB c15, t4, c15
|
|
+
|
|
+ MUL b3, c02, t1
|
|
+ MUL b3, c06, t2
|
|
+ MUL b3, c10, t3
|
|
+ MUL b3, c14, t4
|
|
+
|
|
+ SUB c04, t1, c04
|
|
+ SUB c08, t2, c08
|
|
+ SUB c12, t3, c12
|
|
+ SUB c16, t4, c16
|
|
+
|
|
+ LD a1, 10 * SIZE(AO)
|
|
+ LD a2, 11 * SIZE(AO)
|
|
+ LD a3, 15 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c07, c07
|
|
+ MUL a1, c11, c11
|
|
+ MUL a1, c15, c15
|
|
+
|
|
+ MUL a2, c03, t1
|
|
+ MUL a2, c07, t2
|
|
+ MUL a2, c11, t3
|
|
+ MUL a2, c15, t4
|
|
+
|
|
+ SUB c04, t1, c04
|
|
+ SUB c08, t2, c08
|
|
+ SUB c12, t3, c12
|
|
+ SUB c16, t4, c16
|
|
+
|
|
+ MUL a3, c04, c04
|
|
+ MUL a3, c08, c08
|
|
+ MUL a3, c12, c12
|
|
+ MUL a3, c16, c16
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c04, c04
|
|
+
|
|
+ MUL a2, c01, t1
|
|
+ MUL a2, c02, t2
|
|
+ MUL a2, c03, t3
|
|
+ MUL a2, c04, t4
|
|
+
|
|
+ SUB c05, t1, c05
|
|
+ SUB c06, t2, c06
|
|
+ SUB c07, t3, c07
|
|
+ SUB c08, t4, c08
|
|
+
|
|
+ MUL a3, c01, t1
|
|
+ MUL a3, c02, t2
|
|
+ MUL a3, c03, t3
|
|
+ MUL a3, c04, t4
|
|
+
|
|
+ SUB c09, t1, c09
|
|
+ SUB c10, t2, c10
|
|
+ SUB c11, t3, c11
|
|
+ SUB c12, t4, c12
|
|
+
|
|
+ MUL a4, c01, t1
|
|
+ MUL a4, c02, t2
|
|
+ MUL a4, c03, t3
|
|
+ MUL a4, c04, t4
|
|
+
|
|
+ SUB c13, t1, c13
|
|
+ SUB c14, t2, c14
|
|
+ SUB c15, t3, c15
|
|
+ SUB c16, t4, c16
|
|
+
|
|
+ LD b1, 5 * SIZE(BO)
|
|
+ LD b2, 6 * SIZE(BO)
|
|
+ LD b3, 7 * SIZE(BO)
|
|
+
|
|
+ MUL b1, c05, c05
|
|
+ MUL b1, c06, c06
|
|
+ MUL b1, c07, c07
|
|
+ MUL b1, c08, c08
|
|
+
|
|
+ MUL b2, c05, t1
|
|
+ MUL b2, c06, t2
|
|
+ MUL b2, c07, t3
|
|
+ MUL b2, c08, t4
|
|
+
|
|
+ SUB c09, t1, c09
|
|
+ SUB c10, t2, c10
|
|
+ SUB c11, t3, c11
|
|
+ SUB c12, t4, c12
|
|
+
|
|
+ MUL b3, c05, t1
|
|
+ MUL b3, c06, t2
|
|
+ MUL b3, c07, t3
|
|
+ MUL b3, c08, t4
|
|
+
|
|
+ SUB c13, t1, c13
|
|
+ SUB c14, t2, c14
|
|
+ SUB c15, t3, c15
|
|
+ SUB c16, t4, c16
|
|
+
|
|
+ LD a1, 10 * SIZE(BO)
|
|
+ LD a2, 11 * SIZE(BO)
|
|
+ LD a3, 15 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c10, c10
|
|
+ MUL a1, c11, c11
|
|
+ MUL a1, c12, c12
|
|
+
|
|
+ MUL a2, c09, t1
|
|
+ MUL a2, c10, t2
|
|
+ MUL a2, c11, t3
|
|
+ MUL a2, c12, t4
|
|
+
|
|
+ SUB c13, t1, c13
|
|
+ SUB c14, t2, c14
|
|
+ SUB c15, t3, c15
|
|
+ SUB c16, t4, c16
|
|
+
|
|
+ MUL a3, c13, c13
|
|
+ MUL a3, c14, c14
|
|
+ MUL a3, c15, c15
|
|
+ MUL a3, c16, c16
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 15 * SIZE(BO)
|
|
+ LD a2, 14 * SIZE(BO)
|
|
+ LD a3, 13 * SIZE(BO)
|
|
+ LD a4, 12 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c13, c13
|
|
+ MUL a1, c14, c14
|
|
+ MUL a1, c15, c15
|
|
+ MUL a1, c16, c16
|
|
+
|
|
+ MUL a2, c13, t1
|
|
+ MUL a2, c14, t2
|
|
+ MUL a2, c15, t3
|
|
+ MUL a2, c16, t4
|
|
+
|
|
+ SUB c09, t1, c09
|
|
+ SUB c10, t2, c10
|
|
+ SUB c11, t3, c11
|
|
+ SUB c12, t4, c12
|
|
+
|
|
+ MUL a3, c13, t1
|
|
+ MUL a3, c14, t2
|
|
+ MUL a3, c15, t3
|
|
+ MUL a3, c16, t4
|
|
+
|
|
+ SUB c05, t1, c05
|
|
+ SUB c06, t2, c06
|
|
+ SUB c07, t3, c07
|
|
+ SUB c08, t4, c08
|
|
+
|
|
+ MUL a4, c13, t1
|
|
+ MUL a4, c14, t2
|
|
+ MUL a4, c15, t3
|
|
+ MUL a4, c16, t4
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c02, t2, c02
|
|
+ SUB c03, t3, c03
|
|
+ SUB c04, t4, c04
|
|
+
|
|
+ LD b1, 10 * SIZE(BO)
|
|
+ LD b2, 9 * SIZE(BO)
|
|
+ LD b3, 8 * SIZE(BO)
|
|
+
|
|
+ MUL b1, c09, c09
|
|
+ MUL b1, c10, c10
|
|
+ MUL b1, c11, c11
|
|
+ MUL b1, c12, c12
|
|
+
|
|
+ MUL b2, c09, t1
|
|
+ MUL b2, c10, t2
|
|
+ MUL b2, c11, t3
|
|
+ MUL b2, c12, t4
|
|
+
|
|
+ SUB c05, t1, c05
|
|
+ SUB c06, t2, c06
|
|
+ SUB c07, t3, c07
|
|
+ SUB c08, t4, c08
|
|
+
|
|
+ MUL b3, c09, t1
|
|
+ MUL b3, c10, t2
|
|
+ MUL b3, c11, t3
|
|
+ MUL b3, c12, t4
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c02, t2, c02
|
|
+ SUB c03, t3, c03
|
|
+ SUB c04, t4, c04
|
|
+
|
|
+ LD a1, 5 * SIZE(BO)
|
|
+ LD a2, 4 * SIZE(BO)
|
|
+ LD a3, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c05, c05
|
|
+ MUL a1, c06, c06
|
|
+ MUL a1, c07, c07
|
|
+ MUL a1, c08, c08
|
|
+
|
|
+ MUL a2, c05, t1
|
|
+ MUL a2, c06, t2
|
|
+ MUL a2, c07, t3
|
|
+ MUL a2, c08, t4
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c02, t2, c02
|
|
+ SUB c03, t3, c03
|
|
+ SUB c04, t4, c04
|
|
+
|
|
+ MUL a3, c01, c01
|
|
+ MUL a3, c02, c02
|
|
+ MUL a3, c03, c03
|
|
+ MUL a3, c04, c04
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c05, 1 * SIZE(BO)
|
|
+ ST c09, 2 * SIZE(BO)
|
|
+ ST c13, 3 * SIZE(BO)
|
|
+
|
|
+ ST c02, 4 * SIZE(BO)
|
|
+ ST c06, 5 * SIZE(BO)
|
|
+ ST c10, 6 * SIZE(BO)
|
|
+ ST c14, 7 * SIZE(BO)
|
|
+
|
|
+ ST c03, 8 * SIZE(BO)
|
|
+ ST c07, 9 * SIZE(BO)
|
|
+ ST c11, 10 * SIZE(BO)
|
|
+ ST c15, 11 * SIZE(BO)
|
|
+
|
|
+ ST c04, 12 * SIZE(BO)
|
|
+ ST c08, 13 * SIZE(BO)
|
|
+ ST c12, 14 * SIZE(BO)
|
|
+ ST c16, 15 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c03, 2 * SIZE(AO)
|
|
+ ST c04, 3 * SIZE(AO)
|
|
+
|
|
+ ST c05, 4 * SIZE(AO)
|
|
+ ST c06, 5 * SIZE(AO)
|
|
+ ST c07, 6 * SIZE(AO)
|
|
+ ST c08, 7 * SIZE(AO)
|
|
+
|
|
+ ST c09, 8 * SIZE(AO)
|
|
+ ST c10, 9 * SIZE(AO)
|
|
+ ST c11, 10 * SIZE(AO)
|
|
+ ST c12, 11 * SIZE(AO)
|
|
+
|
|
+ ST c13, 12 * SIZE(AO)
|
|
+ ST c14, 13 * SIZE(AO)
|
|
+ ST c15, 14 * SIZE(AO)
|
|
+ ST c16, 15 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -4 * SIZE(C1)
|
|
+ ldi C2, -4 * SIZE(C2)
|
|
+ ldi C3, -4 * SIZE(C3)
|
|
+ ldi C4, -4 * SIZE(C4)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c03, 2 * SIZE(C1)
|
|
+ ST c04, 3 * SIZE(C1)
|
|
+
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+ ST c06, 1 * SIZE(C2)
|
|
+ ST c07, 2 * SIZE(C2)
|
|
+ ST c08, 3 * SIZE(C2)
|
|
+
|
|
+ ST c09, 0 * SIZE(C3)
|
|
+ ST c10, 1 * SIZE(C3)
|
|
+ ST c11, 2 * SIZE(C3)
|
|
+ ST c12, 3 * SIZE(C3)
|
|
+
|
|
+ ST c13, 0 * SIZE(C4)
|
|
+ ST c14, 1 * SIZE(C4)
|
|
+ ST c15, 2 * SIZE(C4)
|
|
+ ST c16, 3 * SIZE(C4)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+ ldi C2, 4 * SIZE(C2)
|
|
+ ldi C3, 4 * SIZE(C3)
|
|
+ ldi C4, 4 * SIZE(C4)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 2 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ addl BO, TMP1, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ bgt I, $L11
|
|
+ .align 4
|
|
+
|
|
+$L39:
|
|
+#ifdef LN
|
|
+ sll K, 2 + BASE_SHIFT, TMP1
|
|
+ addl B, TMP1, B
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ mov BO, B
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ addl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ subl KK, 4, KK
|
|
+#endif
|
|
+ ldi J, -1(J)
|
|
+ bgt J, $L01
|
|
+ .align 4
|
|
+
|
|
+$L40:
|
|
+ and N, 2, J
|
|
+ ble J, $L80
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 1 + BASE_SHIFT, TMP1
|
|
+ subl B, TMP1, B
|
|
+
|
|
+ addl LDC, LDC, TMP1
|
|
+ subl C, TMP1, C
|
|
+#endif
|
|
+
|
|
+ mov C, C1
|
|
+ addl C, LDC, C2
|
|
+ fclr t1
|
|
+#ifndef RT
|
|
+ addl C2, LDC, C
|
|
+#endif
|
|
+ fclr t2
|
|
+
|
|
+#ifdef LN
|
|
+ addl M, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+ mov A, AORIG
|
|
+#else
|
|
+ mov A, AO
|
|
+#endif
|
|
+
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+ and M, 1, I
|
|
+ ble I, $L60
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c06
|
|
+
|
|
+ ldi L, -2(KK)
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+
|
|
+ ble KK, $L78
|
|
+
|
|
+ ble L, $L75
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 0, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c06
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ble TMP1, $L78
|
|
+
|
|
+ ble L, $L75
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L72:
|
|
+ ADD c01, t1, c01
|
|
+ ldi L, -2(L)
|
|
+ MUL a1, b1, t1
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t2, c05
|
|
+ MUL a1, b2, t2
|
|
+ LD a1, 1 * SIZE(AO)
|
|
+ LD b2, 3 * SIZE(BO)
|
|
+
|
|
+ ADD c02, t3, c02
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a2, b3, t3
|
|
+ LD b3, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c06, t4, c06
|
|
+ MUL a2, b4, t4
|
|
+ LD a2, 0 * SIZE(AO)
|
|
+ LD b4, 5 * SIZE(BO)
|
|
+
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ unop
|
|
+ unop
|
|
+ bgt L, $L72
|
|
+ .align 4
|
|
+
|
|
+$L75:
|
|
+ ADD c01, t1, c01
|
|
+ MUL a1, b1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L77
|
|
+#else
|
|
+ blbs TMP1, $L77
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c05, t2, c05
|
|
+ MUL a1, b2, t2
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L77:
|
|
+ ADD c05, t2, c05
|
|
+ MUL a1, b2, t2
|
|
+ ADD c02, t3, c02
|
|
+ ADD c06, t4, c06
|
|
+
|
|
+ ADD c01, c02, c01
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ ADD c05, c06, c05
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ ADD c05, t2, c05
|
|
+
|
|
+ .align 4
|
|
+
|
|
+$L78:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 1, TMP1
|
|
+#else
|
|
+ subl KK, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -1 * SIZE(AO)
|
|
+ ldi BO, -2 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c05, c05
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c05, c05
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c05, c05
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a2, c01, t1
|
|
+ SUB c05, t1, c05
|
|
+ MUL a3, c05, c05
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 3 * SIZE(BO)
|
|
+ LD a2, 2 * SIZE(BO)
|
|
+ LD a3, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c05, c05
|
|
+ MUL a2, c05, t1
|
|
+ SUB c01, t1, c01
|
|
+ MUL a3, c01, c01
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c05, 1 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c05, 1 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -1 * SIZE(C1)
|
|
+ ldi C2, -1 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 0 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L60:
|
|
+ and M, 2, I
|
|
+ ble I, $L70
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ ldi L, -2(KK)
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+
|
|
+ ble KK, $L68
|
|
+
|
|
+ ble L, $L65
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 1, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ ldi L, -2(TMP1)
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ble TMP1, $L68
|
|
+
|
|
+ ble L, $L65
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L62:
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, c05
|
|
+ ldi L, -2(L)
|
|
+ MUL a1, b2, t3
|
|
+ LD a1, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, c06
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD a2, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ LD b2, 3 * SIZE(BO)
|
|
+ MUL a3, b3, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a4, b3, t2
|
|
+ LD b3, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, c05
|
|
+ unop
|
|
+ MUL a3, b4, t3
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, c06
|
|
+ MUL a4, b4, t4
|
|
+ LD b4, 1 * SIZE(BO)
|
|
+ unop
|
|
+
|
|
+ LD a4, 1 * SIZE(AO)
|
|
+ unop
|
|
+ unop
|
|
+ bgt L, $L62
|
|
+ .align 4
|
|
+
|
|
+$L65:
|
|
+ ADD c01, t1, c01
|
|
+ MUL a1, b1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L67
|
|
+#else
|
|
+ blbs TMP1, $L67
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, c05
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ MUL a1, b2, t3
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, c06
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ LD b2, -1 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ .align 4
|
|
+
|
|
+$L67:
|
|
+ ADD c02, t2, c02
|
|
+ MUL a2, b1, t2
|
|
+ ADD c05, t3, c05
|
|
+ MUL a1, b2, t3
|
|
+
|
|
+ ADD c06, t4, c06
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a2, b2, t4
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ ADD c02, t2, c02
|
|
+ ADD c05, t3, c05
|
|
+ ADD c06, t4, c06
|
|
+ .align 4
|
|
+
|
|
+$L68:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 2, TMP1
|
|
+#else
|
|
+ subl KK, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -2 * SIZE(AO)
|
|
+ ldi BO, -2 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c05, c05
|
|
+ SUB a3, c02, c02
|
|
+ SUB a4, c06, c06
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+ SUB a3, c05, c05
|
|
+ SUB a4, c06, c06
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 3 * SIZE(AO)
|
|
+ LD a2, 2 * SIZE(AO)
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c06, c06
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c06, t2
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c05, t2, c05
|
|
+
|
|
+ MUL a3, c01, c01
|
|
+ MUL a3, c05, c05
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c05, c05
|
|
+
|
|
+ MUL a2, c01, t1
|
|
+ MUL a2, c05, t2
|
|
+
|
|
+ SUB c02, t1, c02
|
|
+ SUB c06, t2, c06
|
|
+
|
|
+ MUL a3, c02, c02
|
|
+ MUL a3, c06, c06
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+
|
|
+ MUL a2, c01, t1
|
|
+ MUL a2, c02, t2
|
|
+
|
|
+ SUB c05, t1, c05
|
|
+ SUB c06, t2, c06
|
|
+
|
|
+ MUL a3, c05, c05
|
|
+ MUL a3, c06, c06
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 3 * SIZE(BO)
|
|
+ LD a2, 2 * SIZE(BO)
|
|
+ LD a3, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c05, c05
|
|
+ MUL a1, c06, c06
|
|
+
|
|
+ MUL a2, c05, t1
|
|
+ MUL a2, c06, t2
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c02, t2, c02
|
|
+
|
|
+ MUL a3, c01, c01
|
|
+ MUL a3, c02, c02
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c05, 1 * SIZE(BO)
|
|
+ ST c02, 2 * SIZE(BO)
|
|
+ ST c06, 3 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c05, 2 * SIZE(AO)
|
|
+ ST c06, 3 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -2 * SIZE(C1)
|
|
+ ldi C2, -2 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+ ST c06, 1 * SIZE(C2)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 2 * SIZE(C1)
|
|
+ ldi C2, 2 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 1 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L70:
|
|
+ sra M, 2, I
|
|
+ ble I, $L79
|
|
+ .align 4
|
|
+
|
|
+$L51:
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c03
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c07
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c04
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c08
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c05
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c06
|
|
+
|
|
+ ldi L, -2(KK)
|
|
+
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ble KK, $L58
|
|
+
|
|
+ ble L, $L55
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 2, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 2, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c03
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c07
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c04
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c08
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c05
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c06
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ble TMP1, $L58
|
|
+
|
|
+ ble L, $L55
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L52:
|
|
+ ADD c05, t1, c05
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD c06, t2, c06
|
|
+ ldi L, -2(L)
|
|
+ MUL a2, b1, t2
|
|
+ unop
|
|
+
|
|
+ ADD c07, t3, c07
|
|
+ unop
|
|
+ MUL a3, b1, t3
|
|
+ unop
|
|
+
|
|
+ ADD c08, t4, c08
|
|
+ unop
|
|
+ MUL a4, b1, t4
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b2, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ MUL a2, b2, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ unop
|
|
+ MUL a4, b2, t4
|
|
+ LD a5, 3 * SIZE(AO)
|
|
+
|
|
+ ADD c05, t1, c05
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ LD b2, -1 * SIZE(BO)
|
|
+
|
|
+ ADD c06, t2, c06
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ unop
|
|
+
|
|
+ ADD c07, t3, c07
|
|
+ unop
|
|
+ MUL a3, b3, t3
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+
|
|
+ ADD c08, t4, c08
|
|
+ unop
|
|
+ MUL a5, b3, t4
|
|
+ LD b3, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b4, t1
|
|
+ LD a1, -4 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b4, t2
|
|
+ LD a2, -3 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+ MUL a3, b4, t3
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a5, b4, t4
|
|
+ LD b4, 1 * SIZE(BO)
|
|
+ bgt L, $L52
|
|
+ .align 4
|
|
+
|
|
+$L55:
|
|
+ ADD c05, t1, c05
|
|
+ MUL a1, b1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L57
|
|
+#else
|
|
+ blbs TMP1, $L57
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c06, t2, c06
|
|
+ MUL a2, b1, t2
|
|
+ ADD c07, t3, c07
|
|
+ MUL a3, b1, t3
|
|
+
|
|
+ ADD c08, t4, c08
|
|
+ unop
|
|
+ MUL a4, b1, t4
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b2, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b2, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a4, b2, t4
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c05, t1, c05
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L57:
|
|
+ ADD c06, t2, c06
|
|
+ MUL a2, b1, t2
|
|
+ ADD c07, t3, c07
|
|
+ MUL a3, b1, t3
|
|
+
|
|
+ ADD c08, t4, c08
|
|
+ MUL a4, b1, t4
|
|
+ ADD c01, t1, c01
|
|
+ MUL a1, b2, t1
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ MUL a2, b2, t2
|
|
+ ADD c03, t3, c03
|
|
+ MUL a3, b2, t3
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL a4, b2, t4
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t1, c05
|
|
+ ADD c06, t2, c06
|
|
+ ADD c07, t3, c07
|
|
+ ADD c08, t4, c08
|
|
+ .align 4
|
|
+
|
|
+$L58:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 4, TMP1
|
|
+#else
|
|
+ subl KK, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -4 * SIZE(AO)
|
|
+ ldi BO, -2 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+ LD b4, 7 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c05, c05
|
|
+ SUB a3, c02, c02
|
|
+ SUB a4, c06, c06
|
|
+
|
|
+ SUB b1, c03, c03
|
|
+ SUB b2, c07, c07
|
|
+ SUB b3, c04, c04
|
|
+ SUB b4, c08, c08
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ LD b1, 4 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(AO)
|
|
+ LD b4, 7 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+ SUB a3, c03, c03
|
|
+ SUB a4, c04, c04
|
|
+
|
|
+ SUB b1, c05, c05
|
|
+ SUB b2, c06, c06
|
|
+ SUB b3, c07, c07
|
|
+ SUB b4, c08, c08
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 15 * SIZE(AO)
|
|
+ LD a2, 14 * SIZE(AO)
|
|
+ LD a3, 13 * SIZE(AO)
|
|
+ LD a4, 12 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c04, c04
|
|
+ MUL a1, c08, c08
|
|
+
|
|
+ MUL a2, c04, t1
|
|
+ MUL a2, c08, t2
|
|
+
|
|
+ SUB c03, t1, c03
|
|
+ SUB c07, t2, c07
|
|
+
|
|
+ MUL a3, c04, t1
|
|
+ MUL a3, c08, t2
|
|
+
|
|
+ SUB c02, t1, c02
|
|
+ SUB c06, t2, c06
|
|
+
|
|
+ MUL a4, c04, t1
|
|
+ MUL a4, c08, t2
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c05, t2, c05
|
|
+
|
|
+ LD b1, 10 * SIZE(AO)
|
|
+ LD b2, 9 * SIZE(AO)
|
|
+ LD b3, 8 * SIZE(AO)
|
|
+
|
|
+ MUL b1, c03, c03
|
|
+ MUL b1, c07, c07
|
|
+
|
|
+ MUL b2, c03, t1
|
|
+ MUL b2, c07, t2
|
|
+
|
|
+ SUB c02, t1, c02
|
|
+ SUB c06, t2, c06
|
|
+
|
|
+ MUL b3, c03, t1
|
|
+ MUL b3, c07, t2
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c05, t2, c05
|
|
+
|
|
+ LD a1, 5 * SIZE(AO)
|
|
+ LD a2, 4 * SIZE(AO)
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c06, c06
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c06, t2
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c05, t2, c05
|
|
+
|
|
+ MUL a3, c01, c01
|
|
+ MUL a3, c05, c05
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c05, c05
|
|
+
|
|
+ MUL a2, c01, t1
|
|
+ MUL a2, c05, t2
|
|
+
|
|
+ SUB c02, t1, c02
|
|
+ SUB c06, t2, c06
|
|
+
|
|
+ MUL a3, c01, t1
|
|
+ MUL a3, c05, t2
|
|
+
|
|
+ SUB c03, t1, c03
|
|
+ SUB c07, t2, c07
|
|
+
|
|
+ MUL a4, c01, t1
|
|
+ MUL a4, c05, t2
|
|
+
|
|
+ SUB c04, t1, c04
|
|
+ SUB c08, t2, c08
|
|
+
|
|
+ LD b1, 5 * SIZE(AO)
|
|
+ LD b2, 6 * SIZE(AO)
|
|
+ LD b3, 7 * SIZE(AO)
|
|
+
|
|
+ MUL b1, c02, c02
|
|
+ MUL b1, c06, c06
|
|
+
|
|
+ MUL b2, c02, t1
|
|
+ MUL b2, c06, t2
|
|
+
|
|
+ SUB c03, t1, c03
|
|
+ SUB c07, t2, c07
|
|
+
|
|
+ MUL b3, c02, t1
|
|
+ MUL b3, c06, t2
|
|
+
|
|
+ SUB c04, t1, c04
|
|
+ SUB c08, t2, c08
|
|
+
|
|
+ LD a1, 10 * SIZE(AO)
|
|
+ LD a2, 11 * SIZE(AO)
|
|
+ LD a3, 15 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c07, c07
|
|
+
|
|
+ MUL a2, c03, t1
|
|
+ MUL a2, c07, t2
|
|
+
|
|
+ SUB c04, t1, c04
|
|
+ SUB c08, t2, c08
|
|
+
|
|
+ MUL a3, c04, c04
|
|
+ MUL a3, c08, c08
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c04, c04
|
|
+
|
|
+ MUL a2, c01, t1
|
|
+ MUL a2, c02, t2
|
|
+ MUL a2, c03, t3
|
|
+ MUL a2, c04, t4
|
|
+
|
|
+ SUB c05, t1, c05
|
|
+ SUB c06, t2, c06
|
|
+ SUB c07, t3, c07
|
|
+ SUB c08, t4, c08
|
|
+
|
|
+ MUL a3, c05, c05
|
|
+ MUL a3, c06, c06
|
|
+ MUL a3, c07, c07
|
|
+ MUL a3, c08, c08
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 3 * SIZE(BO)
|
|
+ LD a2, 2 * SIZE(BO)
|
|
+ LD a3, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c05, c05
|
|
+ MUL a1, c06, c06
|
|
+ MUL a1, c07, c07
|
|
+ MUL a1, c08, c08
|
|
+
|
|
+ MUL a2, c05, t1
|
|
+ MUL a2, c06, t2
|
|
+ MUL a2, c07, t3
|
|
+ MUL a2, c08, t4
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c02, t2, c02
|
|
+ SUB c03, t3, c03
|
|
+ SUB c04, t4, c04
|
|
+
|
|
+ MUL a3, c01, c01
|
|
+ MUL a3, c02, c02
|
|
+ MUL a3, c03, c03
|
|
+ MUL a3, c04, c04
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c05, 1 * SIZE(BO)
|
|
+ ST c02, 2 * SIZE(BO)
|
|
+ ST c06, 3 * SIZE(BO)
|
|
+
|
|
+ ST c03, 4 * SIZE(BO)
|
|
+ ST c07, 5 * SIZE(BO)
|
|
+ ST c04, 6 * SIZE(BO)
|
|
+ ST c08, 7 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c03, 2 * SIZE(AO)
|
|
+ ST c04, 3 * SIZE(AO)
|
|
+
|
|
+ ST c05, 4 * SIZE(AO)
|
|
+ ST c06, 5 * SIZE(AO)
|
|
+ ST c07, 6 * SIZE(AO)
|
|
+ ST c08, 7 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -4 * SIZE(C1)
|
|
+ ldi C2, -4 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c03, 2 * SIZE(C1)
|
|
+ ST c04, 3 * SIZE(C1)
|
|
+
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+ ST c06, 1 * SIZE(C2)
|
|
+ ST c07, 2 * SIZE(C2)
|
|
+ ST c08, 3 * SIZE(C2)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+ ldi C2, 4 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 2 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ bgt I, $L51
|
|
+ .align 4
|
|
+
|
|
+$L79:
|
|
+#ifdef LN
|
|
+ sll K, 1 + BASE_SHIFT, TMP1
|
|
+ addl B, TMP1, B
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ mov BO, B
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L80:
|
|
+ and N, 1, J
|
|
+ ble J, $L999
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, BASE_SHIFT, TMP1
|
|
+ subl B, TMP1, B
|
|
+
|
|
+ subl C, LDC, C
|
|
+#endif
|
|
+
|
|
+ mov C, C1
|
|
+#ifndef RT
|
|
+ addl C, LDC, C
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ addl M, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+ mov A, AORIG
|
|
+#else
|
|
+ mov A, AO
|
|
+#endif
|
|
+
|
|
+ and M, 1, I
|
|
+ ble I, $L100
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c04
|
|
+
|
|
+ sra KK, 2, L
|
|
+ mov B, BO
|
|
+ unop
|
|
+ ble L, $L115
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 0, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c04
|
|
+
|
|
+ sra TMP1, 2, L
|
|
+ unop
|
|
+ ble L, $L115
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L112:
|
|
+ ADD c01, t1, c01
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ MUL a2, b2, t2
|
|
+ LD a2, 5 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ MUL a3, b3, t3
|
|
+ LD a3, 6 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a4, b4, t4
|
|
+ LD a4, 7 * SIZE(AO)
|
|
+ LD b4, 7 * SIZE(BO)
|
|
+
|
|
+ ldi L, -1(L)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ bgt L, $L112
|
|
+ .align 4
|
|
+
|
|
+$L115:
|
|
+#if defined(LT) || defined(RN)
|
|
+ and KK, 3, L
|
|
+#else
|
|
+ and TMP1, 3, L
|
|
+#endif
|
|
+ ble L, $L118
|
|
+ .align 4
|
|
+
|
|
+$L116:
|
|
+ ADD c01, t1, c01
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, 1 * SIZE(AO)
|
|
+ LD b1, 1 * SIZE(BO)
|
|
+
|
|
+ ldi L, -1(L)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ ldi BO, 1 * SIZE(BO)
|
|
+ bgt L, $L116
|
|
+ .align 4
|
|
+
|
|
+$L118:
|
|
+ ADD c01, t1, c01
|
|
+ ADD c02, t2, c02
|
|
+ ADD c03, t3, c03
|
|
+ ADD c04, t4, c04
|
|
+
|
|
+ ADD c01, c02, c01
|
|
+ ADD c03, c04, c03
|
|
+ ADD c01, c03, c01
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+ subl KK, 1, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ addl B, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+#endif
|
|
+
|
|
+#if defined(RN) || defined(RT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -1 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 1 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ SXADDQ K, AORIG, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L100:
|
|
+ and M, 2, I
|
|
+ ble I, $L110
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c04
|
|
+
|
|
+ sra KK, 2, L
|
|
+ mov B, BO
|
|
+ ble L, $L105
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 1, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c04
|
|
+
|
|
+ sra TMP1, 2, L
|
|
+ ble L, $L105
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L102:
|
|
+ ADD c01, t1, c01
|
|
+ ldi L, -1(L)
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ MUL a2, b1, t2
|
|
+ LD a2, 5 * SIZE(AO)
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ MUL a3, b2, t3
|
|
+ LD a3, 6 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a4, b2, t4
|
|
+ LD a5, 7 * SIZE(AO)
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ MUL a1, b3, t1
|
|
+ LD a1, 8 * SIZE(AO)
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ MUL a2, b3, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ MUL a3, b4, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a5, b4, t4
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ bgt L, $L102
|
|
+ .align 4
|
|
+
|
|
+$L105:
|
|
+#if defined(LT) || defined(RN)
|
|
+ and KK, 3, L
|
|
+#else
|
|
+ and TMP1, 3, L
|
|
+#endif
|
|
+ ble L, $L108
|
|
+ .align 4
|
|
+
|
|
+$L106:
|
|
+ ADD c01, t1, c01
|
|
+ ldi L, -1(L)
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ MUL a2, b1, t2
|
|
+ LD a2, 3 * SIZE(AO)
|
|
+ LD b1, 1 * SIZE(BO)
|
|
+
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ unop
|
|
+ ldi BO, 1 * SIZE(BO)
|
|
+ bgt L, $L106
|
|
+ .align 4
|
|
+
|
|
+$L108:
|
|
+ ADD c01, t1, c01
|
|
+ ADD c02, t2, c02
|
|
+ ADD c03, t3, c03
|
|
+ ADD c04, t4, c04
|
|
+
|
|
+ ADD c01, c03, c01
|
|
+ ADD c02, c04, c02
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 2, TMP1
|
|
+#else
|
|
+ subl KK, 1, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 3 * SIZE(AO)
|
|
+ LD a2, 2 * SIZE(AO)
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c02, c02
|
|
+ MUL a2, c02, t1
|
|
+ SUB c01, t1, c01
|
|
+ MUL a3, c01, c01
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a2, c01, t1
|
|
+ SUB c02, t1, c02
|
|
+ MUL a3, c02, c02
|
|
+#endif
|
|
+
|
|
+#if defined(RN) || defined(RT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c02, 1 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -2 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 2 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 1 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L110:
|
|
+ sra M, 2, I
|
|
+ ble I, $L119
|
|
+ .align 4
|
|
+
|
|
+$L91:
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c04
|
|
+
|
|
+ sra KK, 2, L
|
|
+ mov B, BO
|
|
+ ble L, $L95
|
|
+
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 2, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 2, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c04
|
|
+
|
|
+ sra TMP1, 2, L
|
|
+ unop
|
|
+ ble L, $L95
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L92:
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ ldi L, -1(L)
|
|
+ MUL a2, b1, t2
|
|
+ LD a2, 5 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ unop
|
|
+ MUL a3, b1, t3
|
|
+ LD a3, 6 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a4, b1, t4
|
|
+ LD a4, 7 * SIZE(AO)
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b2, t1
|
|
+ LD a1, 8 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b2, t2
|
|
+ LD a2, 9 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ LD a3, 10 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a4, b2, t4
|
|
+ LD a4, 11 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ LD a1, 12 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ LD a2, 13 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ unop
|
|
+ MUL a3, b3, t3
|
|
+ LD a3, 14 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a4, b3, t4
|
|
+ LD a5, 15 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ MUL a1, b4, t1
|
|
+ LD a1, 16 * SIZE(AO)
|
|
+ ldi AO, 16 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ MUL a2, b4, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ MUL a3, b4, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a5, b4, t4
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ bgt L, $L92
|
|
+ .align 4
|
|
+
|
|
+$L95:
|
|
+#if defined(LT) || defined(RN)
|
|
+ and KK, 3, L
|
|
+#else
|
|
+ and TMP1, 3, L
|
|
+#endif
|
|
+ unop
|
|
+ ble L, $L98
|
|
+ .align 4
|
|
+
|
|
+$L96:
|
|
+ ADD c01, t1, c01
|
|
+ ldi L, -1(L)
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ ldi BO, 1 * SIZE(BO)
|
|
+ MUL a2, b1, t2
|
|
+ LD a2, 5 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ unop
|
|
+ MUL a3, b1, t3
|
|
+ LD a3, 6 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a4, b1, t4
|
|
+ LD a4, 7 * SIZE(AO)
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ bgt L, $L96
|
|
+ .align 4
|
|
+
|
|
+$L98:
|
|
+ ADD c01, t1, c01
|
|
+ ADD c02, t2, c02
|
|
+ ADD c03, t3, c03
|
|
+ ADD c04, t4, c04
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 4, TMP1
|
|
+#else
|
|
+ subl KK, 1, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+ SUB a3, c03, c03
|
|
+ SUB a4, c04, c04
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+ SUB a3, c03, c03
|
|
+ SUB a4, c04, c04
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 15 * SIZE(AO)
|
|
+ LD a2, 14 * SIZE(AO)
|
|
+ LD a3, 13 * SIZE(AO)
|
|
+ LD a4, 12 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c04, c04
|
|
+ MUL a2, c04, t1
|
|
+ SUB c03, t1, c03
|
|
+ MUL a3, c04, t1
|
|
+ SUB c02, t1, c02
|
|
+ MUL a4, c04, t1
|
|
+ SUB c01, t1, c01
|
|
+
|
|
+ LD b1, 10 * SIZE(AO)
|
|
+ LD b2, 9 * SIZE(AO)
|
|
+ LD b3, 8 * SIZE(AO)
|
|
+
|
|
+ MUL b1, c03, c03
|
|
+ MUL b2, c03, t1
|
|
+ SUB c02, t1, c02
|
|
+ MUL b3, c03, t1
|
|
+ SUB c01, t1, c01
|
|
+
|
|
+ LD a1, 5 * SIZE(AO)
|
|
+ LD a2, 4 * SIZE(AO)
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c02, c02
|
|
+ MUL a2, c02, t1
|
|
+ SUB c01, t1, c01
|
|
+ MUL a3, c01, c01
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a2, c01, t1
|
|
+ SUB c02, t1, c02
|
|
+ MUL a3, c01, t1
|
|
+ SUB c03, t1, c03
|
|
+ MUL a4, c01, t1
|
|
+ SUB c04, t1, c04
|
|
+
|
|
+ LD b1, 5 * SIZE(AO)
|
|
+ LD b2, 6 * SIZE(AO)
|
|
+ LD b3, 7 * SIZE(AO)
|
|
+
|
|
+ MUL b1, c02, c02
|
|
+ MUL b2, c02, t1
|
|
+ SUB c03, t1, c03
|
|
+ MUL b3, c02, t1
|
|
+ SUB c04, t1, c04
|
|
+
|
|
+ LD a1, 10 * SIZE(AO)
|
|
+ LD a2, 11 * SIZE(AO)
|
|
+ LD a3, 15 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c03, c03
|
|
+ MUL a2, c03, t1
|
|
+ SUB c04, t1, c04
|
|
+ MUL a3, c04, c04
|
|
+#endif
|
|
+
|
|
+#if defined(RN) || defined(RT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c04, c04
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c02, 1 * SIZE(BO)
|
|
+ ST c03, 2 * SIZE(BO)
|
|
+ ST c04, 3 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c03, 2 * SIZE(AO)
|
|
+ ST c04, 3 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -4 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c03, 2 * SIZE(C1)
|
|
+ ST c04, 3 * SIZE(C1)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 2 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L91
|
|
+ .align 4
|
|
+
|
|
+$L119:
|
|
+#ifdef LN
|
|
+ SXADDQ K, B, B
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ mov BO, B
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 8($sp)
|
|
+ fldd $f4, 16($sp)
|
|
+ fldd $f5, 24($sp)
|
|
+ fldd $f6, 32($sp)
|
|
+ fldd $f7, 40($sp)
|
|
+ fldd $f8, 48($sp)
|
|
+ fldd $f9, 56($sp)
|
|
+ clr $0
|
|
+ ldi $sp, STACKSIZE($sp)
|
|
+ ret
|
|
+ EPILOGUE
|
|
diff --git a/kernel/sw_64/trsm_kernel_4x4_LT.S b/kernel/sw_64/trsm_kernel_4x4_LT.S
|
|
new file mode 100644
|
|
index 0000000..54f8a51
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/trsm_kernel_4x4_LT.S
|
|
@@ -0,0 +1,5145 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#if !defined(EV4) && !defined(EV5) && !defined(SW6)
|
|
+#error "Architecture is not specified."
|
|
+#endif
|
|
+
|
|
+#ifdef SW6
|
|
+#define PREFETCHSIZE 56
|
|
+#define UNOP unop
|
|
+#endif
|
|
+
|
|
+#ifdef EV5
|
|
+#define PREFETCHSIZE 56
|
|
+#define UNOP
|
|
+#endif
|
|
+
|
|
+#ifdef EV4
|
|
+#define UNOP
|
|
+#endif
|
|
+
|
|
+#define STACKSIZE 88
|
|
+
|
|
+#define M $16
|
|
+#define N $17
|
|
+#define K $18
|
|
+#define A $20
|
|
+#define B $21
|
|
+#define C $22
|
|
+#define LDC $23
|
|
+
|
|
+#define C1 $19
|
|
+#define C2 $24
|
|
+#define C3 $25
|
|
+#define C4 $27
|
|
+
|
|
+#define AO $at
|
|
+#define BO $5
|
|
+#define I $6
|
|
+#define J $7
|
|
+#define L $8
|
|
+
|
|
+#define a1 $f16
|
|
+#define a2 $f17
|
|
+#define a3 $f18
|
|
+#define a4 $f19
|
|
+
|
|
+#define b1 $f20
|
|
+#define b2 $f21
|
|
+#define b3 $f22
|
|
+#define b4 $f23
|
|
+
|
|
+#define t1 $f24
|
|
+#define t2 $f25
|
|
+#define t3 $f26
|
|
+#define t4 $f27
|
|
+
|
|
+#define a5 $f28
|
|
+#define a6 $f30
|
|
+#define b5 $f29
|
|
+
|
|
+#define tmp $9
|
|
+
|
|
+#define alpha $f30
|
|
+
|
|
+#define c01 $f0
|
|
+#define c02 $f1
|
|
+#define c03 $f2
|
|
+#define c04 $f3
|
|
+
|
|
+#define c05 $f4
|
|
+#define c06 $f5
|
|
+#define c07 $f6
|
|
+#define c08 $f7
|
|
+
|
|
+#define c09 $f8
|
|
+#define c10 $f9
|
|
+#define c11 $f10
|
|
+#define c12 $f11
|
|
+
|
|
+#define c13 $f12
|
|
+#define c14 $f13
|
|
+#define c15 $f14
|
|
+#define c16 $f15
|
|
+
|
|
+#define TMP1 $0
|
|
+#define TMP2 $1
|
|
+#define KK $2
|
|
+#define AORIG $3
|
|
+#define OFFSET $4
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+ .frame $sp, STACKSIZE, $26, 0
|
|
+
|
|
+ ldi $sp, -STACKSIZE($sp)
|
|
+
|
|
+ ldl C, 0 + STACKSIZE($sp)
|
|
+ ldl LDC, 8 + STACKSIZE($sp)
|
|
+ ldl OFFSET, 16 + STACKSIZE($sp)
|
|
+
|
|
+ SXADDQ LDC, 0, LDC
|
|
+
|
|
+ fstd $f2, 0($sp)
|
|
+ fstd $f3, 8($sp)
|
|
+ fstd $f4, 16($sp)
|
|
+ fstd $f5, 24($sp)
|
|
+ fstd $f6, 32($sp)
|
|
+ fstd $f7, 40($sp)
|
|
+ fstd $f8, 48($sp)
|
|
+ fstd $f9, 56($sp)
|
|
+ stl $9, 64($sp)
|
|
+
|
|
+ cmple M, 0, $0
|
|
+ cmple N, 0, $1
|
|
+ cmple K, 0, $2
|
|
+
|
|
+ or $0, $1, $0
|
|
+ or $0, $2, $0
|
|
+ bne $0, $L999
|
|
+
|
|
+#ifdef LN
|
|
+ mulq M, K, TMP1
|
|
+ SXADDQ TMP1, A, A
|
|
+ SXADDQ M, C, C
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ negl OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ mulq N, K, TMP1
|
|
+ SXADDQ TMP1, B, B
|
|
+
|
|
+ mulq N, LDC, TMP1
|
|
+ addl TMP1, C, C
|
|
+
|
|
+ subl N, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+ sra N, 2, J
|
|
+ ble J, $L40
|
|
+ .align 4
|
|
+
|
|
+$L01:
|
|
+#ifdef RT
|
|
+ sll K, 2 + BASE_SHIFT, TMP1
|
|
+ subl B, TMP1, B
|
|
+
|
|
+ s4addl LDC, 0, TMP1
|
|
+ subl C, TMP1, C
|
|
+#endif
|
|
+
|
|
+ mov C, C1
|
|
+ addl C, LDC, C2
|
|
+ addl C2, LDC, C3
|
|
+#ifndef RT
|
|
+ s4addl LDC, C, C
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ addl C3, LDC, C4
|
|
+ fclr t2
|
|
+
|
|
+#ifdef LN
|
|
+ addl M, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+ mov A, AORIG
|
|
+#else
|
|
+ mov A, AO
|
|
+#endif
|
|
+
|
|
+ sra M, 2, I
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+ ble I, $L20
|
|
+ .align 4
|
|
+
|
|
+$L11:
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c11
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c12
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c16
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c15
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c02
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c06
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c05
|
|
+
|
|
+ fillcs 4 * SIZE(C1)
|
|
+ fclr c03
|
|
+ ldi L, -2(KK)
|
|
+ fclr c04
|
|
+
|
|
+ fillcs 7 * SIZE(C2)
|
|
+ fclr c08
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+ fclr c13
|
|
+
|
|
+ fillcs 4 * SIZE(C3)
|
|
+ flds $f31, 4 * SIZE(C3)
|
|
+ fclr c09
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c10
|
|
+
|
|
+ fillcs 7 * SIZE(C4)
|
|
+ fclr c14
|
|
+ fclr c07
|
|
+ ble KK, $L18
|
|
+#else
|
|
+
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 2, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 2, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c11
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c12
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c16
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c15
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c02
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c06
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c05
|
|
+
|
|
+ fillcs 4 * SIZE(C1)
|
|
+ fclr c03
|
|
+ ldi L, -2(TMP1)
|
|
+ fclr c04
|
|
+
|
|
+ fillcs 7 * SIZE(C2)
|
|
+ fclr c08
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ fclr c13
|
|
+
|
|
+ fillcs 4 * SIZE(C3)
|
|
+ fclr c09
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c10
|
|
+
|
|
+ fillcs 7 * SIZE(C4)
|
|
+ fclr c14
|
|
+ fclr c07
|
|
+ ble TMP1, $L18
|
|
+#endif
|
|
+
|
|
+ ble L, $L15
|
|
+ .align 5
|
|
+
|
|
+$L12:
|
|
+/* 1 */
|
|
+ ADD c11, t1, b5
|
|
+ fmov b5, c11
|
|
+#ifndef EV4
|
|
+ fillcs PREFETCHSIZE * SIZE(AO)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+ MUL b1, a1, t1
|
|
+#ifndef EV4
|
|
+ fillcs PREFETCHSIZE * SIZE(BO)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c12, t2, b5
|
|
+ fmov b5, c12
|
|
+ unop
|
|
+ MUL b1, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD c16, t3, b5
|
|
+ fmov b5, c16
|
|
+ unop
|
|
+ MUL b2, a2, t3
|
|
+ LD a5, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c15, t4, b5
|
|
+ fmov b5, c15
|
|
+ unop
|
|
+ MUL b2, a1, t4
|
|
+ LD b5, 0 * SIZE(BO)
|
|
+ FIMOVD b5, tmp
|
|
+/* 2 */
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ UNOP
|
|
+ MUL b1, a3, t1
|
|
+ UNOP
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ UNOP
|
|
+ MUL b1, a4, t2
|
|
+ UNOP
|
|
+
|
|
+ ADD c06, t3, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL b2, a4, t3
|
|
+ unop
|
|
+
|
|
+ ADD c05, t4, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL b4, a1, t4
|
|
+ unop
|
|
+
|
|
+/* 3 */
|
|
+ ADD c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL b3, a1, t1
|
|
+ unop
|
|
+
|
|
+ ADD c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ unop
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD c08, t3, b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL b2, a3, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+/* 4 */
|
|
+ ADD c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ unop
|
|
+ MUL b3, a3, t1
|
|
+ LD a6, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c14, t3, b5
|
|
+ fmov b5, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ ADD c07, t4, b5
|
|
+ fmov b5, c07
|
|
+ unop
|
|
+ MUL b4, a3, t4
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+
|
|
+/* 5 */
|
|
+ ADD c11, t1, b5
|
|
+ fmov b5, c11
|
|
+ unop
|
|
+ IFMOVD tmp, b5
|
|
+ MUL b5, a5, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c12, t2, b5
|
|
+ fmov b5, c12
|
|
+ ldi L, -2(L)
|
|
+ IFMOVD tmp, b5
|
|
+ MUL b5, a2, t2
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c16, t3, b5
|
|
+ fmov b5, c16
|
|
+ unop
|
|
+ MUL b2, a2, t3
|
|
+ unop
|
|
+
|
|
+ ADD c15, t4, b5
|
|
+ fmov b5, c15
|
|
+ unop
|
|
+ MUL b2, a5, t4
|
|
+ unop
|
|
+
|
|
+/* 6 */
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ IFMOVD tmp, b5
|
|
+ MUL b5, a6, t1
|
|
+ unop
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ IFMOVD tmp, b5
|
|
+ MUL b5, a4, t2
|
|
+ unop
|
|
+
|
|
+ ADD c06, t3, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL b2, a4, t3
|
|
+ unop
|
|
+
|
|
+ ADD c05, t4, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL b4, a5, t4
|
|
+ unop
|
|
+
|
|
+/* 7 */
|
|
+ ADD c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+ MUL b3, a5, t1
|
|
+ unop
|
|
+
|
|
+ ADD c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD c08, t3, b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, -3 * SIZE(AO)
|
|
+
|
|
+ ADD c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL b2, a6, t4
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+/* 8 */
|
|
+ ADD c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ unop
|
|
+ MUL b3, a6, t1
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD c14, t3, b5
|
|
+ fmov b5, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c07, t4, b5
|
|
+ fmov b5, c07
|
|
+ MUL b4, a6, t4
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+ bgt L, $L12
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ ADD c11, t1, b5
|
|
+ fmov b5, c11
|
|
+ MUL b1, a1, b5
|
|
+ fmov b5, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L17
|
|
+#else
|
|
+ blbs TMP1, $L17
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c12, t2, b5
|
|
+ fmov b5, c12
|
|
+ MUL b1, a2, b5
|
|
+ fmov b5, t2
|
|
+ ADD c16, t3, b5
|
|
+ fmov b5, c16
|
|
+ MUL b2, a2, b5
|
|
+ fmov b5, t3
|
|
+
|
|
+ ADD c15, t4, b5
|
|
+ fmov b5, c15
|
|
+ MUL b2, a1, b5
|
|
+ fmov b5, t4
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL b1, a3, b5
|
|
+ fmov b5, t1
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL b1, a4, b5
|
|
+ fmov b5, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c06, t3, b5
|
|
+ fmov b5, c06
|
|
+ MUL b2, a4, b5
|
|
+ fmov b5, t3
|
|
+ ADD c05, t4, b5
|
|
+ fmov b5, c05
|
|
+ MUL b4, a1, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ ADD c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL b3, a1, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ unop
|
|
+ MUL b3, a2, b5
|
|
+ fmov b5, t2
|
|
+ unop
|
|
+
|
|
+ ADD c08, t3, b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL b4, a2, b5
|
|
+ fmov b5, t3
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL b2, a3, b5
|
|
+ fmov b5, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ unop
|
|
+ MUL b3, a3, b5
|
|
+ fmov b5, t1
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL b3, a4, b5
|
|
+ fmov b5, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c14, t3, b5
|
|
+ fmov b5, c14
|
|
+ unop
|
|
+ MUL b4, a4, b5
|
|
+ fmov b5, t3
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c07, t4, b5
|
|
+ fmov b5, c07
|
|
+ unop
|
|
+ MUL b4, a3, b5
|
|
+ fmov b5, t4
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c11, t1, b5
|
|
+ fmov b5, c11
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL b1, a1, b5
|
|
+ fmov b5, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L17:
|
|
+ ADD c12, t2, b5
|
|
+ fmov b5, c12
|
|
+ MUL b1, a2, b5
|
|
+ fmov b5, t2
|
|
+ ADD c16, t3, b5
|
|
+ fmov b5, c16
|
|
+ MUL b2, a2, b5
|
|
+ fmov b5, t3
|
|
+
|
|
+ ADD c15, t4, b5
|
|
+ fmov b5, c15
|
|
+ MUL b2, a1, b5
|
|
+ fmov b5, t4
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL b1, a3, b5
|
|
+ fmov b5, t1
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL b1, a4, b5
|
|
+ fmov b5, t2
|
|
+ ADD c06, t3, b5
|
|
+ fmov b5, c06
|
|
+ MUL b2, a4, b5
|
|
+ fmov b5, t3
|
|
+
|
|
+ ADD c05, t4, b5
|
|
+ fmov b5, c05
|
|
+ MUL b4, a1, b5
|
|
+ fmov b5, t4
|
|
+ ADD c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ MUL b3, a1, b5
|
|
+ fmov b5, t1
|
|
+
|
|
+ ADD c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ MUL b3, a2, b5
|
|
+ fmov b5, t2
|
|
+ ADD c08, t3, b5
|
|
+ fmov b5, c08
|
|
+ MUL b4, a2, b5
|
|
+ fmov b5, t3
|
|
+
|
|
+ ADD c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ MUL b2, a3, b5
|
|
+ fmov b5, t4
|
|
+ ADD c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ MUL b3, a3, b5
|
|
+ fmov b5, t1
|
|
+
|
|
+ ADD c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ MUL b3, a4, b5
|
|
+ fmov b5, t2
|
|
+ ADD c14, t3, b5
|
|
+ fmov b5, c14
|
|
+ MUL b4, a4, b5
|
|
+ fmov b5, t3
|
|
+
|
|
+ ADD c07, t4, b5
|
|
+ fmov b5, c07
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL b4, a3, b5
|
|
+ fmov b5, t4
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c11, t1, b5
|
|
+ fmov b5, c11
|
|
+ ADD c12, t2, b5
|
|
+ fmov b5, c12
|
|
+ ADD c16, t3, b5
|
|
+ fmov b5, c16
|
|
+ ADD c15, t4, b5
|
|
+ fmov b5, c15
|
|
+ .align 4
|
|
+
|
|
+$L18:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 4, TMP1
|
|
+#else
|
|
+ subl KK, 4, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -4 * SIZE(AO)
|
|
+ ldi BO, -4 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+ LD b4, 7 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c05, b5
|
|
+ fmov b5, c05
|
|
+ SUB a3, c09, b5
|
|
+ fmov b5, c09
|
|
+ SUB a4, c13, b5
|
|
+ fmov b5, c13
|
|
+
|
|
+ SUB b1, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB b2, c06, b5
|
|
+ fmov b5, c06
|
|
+ SUB b3, c10, b5
|
|
+ fmov b5, c10
|
|
+ SUB b4, c14, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ LD a1, 8 * SIZE(BO)
|
|
+ LD a2, 9 * SIZE(BO)
|
|
+ LD a3, 10 * SIZE(BO)
|
|
+ LD a4, 11 * SIZE(BO)
|
|
+
|
|
+ LD b1, 12 * SIZE(BO)
|
|
+ LD b2, 13 * SIZE(BO)
|
|
+ LD b3, 14 * SIZE(BO)
|
|
+ LD b4, 15 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c03, b5
|
|
+ fmov b5, c03
|
|
+ SUB a2, c07, b5
|
|
+ fmov b5, c07
|
|
+ SUB a3, c11, b5
|
|
+ fmov b5, c11
|
|
+ SUB a4, c15, b5
|
|
+ fmov b5, c15
|
|
+
|
|
+ SUB b1, c04, b5
|
|
+ fmov b5, c04
|
|
+ SUB b2, c08, b5
|
|
+ fmov b5, c08
|
|
+ SUB b3, c12, b5
|
|
+ fmov b5, c12
|
|
+ SUB b4, c16, b5
|
|
+ fmov b5, c16
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ LD b1, 4 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(AO)
|
|
+ LD b4, 7 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a3, c03, b5
|
|
+ fmov b5, c03
|
|
+ SUB a4, c04, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ SUB b1, c05, b5
|
|
+ fmov b5, c05
|
|
+ SUB b2, c06, b5
|
|
+ fmov b5, c06
|
|
+ SUB b3, c07, b5
|
|
+ fmov b5, c07
|
|
+ SUB b4, c08, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ LD a1, 8 * SIZE(AO)
|
|
+ LD a2, 9 * SIZE(AO)
|
|
+ LD a3, 10 * SIZE(AO)
|
|
+ LD a4, 11 * SIZE(AO)
|
|
+
|
|
+ LD b1, 12 * SIZE(AO)
|
|
+ LD b2, 13 * SIZE(AO)
|
|
+ LD b3, 14 * SIZE(AO)
|
|
+ LD b4, 15 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c09, b5
|
|
+ fmov b5, c09
|
|
+ SUB a2, c10, b5
|
|
+ fmov b5, c10
|
|
+ SUB a3, c11, b5
|
|
+ fmov b5, c11
|
|
+ SUB a4, c12, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ SUB b1, c13, b5
|
|
+ fmov b5, c13
|
|
+ SUB b2, c14, b5
|
|
+ fmov b5, c14
|
|
+ SUB b3, c15, b5
|
|
+ fmov b5, c15
|
|
+ SUB b4, c16, b5
|
|
+ fmov b5, c16
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 15 * SIZE(AO)
|
|
+ LD a2, 14 * SIZE(AO)
|
|
+ LD a3, 13 * SIZE(AO)
|
|
+ LD a4, 12 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c04, b5
|
|
+ fmov b5, c04
|
|
+ MUL a1, c08, b5
|
|
+ fmov b5, c08
|
|
+ MUL a1, c12, b5
|
|
+ fmov b5, c12
|
|
+ MUL a1, c16, b5
|
|
+ fmov b5, c16
|
|
+
|
|
+ MUL a2, c04, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c08, b5
|
|
+ fmov b5, t2
|
|
+ MUL a2, c12, b5
|
|
+ fmov b5, t3
|
|
+ MUL a2, c16, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ SUB c07, t2, b5
|
|
+ fmov b5, c07
|
|
+ SUB c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ SUB c15, t4, b5
|
|
+ fmov b5, c15
|
|
+
|
|
+ MUL a3, c04, b5
|
|
+ fmov b5, t1
|
|
+ MUL a3, c08, b5
|
|
+ fmov b5, t2
|
|
+ MUL a3, c12, b5
|
|
+ fmov b5, t3
|
|
+ MUL a3, c16, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c02, t1, b5
|
|
+ fmov b5, c02
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ SUB c10, t3, b5
|
|
+ fmov b5, c10
|
|
+ SUB c14, t4, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ MUL a4, c04, b5
|
|
+ fmov b5, t1
|
|
+ MUL a4, c08, b5
|
|
+ fmov b5, t2
|
|
+ MUL a4, c12, b5
|
|
+ fmov b5, t3
|
|
+ MUL a4, c16, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ SUB c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ SUB c13, t4, b5
|
|
+ fmov b5, c13
|
|
+
|
|
+ LD b1, 10 * SIZE(AO)
|
|
+ LD b2, 9 * SIZE(AO)
|
|
+ LD b3, 8 * SIZE(AO)
|
|
+
|
|
+ MUL b1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL b1, c07, b5
|
|
+ fmov b5, c07
|
|
+ MUL b1, c11, b5
|
|
+ fmov b5, c11
|
|
+ MUL b1, c15, b5
|
|
+ fmov b5, c15
|
|
+
|
|
+ MUL b2, c03, b5
|
|
+ fmov b5, t1
|
|
+ MUL b2, c07, b5
|
|
+ fmov b5, t2
|
|
+ MUL b2, c11, b5
|
|
+ fmov b5, t3
|
|
+ MUL b2, c15, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c02, t1, b5
|
|
+ fmov b5, c02
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ SUB c10, t3, b5
|
|
+ fmov b5, c10
|
|
+ SUB c14, t4, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ MUL b3, c03, b5
|
|
+ fmov b5, t1
|
|
+ MUL b3, c07, b5
|
|
+ fmov b5, t2
|
|
+ MUL b3, c11, b5
|
|
+ fmov b5, t3
|
|
+ MUL b3, c15, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ SUB c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ SUB c13, t4, b5
|
|
+ fmov b5, c13
|
|
+
|
|
+ LD a1, 5 * SIZE(AO)
|
|
+ LD a2, 4 * SIZE(AO)
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a1, c06, b5
|
|
+ fmov b5, c06
|
|
+ MUL a1, c10, b5
|
|
+ fmov b5, c10
|
|
+ MUL a1, c14, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ MUL a2, c02, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c06, b5
|
|
+ fmov b5, t2
|
|
+ MUL a2, c10, b5
|
|
+ fmov b5, t3
|
|
+ MUL a2, c14, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ SUB c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ SUB c13, t4, b5
|
|
+ fmov b5, c13
|
|
+
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a3, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a3, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a3, c13, b5
|
|
+ fmov b5, c13
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, c13, b5
|
|
+ fmov b5, c13
|
|
+
|
|
+ MUL a2, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c05, b5
|
|
+ fmov b5, t2
|
|
+ MUL a2, c09, b5
|
|
+ fmov b5, t3
|
|
+ MUL a2, c13, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c02, t1, b5
|
|
+ fmov b5, c02
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ SUB c10, t3, b5
|
|
+ fmov b5, c10
|
|
+ SUB c14, t4, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a3, c05, b5
|
|
+ fmov b5, t2
|
|
+ MUL a3, c09, b5
|
|
+ fmov b5, t3
|
|
+ MUL a3, c13, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ SUB c07, t2, b5
|
|
+ fmov b5, c07
|
|
+ SUB c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ SUB c15, t4, b5
|
|
+ fmov b5, c15
|
|
+
|
|
+ MUL a4, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a4, c05, b5
|
|
+ fmov b5, t2
|
|
+ MUL a4, c09, b5
|
|
+ fmov b5, t3
|
|
+ MUL a4, c13, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c04, t1, b5
|
|
+ fmov b5, c04
|
|
+ SUB c08, t2, b5
|
|
+ fmov b5, c08
|
|
+ SUB c12, t3, b5
|
|
+ fmov b5, c12
|
|
+ SUB c16, t4, b5
|
|
+ fmov b5, c16
|
|
+
|
|
+ LD b1, 5 * SIZE(AO)
|
|
+ LD b2, 6 * SIZE(AO)
|
|
+ LD b3, 7 * SIZE(AO)
|
|
+
|
|
+ MUL b1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL b1, c06, b5
|
|
+ fmov b5, c06
|
|
+ MUL b1, c10, b5
|
|
+ fmov b5, c10
|
|
+ MUL b1, c14, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ MUL b2, c02, b5
|
|
+ fmov b5, t1
|
|
+ MUL b2, c06, b5
|
|
+ fmov b5, t2
|
|
+ MUL b2, c10, b5
|
|
+ fmov b5, t3
|
|
+ MUL b2, c14, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ SUB c07, t2, b5
|
|
+ fmov b5, c07
|
|
+ SUB c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ SUB c15, t4, b5
|
|
+ fmov b5, c15
|
|
+
|
|
+ MUL b3, c02, b5
|
|
+ fmov b5, t1
|
|
+ MUL b3, c06, b5
|
|
+ fmov b5, t2
|
|
+ MUL b3, c10, b5
|
|
+ fmov b5, t3
|
|
+ MUL b3, c14, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c04, t1, b5
|
|
+ fmov b5, c04
|
|
+ SUB c08, t2, b5
|
|
+ fmov b5, c08
|
|
+ SUB c12, t3, b5
|
|
+ fmov b5, c12
|
|
+ SUB c16, t4, b5
|
|
+ fmov b5, c16
|
|
+
|
|
+ LD a1, 10 * SIZE(AO)
|
|
+ LD a2, 11 * SIZE(AO)
|
|
+ LD a3, 15 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a1, c07, b5
|
|
+ fmov b5, c07
|
|
+ MUL a1, c11, b5
|
|
+ fmov b5, c11
|
|
+ MUL a1, c15, b5
|
|
+ fmov b5, c15
|
|
+
|
|
+ MUL a2, c03, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c07, b5
|
|
+ fmov b5, t2
|
|
+ MUL a2, c11, b5
|
|
+ fmov b5, t3
|
|
+ MUL a2, c15, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c04, t1, b5
|
|
+ fmov b5, c04
|
|
+ SUB c08, t2, b5
|
|
+ fmov b5, c08
|
|
+ SUB c12, t3, b5
|
|
+ fmov b5, c12
|
|
+ SUB c16, t4, b5
|
|
+ fmov b5, c16
|
|
+
|
|
+ MUL a3, c04, b5
|
|
+ fmov b5, c04
|
|
+ MUL a3, c08, b5
|
|
+ fmov b5, c08
|
|
+ MUL a3, c12, b5
|
|
+ fmov b5, c12
|
|
+ MUL a3, c16, b5
|
|
+ fmov b5, c16
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a1, c04, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ MUL a2, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c02, b5
|
|
+ fmov b5, t2
|
|
+ MUL a2, c03, b5
|
|
+ fmov b5, t3
|
|
+ MUL a2, c04, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ SUB c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ SUB c08, t4, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a3, c02, b5
|
|
+ fmov b5, t2
|
|
+ MUL a3, c03, b5
|
|
+ fmov b5, t3
|
|
+ MUL a3, c04, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ SUB c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ SUB c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ SUB c12, t4, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ MUL a4, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a4, c02, b5
|
|
+ fmov b5, t2
|
|
+ MUL a4, c03, b5
|
|
+ fmov b5, t3
|
|
+ MUL a4, c04, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c13, t1, b5
|
|
+ fmov b5, c13
|
|
+ SUB c14, t2, b5
|
|
+ fmov b5, c14
|
|
+ SUB c15, t3, b5
|
|
+ fmov b5, c15
|
|
+ SUB c16, t4, b5
|
|
+ fmov b5, c16
|
|
+
|
|
+ LD b1, 5 * SIZE(BO)
|
|
+ LD b2, 6 * SIZE(BO)
|
|
+ LD b3, 7 * SIZE(BO)
|
|
+
|
|
+ MUL b1, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL b1, c06, b5
|
|
+ fmov b5, c06
|
|
+ MUL b1, c07, b5
|
|
+ fmov b5, c07
|
|
+ MUL b1, c08, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ MUL b2, c05, b5
|
|
+ fmov b5, t1
|
|
+ MUL b2, c06, b5
|
|
+ fmov b5, t2
|
|
+ MUL b2, c07, b5
|
|
+ fmov b5, t3
|
|
+ MUL b2, c08, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ SUB c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ SUB c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ SUB c12, t4, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ MUL b3, c05, b5
|
|
+ fmov b5, t1
|
|
+ MUL b3, c06, b5
|
|
+ fmov b5, t2
|
|
+ MUL b3, c07, b5
|
|
+ fmov b5, t3
|
|
+ MUL b3, c08, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c13, t1, b5
|
|
+ fmov b5, c13
|
|
+ SUB c14, t2, b5
|
|
+ fmov b5, c14
|
|
+ SUB c15, t3, b5
|
|
+ fmov b5, c15
|
|
+ SUB c16, t4, b5
|
|
+ fmov b5, c16
|
|
+
|
|
+ LD a1, 10 * SIZE(BO)
|
|
+ LD a2, 11 * SIZE(BO)
|
|
+ LD a3, 15 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, c10, b5
|
|
+ fmov b5, c10
|
|
+ MUL a1, c11, b5
|
|
+ fmov b5, c11
|
|
+ MUL a1, c12, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ MUL a2, c09, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c10, b5
|
|
+ fmov b5, t2
|
|
+ MUL a2, c11, b5
|
|
+ fmov b5, t3
|
|
+ MUL a2, c12, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c13, t1, b5
|
|
+ fmov b5, c13
|
|
+ SUB c14, t2, b5
|
|
+ fmov b5, c14
|
|
+ SUB c15, t3, b5
|
|
+ fmov b5, c15
|
|
+ SUB c16, t4, b5
|
|
+ fmov b5, c16
|
|
+
|
|
+ MUL a3, c13, b5
|
|
+ fmov b5, c13
|
|
+ MUL a3, c14, b5
|
|
+ fmov b5, c14
|
|
+ MUL a3, c15, b5
|
|
+ fmov b5, c15
|
|
+ MUL a3, c16, b5
|
|
+ fmov b5, c16
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 15 * SIZE(BO)
|
|
+ LD a2, 14 * SIZE(BO)
|
|
+ LD a3, 13 * SIZE(BO)
|
|
+ LD a4, 12 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c13, b5
|
|
+ fmov b5, c13
|
|
+ MUL a1, c14, b5
|
|
+ fmov b5, c14
|
|
+ MUL a1, c15, b5
|
|
+ fmov b5, c15
|
|
+ MUL a1, c16, b5
|
|
+ fmov b5, c16
|
|
+
|
|
+ MUL a2, c13, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c14, b5
|
|
+ fmov b5, t2
|
|
+ MUL a2, c15, b5
|
|
+ fmov b5, t3
|
|
+ MUL a2, c16, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ SUB c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ SUB c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ SUB c12, t4, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ MUL a3, c13, b5
|
|
+ fmov b5, t1
|
|
+ MUL a3, c14, b5
|
|
+ fmov b5, t2
|
|
+ MUL a3, c15, b5
|
|
+ fmov b5, t3
|
|
+ MUL a3, c16, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ SUB c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ SUB c08, t4, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ MUL a4, c13, b5
|
|
+ fmov b5, t1
|
|
+ MUL a4, c14, b5
|
|
+ fmov b5, t2
|
|
+ MUL a4, c15, b5
|
|
+ fmov b5, t3
|
|
+ MUL a4, c16, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ SUB c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ SUB c04, t4, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ LD b1, 10 * SIZE(BO)
|
|
+ LD b2, 9 * SIZE(BO)
|
|
+ LD b3, 8 * SIZE(BO)
|
|
+
|
|
+ MUL b1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL b1, c10, b5
|
|
+ fmov b5, c10
|
|
+ MUL b1, c11, b5
|
|
+ fmov b5, c11
|
|
+ MUL b1, c12, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ MUL b2, c09, b5
|
|
+ fmov b5, t1
|
|
+ MUL b2, c10, b5
|
|
+ fmov b5, t2
|
|
+ MUL b2, c11, b5
|
|
+ fmov b5, t3
|
|
+ MUL b2, c12, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ SUB c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ SUB c08, t4, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ MUL b3, c09, b5
|
|
+ fmov b5, t1
|
|
+ MUL b3, c10, b5
|
|
+ fmov b5, t2
|
|
+ MUL b3, c11, b5
|
|
+ fmov b5, t3
|
|
+ MUL b3, c12, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ SUB c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ SUB c04, t4, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ LD a1, 5 * SIZE(BO)
|
|
+ LD a2, 4 * SIZE(BO)
|
|
+ LD a3, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, c06, b5
|
|
+ fmov b5, c06
|
|
+ MUL a1, c07, b5
|
|
+ fmov b5, c07
|
|
+ MUL a1, c08, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ MUL a2, c05, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c06, b5
|
|
+ fmov b5, t2
|
|
+ MUL a2, c07, b5
|
|
+ fmov b5, t3
|
|
+ MUL a2, c08, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ SUB c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ SUB c04, t4, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a3, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a3, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a3, c04, b5
|
|
+ fmov b5, c04
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c05, 1 * SIZE(BO)
|
|
+ ST c09, 2 * SIZE(BO)
|
|
+ ST c13, 3 * SIZE(BO)
|
|
+
|
|
+ ST c02, 4 * SIZE(BO)
|
|
+ ST c06, 5 * SIZE(BO)
|
|
+ ST c10, 6 * SIZE(BO)
|
|
+ ST c14, 7 * SIZE(BO)
|
|
+
|
|
+ ST c03, 8 * SIZE(BO)
|
|
+ ST c07, 9 * SIZE(BO)
|
|
+ ST c11, 10 * SIZE(BO)
|
|
+ ST c15, 11 * SIZE(BO)
|
|
+
|
|
+ ST c04, 12 * SIZE(BO)
|
|
+ ST c08, 13 * SIZE(BO)
|
|
+ ST c12, 14 * SIZE(BO)
|
|
+ ST c16, 15 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c03, 2 * SIZE(AO)
|
|
+ ST c04, 3 * SIZE(AO)
|
|
+
|
|
+ ST c05, 4 * SIZE(AO)
|
|
+ ST c06, 5 * SIZE(AO)
|
|
+ ST c07, 6 * SIZE(AO)
|
|
+ ST c08, 7 * SIZE(AO)
|
|
+
|
|
+ ST c09, 8 * SIZE(AO)
|
|
+ ST c10, 9 * SIZE(AO)
|
|
+ ST c11, 10 * SIZE(AO)
|
|
+ ST c12, 11 * SIZE(AO)
|
|
+
|
|
+ ST c13, 12 * SIZE(AO)
|
|
+ ST c14, 13 * SIZE(AO)
|
|
+ ST c15, 14 * SIZE(AO)
|
|
+ ST c16, 15 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -4 * SIZE(C1)
|
|
+ ldi C2, -4 * SIZE(C2)
|
|
+ ldi C3, -4 * SIZE(C3)
|
|
+ ldi C4, -4 * SIZE(C4)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c03, 2 * SIZE(C1)
|
|
+ ST c04, 3 * SIZE(C1)
|
|
+
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+ ST c06, 1 * SIZE(C2)
|
|
+ ST c07, 2 * SIZE(C2)
|
|
+ ST c08, 3 * SIZE(C2)
|
|
+
|
|
+ ST c09, 0 * SIZE(C3)
|
|
+ ST c10, 1 * SIZE(C3)
|
|
+ ST c11, 2 * SIZE(C3)
|
|
+ ST c12, 3 * SIZE(C3)
|
|
+
|
|
+ ST c13, 0 * SIZE(C4)
|
|
+ ST c14, 1 * SIZE(C4)
|
|
+ ST c15, 2 * SIZE(C4)
|
|
+ ST c16, 3 * SIZE(C4)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+ ldi C2, 4 * SIZE(C2)
|
|
+ ldi C3, 4 * SIZE(C3)
|
|
+ ldi C4, 4 * SIZE(C4)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 2 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ addl BO, TMP1, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ bgt I, $L11
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ and M, 2, I
|
|
+ ble I, $L30
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c10
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c14
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ ldi L, -2(KK)
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c05
|
|
+
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+ fclr c02
|
|
+ fclr c06
|
|
+ ble KK, $L28
|
|
+
|
|
+ ble L, $L25
|
|
+
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 1, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 2, TMP2
|
|
+ addl B, TMP2, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c10
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c14
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ ldi L, -2(TMP1)
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c05
|
|
+
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ fclr c02
|
|
+ fclr c06
|
|
+ ble TMP1, $L28
|
|
+
|
|
+ ble L, $L25
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ ADD c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t3, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL a1, b2, t3
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+
|
|
+ ADD c14, t4, b5
|
|
+ fmov b5, c14
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD b2, -7 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ unop
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ LD b3, -6 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL a1, b4, t3
|
|
+ LD a1, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ MUL a2, b4, t4
|
|
+ LD b5, -5 * SIZE(BO)
|
|
+ FIMOVD b5, tmp
|
|
+
|
|
+ ADD c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ unop
|
|
+ MUL a3, b1, t1
|
|
+ LD a2, 3 * SIZE(AO)
|
|
+
|
|
+ ADD c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL a4, b1, t2
|
|
+ LD b1, -4 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t3, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c14, t4, b5
|
|
+ fmov b5, c14
|
|
+ MUL a4, b2, t4
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ldi L, -2(L)
|
|
+ MUL a3, b3, t1
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a4, b3, t2
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ IFMOVD tmp, b5
|
|
+ MUL a3, b5, t3
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ IFMOVD tmp, b5
|
|
+ MUL a4, b5, t4
|
|
+ LD a4, 1 * SIZE(AO)
|
|
+ bgt L, $L22
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ ADD c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L27
|
|
+#else
|
|
+ blbs TMP1, $L27
|
|
+#endif
|
|
+
|
|
+ ADD c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL a2, b1, b5
|
|
+ fmov b5, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t3, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t3
|
|
+ unop
|
|
+
|
|
+ ADD c14, t4, b5
|
|
+ fmov b5, c14
|
|
+ unop
|
|
+ MUL a2, b2, b5
|
|
+ fmov b5, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b3, b5
|
|
+ fmov b5, t1
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b3, b5
|
|
+ fmov b5, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL a1, b4, b5
|
|
+ fmov b5, t3
|
|
+ LD a1, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL a2, b4, b5
|
|
+ fmov b5, t4
|
|
+ LD a2, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L27:
|
|
+ ADD c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ MUL a2, b1, b5
|
|
+ fmov b5, t2
|
|
+ ADD c13, t3, b5
|
|
+ fmov b5, c13
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t3
|
|
+
|
|
+ ADD c14, t4, b5
|
|
+ fmov b5, c14
|
|
+ MUL a2, b2, b5
|
|
+ fmov b5, t4
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b3, b5
|
|
+ fmov b5, t1
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, b3, b5
|
|
+ fmov b5, t2
|
|
+ ADD c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, b4, b5
|
|
+ fmov b5, t3
|
|
+
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a2, b4, b5
|
|
+ fmov b5, t4
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ ADD c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ ADD c13, t3, b5
|
|
+ fmov b5, c13
|
|
+ ADD c14, t4, b5
|
|
+ fmov b5, c14
|
|
+ .align 4
|
|
+
|
|
+$L28:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 2, TMP1
|
|
+#else
|
|
+ subl KK, 4, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -2 * SIZE(AO)
|
|
+ ldi BO, -4 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+ LD b4, 7 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c05, b5
|
|
+ fmov b5, c05
|
|
+ SUB a3, c09, b5
|
|
+ fmov b5, c09
|
|
+ SUB a4, c13, b5
|
|
+ fmov b5, c13
|
|
+
|
|
+ SUB b1, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB b2, c06, b5
|
|
+ fmov b5, c06
|
|
+ SUB b3, c10, b5
|
|
+ fmov b5, c10
|
|
+ SUB b4, c14, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ LD b1, 4 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(AO)
|
|
+ LD b4, 7 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a3, c05, b5
|
|
+ fmov b5, c05
|
|
+ SUB a4, c06, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ SUB b1, c09, b5
|
|
+ fmov b5, c09
|
|
+ SUB b2, c10, b5
|
|
+ fmov b5, c10
|
|
+ SUB b3, c13, b5
|
|
+ fmov b5, c13
|
|
+ SUB b4, c14, b5
|
|
+ fmov b5, c14
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 3 * SIZE(AO)
|
|
+ LD a2, 2 * SIZE(AO)
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a1, c06, b5
|
|
+ fmov b5, c06
|
|
+ MUL a1, c10, b5
|
|
+ fmov b5, c10
|
|
+ MUL a1, c14, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ MUL a2, c02, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c06, b5
|
|
+ fmov b5, t2
|
|
+ MUL a2, c10, b5
|
|
+ fmov b5, t3
|
|
+ MUL a2, c14, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ SUB c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ SUB c13, t4, b5
|
|
+ fmov b5, c13
|
|
+
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a3, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a3, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a3, c13, b5
|
|
+ fmov b5, c13
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, c13, b5
|
|
+ fmov b5, c13
|
|
+
|
|
+ MUL a2, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c05, b5
|
|
+ fmov b5, t2
|
|
+ MUL a2, c09, b5
|
|
+ fmov b5, t3
|
|
+ MUL a2, c13, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c02, t1, b5
|
|
+ fmov b5, c02
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ SUB c10, t3, b5
|
|
+ fmov b5, c10
|
|
+ SUB c14, t4, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ MUL a3, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a3, c06, b5
|
|
+ fmov b5, c06
|
|
+ MUL a3, c10, b5
|
|
+ fmov b5, c10
|
|
+ MUL a3, c14, b5
|
|
+ fmov b5, c14
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ MUL a2, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c02, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a3, c02, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ SUB c10, t2, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ MUL a4, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a4, c02, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c13, t1, b5
|
|
+ fmov b5, c13
|
|
+ SUB c14, t2, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ LD b1, 5 * SIZE(BO)
|
|
+ LD b2, 6 * SIZE(BO)
|
|
+ LD b3, 7 * SIZE(BO)
|
|
+
|
|
+ MUL b1, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL b1, c06, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL b2, c05, b5
|
|
+ fmov b5, t1
|
|
+ MUL b2, c06, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ SUB c10, t2, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ MUL b3, c05, b5
|
|
+ fmov b5, t1
|
|
+ MUL b3, c06, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c13, t1, b5
|
|
+ fmov b5, c13
|
|
+ SUB c14, t2, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ LD a1, 10 * SIZE(BO)
|
|
+ LD a2, 11 * SIZE(BO)
|
|
+ LD a3, 15 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, c10, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ MUL a2, c09, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c10, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c13, t1, b5
|
|
+ fmov b5, c13
|
|
+ SUB c14, t2, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ MUL a3, c13, b5
|
|
+ fmov b5, c13
|
|
+ MUL a3, c14, b5
|
|
+ fmov b5, c14
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 15 * SIZE(BO)
|
|
+ LD a2, 14 * SIZE(BO)
|
|
+ LD a3, 13 * SIZE(BO)
|
|
+ LD a4, 12 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c13, b5
|
|
+ fmov b5, c13
|
|
+ MUL a1, c14, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ MUL a2, c13, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c14, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ SUB c10, t2, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ MUL a3, c13, b5
|
|
+ fmov b5, t1
|
|
+ MUL a3, c14, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL a4, c13, b5
|
|
+ fmov b5, t1
|
|
+ MUL a4, c14, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c02, t2, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ LD b1, 10 * SIZE(BO)
|
|
+ LD b2, 9 * SIZE(BO)
|
|
+ LD b3, 8 * SIZE(BO)
|
|
+
|
|
+ MUL b1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL b1, c10, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ MUL b2, c09, b5
|
|
+ fmov b5, t1
|
|
+ MUL b2, c10, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL b3, c09, b5
|
|
+ fmov b5, t1
|
|
+ MUL b3, c10, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c02, t2, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ LD a1, 5 * SIZE(BO)
|
|
+ LD a2, 4 * SIZE(BO)
|
|
+ LD a3, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, c06, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL a2, c05, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c06, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c02, t2, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a3, c02, b5
|
|
+ fmov b5, c02
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c05, 1 * SIZE(BO)
|
|
+ ST c09, 2 * SIZE(BO)
|
|
+ ST c13, 3 * SIZE(BO)
|
|
+
|
|
+ ST c02, 4 * SIZE(BO)
|
|
+ ST c06, 5 * SIZE(BO)
|
|
+ ST c10, 6 * SIZE(BO)
|
|
+ ST c14, 7 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c05, 2 * SIZE(AO)
|
|
+ ST c06, 3 * SIZE(AO)
|
|
+
|
|
+ ST c09, 4 * SIZE(AO)
|
|
+ ST c10, 5 * SIZE(AO)
|
|
+ ST c13, 6 * SIZE(AO)
|
|
+ ST c14, 7 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -2 * SIZE(C1)
|
|
+ ldi C2, -2 * SIZE(C2)
|
|
+ ldi C3, -2 * SIZE(C3)
|
|
+ ldi C4, -2 * SIZE(C4)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+ ST c06, 1 * SIZE(C2)
|
|
+
|
|
+ ST c09, 0 * SIZE(C3)
|
|
+ ST c10, 1 * SIZE(C3)
|
|
+ ST c13, 0 * SIZE(C4)
|
|
+ ST c14, 1 * SIZE(C4)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 2 * SIZE(C1)
|
|
+ ldi C2, 2 * SIZE(C2)
|
|
+ ldi C3, 2 * SIZE(C3)
|
|
+ ldi C4, 2 * SIZE(C4)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 1 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L30:
|
|
+ and M, 1, I
|
|
+ ble I, $L39
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ ldi L, -2(KK)
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c09
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c13
|
|
+
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+ ble KK, $L38
|
|
+
|
|
+ ble L, $L35
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 0, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 2, TMP2
|
|
+ addl B, TMP2, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ ldi L, -2(TMP1)
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c09
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c13
|
|
+
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ ble TMP1, $L38
|
|
+
|
|
+ ble L, $L35
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L32:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ldi L, -2(L)
|
|
+ MUL a1, b1, t1
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a1, b2, t2
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ LD b5, 3 * SIZE(BO)
|
|
+ FIMOVD b5, tmp
|
|
+ MUL a1, b3, t3
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ MUL a1, b4, t4
|
|
+ LD a1, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a2, b1, t1
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ MUL a2, b2, t2
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+ MUL a2, b3, t3
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ IFMOVD tmp, b5
|
|
+ MUL a2, b5, t4
|
|
+ LD a2, 0 * SIZE(AO)
|
|
+ bgt L, $L32
|
|
+ .align 4
|
|
+
|
|
+$L35:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L37
|
|
+#else
|
|
+ blbs TMP1, $L37
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t2
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, b3, b5
|
|
+ fmov b5, t3
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ MUL a1, b4, b5
|
|
+ fmov b5, t4
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L37:
|
|
+ ADD c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t2
|
|
+ ADD c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, b3, b5
|
|
+ fmov b5, t3
|
|
+
|
|
+ ADD c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ MUL a1, b4, b5
|
|
+ fmov b5, t4
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ ADD c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ ADD c13, t4, b5
|
|
+ fmov b5, c13
|
|
+
|
|
+$L38:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 1, TMP1
|
|
+#else
|
|
+ subl KK, 4, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -1 * SIZE(AO)
|
|
+ ldi BO, -4 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c05, b5
|
|
+ fmov b5, c05
|
|
+ SUB a3, c09, b5
|
|
+ fmov b5, c09
|
|
+ SUB a4, c13, b5
|
|
+ fmov b5, c13
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c05, b5
|
|
+ fmov b5, c05
|
|
+ SUB a3, c09, b5
|
|
+ fmov b5, c09
|
|
+ SUB a4, c13, b5
|
|
+ fmov b5, c13
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, c13, b5
|
|
+ fmov b5, c13
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a2, c01, b5
|
|
+ fmov b5, t1
|
|
+ SUB c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, t1
|
|
+ SUB c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ MUL a4, c01, b5
|
|
+ fmov b5, t1
|
|
+ SUB c13, t1, b5
|
|
+ fmov b5, c13
|
|
+
|
|
+ LD b1, 5 * SIZE(BO)
|
|
+ LD b2, 6 * SIZE(BO)
|
|
+ LD b3, 7 * SIZE(BO)
|
|
+
|
|
+ MUL b1, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL b2, c05, b5
|
|
+ fmov b5, t1
|
|
+ SUB c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ MUL b3, c05, b5
|
|
+ fmov b5, t1
|
|
+ SUB c13, t1, b5
|
|
+ fmov b5, c13
|
|
+
|
|
+ LD a1, 10 * SIZE(BO)
|
|
+ LD a2, 11 * SIZE(BO)
|
|
+ LD a3, 15 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a2, c09, b5
|
|
+ fmov b5, t1
|
|
+ SUB c13, t1, b5
|
|
+ fmov b5, c13
|
|
+ MUL a3, c13, b5
|
|
+ fmov b5, c13
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 15 * SIZE(BO)
|
|
+ LD a2, 14 * SIZE(BO)
|
|
+ LD a3, 13 * SIZE(BO)
|
|
+ LD a4, 12 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c13, b5
|
|
+ fmov b5, c13
|
|
+ MUL a2, c13, b5
|
|
+ fmov b5, t1
|
|
+ SUB c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ MUL a3, c13, b5
|
|
+ fmov b5, t1
|
|
+ SUB c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ MUL a4, c13, b5
|
|
+ fmov b5, t1
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+
|
|
+ LD b1, 10 * SIZE(BO)
|
|
+ LD b2, 9 * SIZE(BO)
|
|
+ LD b3, 8 * SIZE(BO)
|
|
+
|
|
+ MUL b1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL b2, c09, b5
|
|
+ fmov b5, t1
|
|
+ SUB c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ MUL b3, c09, b5
|
|
+ fmov b5, t1
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+
|
|
+ LD a1, 5 * SIZE(BO)
|
|
+ LD a2, 4 * SIZE(BO)
|
|
+ LD a3, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a2, c05, b5
|
|
+ fmov b5, t1
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, c01
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c05, 1 * SIZE(BO)
|
|
+ ST c09, 2 * SIZE(BO)
|
|
+ ST c13, 3 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c05, 1 * SIZE(AO)
|
|
+ ST c09, 2 * SIZE(AO)
|
|
+ ST c13, 3 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -1 * SIZE(C1)
|
|
+ ldi C2, -1 * SIZE(C2)
|
|
+ ldi C3, -1 * SIZE(C3)
|
|
+ ldi C4, -1 * SIZE(C4)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+ ST c09, 0 * SIZE(C3)
|
|
+ ST c13, 0 * SIZE(C4)
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 0 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L39:
|
|
+#ifdef LN
|
|
+ sll K, 2 + BASE_SHIFT, TMP1
|
|
+ addl B, TMP1, B
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ mov BO, B
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ addl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ subl KK, 4, KK
|
|
+#endif
|
|
+ ldi J, -1(J)
|
|
+ bgt J, $L01
|
|
+ .align 4
|
|
+
|
|
+$L40:
|
|
+ and N, 2, J
|
|
+ ble J, $L80
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 1 + BASE_SHIFT, TMP1
|
|
+ subl B, TMP1, B
|
|
+
|
|
+ addl LDC, LDC, TMP1
|
|
+ subl C, TMP1, C
|
|
+#endif
|
|
+
|
|
+ mov C, C1
|
|
+ addl C, LDC, C2
|
|
+ fclr t1
|
|
+#ifndef RT
|
|
+ addl C2, LDC, C
|
|
+#endif
|
|
+ fclr t2
|
|
+
|
|
+#ifdef LN
|
|
+ addl M, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+ mov A, AORIG
|
|
+#else
|
|
+ mov A, AO
|
|
+#endif
|
|
+
|
|
+ sra M, 2, I
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+ ble I, $L60
|
|
+ .align 4
|
|
+
|
|
+$L51:
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c03
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c07
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c04
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c08
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c05
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c06
|
|
+
|
|
+ ldi L, -2(KK)
|
|
+
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ble KK, $L58
|
|
+
|
|
+ ble L, $L55
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 2, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 2, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c03
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c07
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c04
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c08
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c05
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c06
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ble TMP1, $L58
|
|
+
|
|
+ ble L, $L55
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L52:
|
|
+ ADD c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ unop
|
|
+
|
|
+ ADD c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ ldi L, -2(L)
|
|
+ MUL a2, b1, b5
|
|
+ fmov b5, t2
|
|
+ unop
|
|
+
|
|
+ ADD c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ unop
|
|
+ MUL a3, b1, b5
|
|
+ fmov b5, t3
|
|
+ unop
|
|
+
|
|
+ ADD c08, t4, b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL a4, b1, b5
|
|
+ fmov b5, t4
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ MUL a2, b2, b5
|
|
+ fmov b5, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL a3, b2, b5
|
|
+ fmov b5, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ unop
|
|
+ MUL a4, b2, b5
|
|
+ fmov b5, t4
|
|
+ LD a5, 3 * SIZE(AO)
|
|
+
|
|
+ ADD c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL a1, b3, b5
|
|
+ fmov b5, t1
|
|
+ LD b2, -1 * SIZE(BO)
|
|
+
|
|
+ ADD c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL a2, b3, b5
|
|
+ fmov b5, t2
|
|
+ unop
|
|
+
|
|
+ ADD c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ unop
|
|
+ MUL a3, b3, b5
|
|
+ fmov b5, t3
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+
|
|
+ ADD c08, t4, b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL a5, b3, b5
|
|
+ fmov b5, t4
|
|
+ LD b3, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b4, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, -4 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b4, b5
|
|
+ fmov b5, t2
|
|
+ LD a2, -3 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+ MUL a3, b4, b5
|
|
+ fmov b5, t3
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a5, b4, b5
|
|
+ fmov b5, t4
|
|
+ LD b4, 1 * SIZE(BO)
|
|
+ bgt L, $L52
|
|
+ .align 4
|
|
+
|
|
+$L55:
|
|
+ ADD c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L57
|
|
+#else
|
|
+ blbs TMP1, $L57
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ MUL a2, b1, b5
|
|
+ fmov b5, t2
|
|
+ ADD c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ MUL a3, b1, b5
|
|
+ fmov b5, t3
|
|
+
|
|
+ ADD c08, t4, b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL a4, b1, b5
|
|
+ fmov b5, t4
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b2, b5
|
|
+ fmov b5, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL a3, b2, b5
|
|
+ fmov b5, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a4, b2, b5
|
|
+ fmov b5, t4
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L57:
|
|
+ ADD c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ MUL a2, b1, b5
|
|
+ fmov b5, t2
|
|
+ ADD c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ MUL a3, b1, b5
|
|
+ fmov b5, t3
|
|
+
|
|
+ ADD c08, t4, b5
|
|
+ fmov b5, c08
|
|
+ MUL a4, b1, b5
|
|
+ fmov b5, t4
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t1
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, b2, b5
|
|
+ fmov b5, t2
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ MUL a3, b2, b5
|
|
+ fmov b5, t3
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL a4, b2, b5
|
|
+ fmov b5, t4
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ ADD c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ ADD c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ ADD c08, t4, b5
|
|
+ fmov b5, c08
|
|
+ .align 4
|
|
+
|
|
+$L58:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 4, TMP1
|
|
+#else
|
|
+ subl KK, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -4 * SIZE(AO)
|
|
+ ldi BO, -2 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+ LD b4, 7 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c05, b5
|
|
+ fmov b5, c05
|
|
+ SUB a3, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a4, c06, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ SUB b1, c03, b5
|
|
+ fmov b5, c03
|
|
+ SUB b2, c07, b5
|
|
+ fmov b5, c07
|
|
+ SUB b3, c04, b5
|
|
+ fmov b5, c04
|
|
+ SUB b4, c08, b5
|
|
+ fmov b5, c08
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ LD b1, 4 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(AO)
|
|
+ LD b4, 7 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a3, c03, b5
|
|
+ fmov b5, c03
|
|
+ SUB a4, c04, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ SUB b1, c05, b5
|
|
+ fmov b5, c05
|
|
+ SUB b2, c06, b5
|
|
+ fmov b5, c06
|
|
+ SUB b3, c07, b5
|
|
+ fmov b5, c07
|
|
+ SUB b4, c08, b5
|
|
+ fmov b5, c08
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 15 * SIZE(AO)
|
|
+ LD a2, 14 * SIZE(AO)
|
|
+ LD a3, 13 * SIZE(AO)
|
|
+ LD a4, 12 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c04, b5
|
|
+ fmov b5, c04
|
|
+ MUL a1, c08, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ MUL a2, c04, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c08, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ SUB c07, t2, b5
|
|
+ fmov b5, c07
|
|
+
|
|
+ MUL a3, c04, b5
|
|
+ fmov b5, t1
|
|
+ MUL a3, c08, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c02, t1, b5
|
|
+ fmov b5, c02
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL a4, c04, b5
|
|
+ fmov b5, t1
|
|
+ MUL a4, c08, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c05, t2, b5
|
|
+ fmov b5, c05
|
|
+
|
|
+ LD b1, 10 * SIZE(AO)
|
|
+ LD b2, 9 * SIZE(AO)
|
|
+ LD b3, 8 * SIZE(AO)
|
|
+
|
|
+ MUL b1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL b1, c07, b5
|
|
+ fmov b5, c07
|
|
+
|
|
+ MUL b2, c03, b5
|
|
+ fmov b5, t1
|
|
+ MUL b2, c07, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c02, t1, b5
|
|
+ fmov b5, c02
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL b3, c03, b5
|
|
+ fmov b5, t1
|
|
+ MUL b3, c07, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c05, t2, b5
|
|
+ fmov b5, c05
|
|
+
|
|
+ LD a1, 5 * SIZE(AO)
|
|
+ LD a2, 4 * SIZE(AO)
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a1, c06, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL a2, c02, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c06, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c05, t2, b5
|
|
+ fmov b5, c05
|
|
+
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a3, c05, b5
|
|
+ fmov b5, c05
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c05, b5
|
|
+ fmov b5, c05
|
|
+
|
|
+ MUL a2, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c05, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c02, t1, b5
|
|
+ fmov b5, c02
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a3, c05, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ SUB c07, t2, b5
|
|
+ fmov b5, c07
|
|
+
|
|
+ MUL a4, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a4, c05, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c04, t1, b5
|
|
+ fmov b5, c04
|
|
+ SUB c08, t2, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ LD b1, 5 * SIZE(AO)
|
|
+ LD b2, 6 * SIZE(AO)
|
|
+ LD b3, 7 * SIZE(AO)
|
|
+
|
|
+ MUL b1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL b1, c06, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL b2, c02, b5
|
|
+ fmov b5, t1
|
|
+ MUL b2, c06, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ SUB c07, t2, b5
|
|
+ fmov b5, c07
|
|
+
|
|
+ MUL b3, c02, b5
|
|
+ fmov b5, t1
|
|
+ MUL b3, c06, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c04, t1, b5
|
|
+ fmov b5, c04
|
|
+ SUB c08, t2, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ LD a1, 10 * SIZE(AO)
|
|
+ LD a2, 11 * SIZE(AO)
|
|
+ LD a3, 15 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a1, c07, b5
|
|
+ fmov b5, c07
|
|
+
|
|
+ MUL a2, c03, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c07, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c04, t1, b5
|
|
+ fmov b5, c04
|
|
+ SUB c08, t2, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ MUL a3, c04, b5
|
|
+ fmov b5, c04
|
|
+ MUL a3, c08, b5
|
|
+ fmov b5, c08
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a1, c04, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ MUL a2, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c02, b5
|
|
+ fmov b5, t2
|
|
+ MUL a2, c03, b5
|
|
+ fmov b5, t3
|
|
+ MUL a2, c04, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ SUB c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ SUB c08, t4, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ MUL a3, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a3, c06, b5
|
|
+ fmov b5, c06
|
|
+ MUL a3, c07, b5
|
|
+ fmov b5, c07
|
|
+ MUL a3, c08, b5
|
|
+ fmov b5, c08
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 3 * SIZE(BO)
|
|
+ LD a2, 2 * SIZE(BO)
|
|
+ LD a3, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, c06, b5
|
|
+ fmov b5, c06
|
|
+ MUL a1, c07, b5
|
|
+ fmov b5, c07
|
|
+ MUL a1, c08, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ MUL a2, c05, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c06, b5
|
|
+ fmov b5, t2
|
|
+ MUL a2, c07, b5
|
|
+ fmov b5, t3
|
|
+ MUL a2, c08, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ SUB c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ SUB c04, t4, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a3, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a3, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a3, c04, b5
|
|
+ fmov b5, c04
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c05, 1 * SIZE(BO)
|
|
+ ST c02, 2 * SIZE(BO)
|
|
+ ST c06, 3 * SIZE(BO)
|
|
+
|
|
+ ST c03, 4 * SIZE(BO)
|
|
+ ST c07, 5 * SIZE(BO)
|
|
+ ST c04, 6 * SIZE(BO)
|
|
+ ST c08, 7 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c03, 2 * SIZE(AO)
|
|
+ ST c04, 3 * SIZE(AO)
|
|
+
|
|
+ ST c05, 4 * SIZE(AO)
|
|
+ ST c06, 5 * SIZE(AO)
|
|
+ ST c07, 6 * SIZE(AO)
|
|
+ ST c08, 7 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -4 * SIZE(C1)
|
|
+ ldi C2, -4 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c03, 2 * SIZE(C1)
|
|
+ ST c04, 3 * SIZE(C1)
|
|
+
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+ ST c06, 1 * SIZE(C2)
|
|
+ ST c07, 2 * SIZE(C2)
|
|
+ ST c08, 3 * SIZE(C2)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+ ldi C2, 4 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 2 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ bgt I, $L51
|
|
+ .align 4
|
|
+
|
|
+$L60:
|
|
+ and M, 2, I
|
|
+ ble I, $L70
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ ldi L, -2(KK)
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+
|
|
+ ble KK, $L68
|
|
+
|
|
+ ble L, $L65
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 1, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ ldi L, -2(TMP1)
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ble TMP1, $L68
|
|
+
|
|
+ ble L, $L65
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L62:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ unop
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL a2, b1, b5
|
|
+ fmov b5, t2
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ ldi L, -2(L)
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t3
|
|
+ LD a1, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL a2, b2, b5
|
|
+ fmov b5, t4
|
|
+ LD a2, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ LD b2, 3 * SIZE(BO)
|
|
+ MUL a3, b3, b5
|
|
+ fmov b5, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a4, b3, b5
|
|
+ fmov b5, t2
|
|
+ LD b3, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL a3, b4, b5
|
|
+ fmov b5, t3
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ MUL a4, b4, b5
|
|
+ fmov b5, t4
|
|
+ LD b4, 1 * SIZE(BO)
|
|
+ unop
|
|
+
|
|
+ LD a4, 1 * SIZE(AO)
|
|
+ unop
|
|
+ unop
|
|
+ bgt L, $L62
|
|
+ .align 4
|
|
+
|
|
+$L65:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L67
|
|
+#else
|
|
+ blbs TMP1, $L67
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b1, b5
|
|
+ fmov b5, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t3
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL a2, b2, b5
|
|
+ fmov b5, t4
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ LD b2, -1 * SIZE(BO)
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ .align 4
|
|
+
|
|
+$L67:
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, b1, b5
|
|
+ fmov b5, t2
|
|
+ ADD c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t3
|
|
+
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a2, b2, b5
|
|
+ fmov b5, t4
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ .align 4
|
|
+
|
|
+$L68:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 2, TMP1
|
|
+#else
|
|
+ subl KK, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -2 * SIZE(AO)
|
|
+ ldi BO, -2 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c05, b5
|
|
+ fmov b5, c05
|
|
+ SUB a3, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a4, c06, b5
|
|
+ fmov b5, c06
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a3, c05, b5
|
|
+ fmov b5, c05
|
|
+ SUB a4, c06, b5
|
|
+ fmov b5, c06
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 3 * SIZE(AO)
|
|
+ LD a2, 2 * SIZE(AO)
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a1, c06, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL a2, c02, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c06, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c05, t2, b5
|
|
+ fmov b5, c05
|
|
+
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a3, c05, b5
|
|
+ fmov b5, c05
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c05, b5
|
|
+ fmov b5, c05
|
|
+
|
|
+ MUL a2, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c05, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c02, t1, b5
|
|
+ fmov b5, c02
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL a3, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a3, c06, b5
|
|
+ fmov b5, c06
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ MUL a2, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c02, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL a3, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a3, c06, b5
|
|
+ fmov b5, c06
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 3 * SIZE(BO)
|
|
+ LD a2, 2 * SIZE(BO)
|
|
+ LD a3, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, c06, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL a2, c05, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c06, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c02, t2, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a3, c02, b5
|
|
+ fmov b5, c02
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c05, 1 * SIZE(BO)
|
|
+ ST c02, 2 * SIZE(BO)
|
|
+ ST c06, 3 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c05, 2 * SIZE(AO)
|
|
+ ST c06, 3 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -2 * SIZE(C1)
|
|
+ ldi C2, -2 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+ ST c06, 1 * SIZE(C2)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 2 * SIZE(C1)
|
|
+ ldi C2, 2 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 1 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L70:
|
|
+ and M, 1, I
|
|
+ ble I, $L79
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c06
|
|
+
|
|
+ ldi L, -2(KK)
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+
|
|
+ ble KK, $L78
|
|
+
|
|
+ ble L, $L75
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 0, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c06
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ble TMP1, $L78
|
|
+
|
|
+ ble L, $L75
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L72:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ldi L, -2(L)
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t2
|
|
+ LD a1, 1 * SIZE(AO)
|
|
+ LD b2, 3 * SIZE(BO)
|
|
+
|
|
+ ADD c02, t3, b5
|
|
+ fmov b5, c02
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a2, b3, b5
|
|
+ fmov b5, t3
|
|
+ LD b3, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ MUL a2, b4, b5
|
|
+ fmov b5, t4
|
|
+ LD a2, 0 * SIZE(AO)
|
|
+ LD b4, 5 * SIZE(BO)
|
|
+
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ unop
|
|
+ unop
|
|
+ bgt L, $L72
|
|
+ .align 4
|
|
+
|
|
+$L75:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L77
|
|
+#else
|
|
+ blbs TMP1, $L77
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t2
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L77:
|
|
+ ADD c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t2
|
|
+ ADD c02, t3, b5
|
|
+ fmov b5, c02
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ ADD c01, c02, b5
|
|
+ fmov b5, c01
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ ADD c05, c06, b5
|
|
+ fmov b5, c05
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD c05, t2, b5
|
|
+ fmov b5, c05
|
|
+
|
|
+ .align 4
|
|
+
|
|
+$L78:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 1, TMP1
|
|
+#else
|
|
+ subl KK, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -1 * SIZE(AO)
|
|
+ ldi BO, -2 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c05, b5
|
|
+ fmov b5, c05
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c05, b5
|
|
+ fmov b5, c05
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c05, b5
|
|
+ fmov b5, c05
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a2, c01, b5
|
|
+ fmov b5, t1
|
|
+ SUB c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ MUL a3, c05, b5
|
|
+ fmov b5, c05
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 3 * SIZE(BO)
|
|
+ LD a2, 2 * SIZE(BO)
|
|
+ LD a3, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a2, c05, b5
|
|
+ fmov b5, t1
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, c01
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c05, 1 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c05, 1 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -1 * SIZE(C1)
|
|
+ ldi C2, -1 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 0 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L79:
|
|
+#ifdef LN
|
|
+ sll K, 1 + BASE_SHIFT, TMP1
|
|
+ addl B, TMP1, B
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ mov BO, B
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L80:
+ and N, 1, J
+ ble J, $L999
+
+#ifdef RT
+ sll K, BASE_SHIFT, TMP1
+ subl B, TMP1, B
+
+ subl C, LDC, C
+#endif
+
+ mov C, C1
+#ifndef RT
+ addl C, LDC, C
+#endif
+
|
|
+#ifdef LN
|
|
+ addl M, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+ mov A, AORIG
|
|
+#else
|
|
+ mov A, AO
|
|
+#endif
|
|
+
|
|
+ sra M, 2, I
|
|
+ ble I, $L100
|
|
+ .align 4
|
|
+
|
|
+$L91:
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c04
|
|
+
|
|
+ sra KK, 2, L
|
|
+ mov B, BO
|
|
+ ble L, $L95
|
|
+
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 2, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 2, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c04
|
|
+
|
|
+ sra TMP1, 2, L
|
|
+ unop
|
|
+ ble L, $L95
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L92:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ldi L, -1(L)
|
|
+ MUL a2, b1, b5
|
|
+ fmov b5, t2
|
|
+ LD a2, 5 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL a3, b1, b5
|
|
+ fmov b5, t3
|
|
+ LD a3, 6 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a4, b1, b5
|
|
+ fmov b5, t4
|
|
+ LD a4, 7 * SIZE(AO)
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 8 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b2, b5
|
|
+ fmov b5, t2
|
|
+ LD a2, 9 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL a3, b2, b5
|
|
+ fmov b5, t3
|
|
+ LD a3, 10 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a4, b2, b5
|
|
+ fmov b5, t4
|
|
+ LD a4, 11 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b3, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 12 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b3, b5
|
|
+ fmov b5, t2
|
|
+ LD a2, 13 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL a3, b3, b5
|
|
+ fmov b5, t3
|
|
+ LD a3, 14 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a4, b3, b5
|
|
+ fmov b5, t4
|
|
+ LD a5, 15 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b4, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 16 * SIZE(AO)
|
|
+ ldi AO, 16 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ MUL a2, b4, b5
|
|
+ fmov b5, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ MUL a3, b4, b5
|
|
+ fmov b5, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a5, b4, b5
|
|
+ fmov b5, t4
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ bgt L, $L92
|
|
+ .align 4
|
|
+
|
|
+$L95:
|
|
+#if defined(LT) || defined(RN)
|
|
+ and KK, 3, L
|
|
+#else
|
|
+ and TMP1, 3, L
|
|
+#endif
|
|
+ unop
|
|
+ ble L, $L98
|
|
+ .align 4
|
|
+
|
|
+$L96:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ldi L, -1(L)
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ldi BO, 1 * SIZE(BO)
|
|
+ MUL a2, b1, b5
|
|
+ fmov b5, t2
|
|
+ LD a2, 5 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL a3, b1, b5
|
|
+ fmov b5, t3
|
|
+ LD a3, 6 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a4, b1, b5
|
|
+ fmov b5, t4
|
|
+ LD a4, 7 * SIZE(AO)
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ bgt L, $L96
|
|
+ .align 4
|
|
+
|
|
+$L98:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 4, TMP1
|
|
+#else
|
|
+ subl KK, 1, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a3, c03, b5
|
|
+ fmov b5, c03
|
|
+ SUB a4, c04, b5
|
|
+ fmov b5, c04
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a3, c03, b5
|
|
+ fmov b5, c03
|
|
+ SUB a4, c04, b5
|
|
+ fmov b5, c04
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 15 * SIZE(AO)
|
|
+ LD a2, 14 * SIZE(AO)
|
|
+ LD a3, 13 * SIZE(AO)
|
|
+ LD a4, 12 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c04, b5
|
|
+ fmov b5, c04
|
|
+ MUL a2, c04, b5
|
|
+ fmov b5, t1
|
|
+ SUB c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ MUL a3, c04, b5
|
|
+ fmov b5, t1
|
|
+ SUB c02, t1, b5
|
|
+ fmov b5, c02
|
|
+ MUL a4, c04, b5
|
|
+ fmov b5, t1
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+
|
|
+ LD b1, 10 * SIZE(AO)
|
|
+ LD b2, 9 * SIZE(AO)
|
|
+ LD b3, 8 * SIZE(AO)
|
|
+
|
|
+ MUL b1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL b2, c03, b5
|
|
+ fmov b5, t1
|
|
+ SUB c02, t1, b5
|
|
+ fmov b5, c02
|
|
+ MUL b3, c03, b5
|
|
+ fmov b5, t1
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+
|
|
+ LD a1, 5 * SIZE(AO)
|
|
+ LD a2, 4 * SIZE(AO)
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, c02, b5
|
|
+ fmov b5, t1
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, c01
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a2, c01, b5
|
|
+ fmov b5, t1
|
|
+ SUB c02, t1, b5
|
|
+ fmov b5, c02
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, t1
|
|
+ SUB c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ MUL a4, c01, b5
|
|
+ fmov b5, t1
|
|
+ SUB c04, t1, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ LD b1, 5 * SIZE(AO)
|
|
+ LD b2, 6 * SIZE(AO)
|
|
+ LD b3, 7 * SIZE(AO)
|
|
+
|
|
+ MUL b1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL b2, c02, b5
|
|
+ fmov b5, t1
|
|
+ SUB c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ MUL b3, c02, b5
|
|
+ fmov b5, t1
|
|
+ SUB c04, t1, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ LD a1, 10 * SIZE(AO)
|
|
+ LD a2, 11 * SIZE(AO)
|
|
+ LD a3, 15 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a2, c03, b5
|
|
+ fmov b5, t1
|
|
+ SUB c04, t1, b5
|
|
+ fmov b5, c04
|
|
+ MUL a3, c04, b5
|
|
+ fmov b5, c04
|
|
+#endif
|
|
+
|
|
+#if defined(RN) || defined(RT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a1, c04, b5
|
|
+ fmov b5, c04
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c02, 1 * SIZE(BO)
|
|
+ ST c03, 2 * SIZE(BO)
|
|
+ ST c04, 3 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c03, 2 * SIZE(AO)
|
|
+ ST c04, 3 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -4 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c03, 2 * SIZE(C1)
|
|
+ ST c04, 3 * SIZE(C1)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 2 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L91
|
|
+ .align 4
|
|
+
|
|
+$L100:
|
|
+ and M, 2, I
|
|
+ ble I, $L110
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c04
|
|
+
|
|
+ sra KK, 2, L
|
|
+ mov B, BO
|
|
+ ble L, $L105
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 1, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c04
|
|
+
|
|
+ sra TMP1, 2, L
|
|
+ ble L, $L105
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L102:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ldi L, -1(L)
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, b1, b5
|
|
+ fmov b5, t2
|
|
+ LD a2, 5 * SIZE(AO)
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ MUL a3, b2, b5
|
|
+ fmov b5, t3
|
|
+ LD a3, 6 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a4, b2, b5
|
|
+ fmov b5, t4
|
|
+ LD a5, 7 * SIZE(AO)
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b3, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 8 * SIZE(AO)
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, b3, b5
|
|
+ fmov b5, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ MUL a3, b4, b5
|
|
+ fmov b5, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a5, b4, b5
|
|
+ fmov b5, t4
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ bgt L, $L102
|
|
+ .align 4
|
|
+
|
|
+$L105:
|
|
+#if defined(LT) || defined(RN)
|
|
+ and KK, 3, L
|
|
+#else
|
|
+ and TMP1, 3, L
|
|
+#endif
|
|
+ ble L, $L108
|
|
+ .align 4
|
|
+
|
|
+$L106:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ldi L, -1(L)
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, b1, b5
|
|
+ fmov b5, t2
|
|
+ LD a2, 3 * SIZE(AO)
|
|
+ LD b1, 1 * SIZE(BO)
|
|
+
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ unop
|
|
+ ldi BO, 1 * SIZE(BO)
|
|
+ bgt L, $L106
|
|
+ .align 4
|
|
+
|
|
+$L108:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ ADD c01, c03, b5
|
|
+ fmov b5, c01
|
|
+ ADD c02, c04, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 2, TMP1
|
|
+#else
|
|
+ subl KK, 1, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 3 * SIZE(AO)
|
|
+ LD a2, 2 * SIZE(AO)
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, c02, b5
|
|
+ fmov b5, t1
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, c01
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a2, c01, b5
|
|
+ fmov b5, t1
|
|
+ SUB c02, t1, b5
|
|
+ fmov b5, c02
|
|
+ MUL a3, c02, b5
|
|
+ fmov b5, c02
|
|
+#endif
|
|
+
|
|
+#if defined(RN) || defined(RT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c02, 1 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -2 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 2 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 1 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L110:
|
|
+ and M, 1, I
|
|
+ ble I, $L119
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c04
|
|
+
|
|
+ sra KK, 2, L
|
|
+ mov B, BO
|
|
+ unop
|
|
+ ble L, $L115
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 0, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c04
|
|
+
|
|
+ sra TMP1, 2, L
|
|
+ unop
|
|
+ ble L, $L115
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L112:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, b2, b5
|
|
+ fmov b5, t2
|
|
+ LD a2, 5 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ MUL a3, b3, b5
|
|
+ fmov b5, t3
|
|
+ LD a3, 6 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a4, b4, b5
|
|
+ fmov b5, t4
|
|
+ LD a4, 7 * SIZE(AO)
|
|
+ LD b4, 7 * SIZE(BO)
|
|
+
|
|
+ ldi L, -1(L)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ bgt L, $L112
|
|
+ .align 4
|
|
+
|
|
+$L115:
|
|
+#if defined(LT) || defined(RN)
|
|
+ and KK, 3, L
|
|
+#else
|
|
+ and TMP1, 3, L
|
|
+#endif
|
|
+ ble L, $L118
|
|
+ .align 4
|
|
+
|
|
+$L116:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 1 * SIZE(AO)
|
|
+ LD b1, 1 * SIZE(BO)
|
|
+
|
|
+ ldi L, -1(L)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ ldi BO, 1 * SIZE(BO)
|
|
+ bgt L, $L116
|
|
+ .align 4
|
|
+
|
|
+$L118:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ ADD c01, c02, b5
|
|
+ fmov b5, c01
|
|
+ ADD c03, c04, b5
|
|
+ fmov b5, c03
|
|
+ ADD c01, c03, b5
|
|
+ fmov b5, c01
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+ subl KK, 1, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ addl B, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+#endif
|
|
+
|
|
+#if defined(RN) || defined(RT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -1 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 1 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ SXADDQ K, AORIG, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L119:
|
|
+#ifdef LN
|
|
+ SXADDQ K, B, B
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ mov BO, B
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L999:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+ ldl $9, 64($sp)
+ clr $0
+ ldi $sp, STACKSIZE($sp)
+ ret
+ EPILOGUE
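Editorial note (not part of the patch): the kernel above performs its small per-tile triangular solves with multiply-by-reciprocal updates rather than divisions. As a minimal sketch, assuming the packed diagonal entries already hold reciprocals (which is what the MUL-only solve in the RN path of the $L68 block suggests), the 2x2 column update corresponds to the following plain C; the function and parameter names are illustrative only:

static void trsm_rn_2x2_sketch(double *c01, double *c02,
                               double *c05, double *c06,
                               double b0, double b1, double b3)
{
    /* b0 = BO[0], b1 = BO[1], b3 = BO[3] of the packed 2x2 B block,
       with the diagonal entries assumed to be pre-inverted */
    *c01 *= b0;                      /* scale column 0 by 1/B(0,0) */
    *c02 *= b0;
    *c05 = (*c05 - b1 * *c01) * b3;  /* eliminate B(0,1), then scale by 1/B(1,1) */
    *c06 = (*c06 - b1 * *c02) * b3;
}

The assembly reaches the same result by routing every ADD/MUL result through the b5 scratch register and copying it back with fmov, the pattern visible throughout the kernel above.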
diff --git a/kernel/sw_64/trsm_kernel_4x4_LT.S.bak b/kernel/sw_64/trsm_kernel_4x4_LT.S.bak
new file mode 100644
index 0000000..86136ae
--- /dev/null
+++ b/kernel/sw_64/trsm_kernel_4x4_LT.S.bak
@@ -0,0 +1,4072 @@
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+
+#if !defined(SW2B)
+#error "Architecture is not specified."
+#endif
+
+#ifdef SW2B
+#define PREFETCHSIZE 56
+#define UNOP nop
+#endif
+
+#ifdef EV6
|
|
+#define PREFETCHSIZE 56
|
|
+#define UNOP unop
|
|
+#endif
|
|
+
|
|
+#ifdef EV5
|
|
+#define PREFETCHSIZE 56
|
|
+#define UNOP
|
|
+#endif
|
|
+
|
|
+#ifdef EV4
|
|
+#define UNOP
|
|
+#endif
|
|
+
|
|
+#define STACKSIZE 80
|
|
+
|
|
+#define M $16
|
|
+#define N $17
|
|
+#define K $18
|
|
+#define A $20
|
|
+#define B $21
|
|
+#define C $22
|
|
+#define LDC $23
|
|
+
|
|
+#define C1 $19
|
|
+#define C2 $24
|
|
+#define C3 $25
|
|
+#define C4 $27
|
|
+
|
|
+#define AO $at
|
|
+#define BO $5
|
|
+#define I $6
|
|
+#define J $7
|
|
+#define L $8
|
|
+
|
|
+#define a1 $f16
|
|
+#define a2 $f17
|
|
+#define a3 $f18
|
|
+#define a4 $f19
|
|
+
|
|
+#define b1 $f20
|
|
+#define b2 $f21
|
|
+#define b3 $f22
|
|
+#define b4 $f23
|
|
+
|
|
+#define t1 $f24
|
|
+#define t2 $f25
|
|
+#define t3 $f26
|
|
+#define t4 $f27
|
|
+
|
|
+#define a5 $f28
|
|
+#define a6 $f30
|
|
+#define b5 $f29
|
|
+
|
|
+#define alpha $f30
|
|
+
|
|
+#define c01 $f0
|
|
+#define c02 $f1
|
|
+#define c03 $f2
|
|
+#define c04 $f3
|
|
+
|
|
+#define c05 $f4
|
|
+#define c06 $f5
|
|
+#define c07 $f6
|
|
+#define c08 $f7
|
|
+
|
|
+#define c09 $f8
|
|
+#define c10 $f9
|
|
+#define c11 $f10
|
|
+#define c12 $f11
|
|
+
|
|
+#define c13 $f12
|
|
+#define c14 $f13
|
|
+#define c15 $f14
|
|
+#define c16 $f15
|
|
+
|
|
+#define TMP1 $0
+#define TMP2 $1
+#define KK $2
+#define AORIG $3
+#define OFFSET $4
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, STACKSIZE, $26, 0
+
+ ldi $sp, -STACKSIZE($sp)
+
+ ldl C, 0 + STACKSIZE($sp)
+ ldl LDC, 8 + STACKSIZE($sp)
+ ldl OFFSET, 16 + STACKSIZE($sp)
+
+ SXADDQ LDC, 0, LDC
|
|
+
|
|
+ fstd $f2, 0($sp)
|
|
+ fstd $f3, 8($sp)
|
|
+ fstd $f4, 16($sp)
|
|
+ fstd $f5, 24($sp)
|
|
+ fstd $f6, 32($sp)
|
|
+ fstd $f7, 40($sp)
|
|
+ fstd $f8, 48($sp)
|
|
+ fstd $f9, 56($sp)
|
|
+
|
|
+ cmple M, 0, $0
|
|
+ cmple N, 0, $1
|
|
+ cmple K, 0, $2
|
|
+
|
|
+ or $0, $1, $0
|
|
+ or $0, $2, $0
|
|
+ bne $0, $L999
|
|
+
|
|
+#ifdef LN
|
|
+ mull M, K, TMP1
|
|
+ SXADDQ TMP1, A, A
|
|
+ SXADDQ M, C, C
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ negl OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ mull N, K, TMP1
|
|
+ SXADDQ TMP1, B, B
|
|
+
|
|
+ mull N, LDC, TMP1
|
|
+ addl TMP1, C, C
|
|
+
|
|
+ subl N, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+ sra N, 2, J
|
|
+ ble J, $L40
|
|
+ .align 4
|
|
+
|
|
+$L01:
|
|
+#ifdef RT
|
|
+ sll K, 2 + BASE_SHIFT, TMP1
|
|
+ subl B, TMP1, B
|
|
+
|
|
+ s4addl LDC, 0, TMP1
|
|
+ subl C, TMP1, C
|
|
+#endif
|
|
+
|
|
+ mov C, C1
|
|
+ addl C, LDC, C2
|
|
+ addl C2, LDC, C3
|
|
+#ifndef RT
|
|
+ s4addl LDC, C, C
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ addl C3, LDC, C4
|
|
+ fclr t2
|
|
+
|
|
+#ifdef LN
|
|
+ addl M, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+ mov A, AORIG
|
|
+#else
|
|
+ mov A, AO
|
|
+#endif
|
|
+
|
|
+ sra M, 2, I
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+ ble I, $L20
|
|
+ .align 4
|
|
+
|
|
+$L11:
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c11
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c12
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c16
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c15
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c02
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c06
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c05
|
|
+
|
|
+ fillcs 4 * SIZE(C1)
|
|
+ fclr c03
|
|
+ ldi L, -2(KK)
|
|
+ fclr c04
|
|
+
|
|
+ fillcs 7 * SIZE(C2)
|
|
+ fclr c08
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+ fclr c13
|
|
+
|
|
+ fillcs 4 * SIZE(C3)
|
|
+ fclr c09
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c10
|
|
+
|
|
+ fillcs 7 * SIZE(C4)
|
|
+ fclr c14
|
|
+ fclr c07
|
|
+ ble KK, $L18
|
|
+#else
|
|
+
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 2, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 2, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c11
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c12
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c16
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c15
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c02
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c06
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c05
|
|
+
|
|
+ fillcs 4 * SIZE(C1)
|
|
+ fclr c03
|
|
+ ldi L, -2(TMP1)
|
|
+ fclr c04
|
|
+
|
|
+ fillcs 7 * SIZE(C2)
|
|
+ fclr c08
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ fclr c13
|
|
+
|
|
+ fillcs 4 * SIZE(C3)
|
|
+ fclr c09
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c10
|
|
+
|
|
+ fillcs 7 * SIZE(C4)
|
|
+ fclr c14
|
|
+ fclr c07
|
|
+ ble TMP1, $L18
|
|
+#endif
|
|
+
|
|
+ ble L, $L15
|
|
+ .align 5
|
|
+
|
|
+$L12:
|
|
+/* 1 */
|
|
+ ADD c11, t1, c11
|
|
+#ifndef EV4
|
|
+ fillcs PREFETCHSIZE * SIZE(AO)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+ MUL b1, a1, t1
|
|
+#ifndef EV4
|
|
+ fillcs PREFETCHSIZE * SIZE(BO)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c12, t2, c12
|
|
+ unop
|
|
+ MUL b1, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD c16, t3, c16
|
|
+ unop
|
|
+ MUL b2, a2, t3
|
|
+ LD a5, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c15, t4, c15
|
|
+ unop
|
|
+ MUL b2, a1, t4
|
|
+ LD b5, 0 * SIZE(BO)
|
|
+
|
|
+/* 2 */
|
|
+ ADD c01, t1, c01
|
|
+ UNOP
|
|
+ MUL b1, a3, t1
|
|
+ UNOP
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ UNOP
|
|
+ MUL b1, a4, t2
|
|
+ UNOP
|
|
+
|
|
+ ADD c06, t3, c06
|
|
+ unop
|
|
+ MUL b2, a4, t3
|
|
+ unop
|
|
+
|
|
+ ADD c05, t4, c05
|
|
+ unop
|
|
+ MUL b4, a1, t4
|
|
+ unop
|
|
+
|
|
+/* 3 */
|
|
+ ADD c03, t1, c03
|
|
+ unop
|
|
+ MUL b3, a1, t1
|
|
+ unop
|
|
+
|
|
+ ADD c04, t2, c04
|
|
+ unop
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD c08, t3, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c13, t4, c13
|
|
+ unop
|
|
+ MUL b2, a3, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+/* 4 */
|
|
+ ADD c09, t1, c09
|
|
+ unop
|
|
+ MUL b3, a3, t1
|
|
+ LD a6, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c10, t2, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c14, t3, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ ADD c07, t4, c07
|
|
+ unop
|
|
+ MUL b4, a3, t4
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+
|
|
+/* 5 */
|
|
+ ADD c11, t1, c11
|
|
+ unop
|
|
+ MUL b5, a5, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c12, t2, c12
|
|
+ ldi L, -2(L)
|
|
+ MUL b5, a2, t2
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c16, t3, c16
|
|
+ unop
|
|
+ MUL b2, a2, t3
|
|
+ unop
|
|
+
|
|
+ ADD c15, t4, c15
|
|
+ unop
|
|
+ MUL b2, a5, t4
|
|
+ unop
|
|
+
|
|
+/* 6 */
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL b5, a6, t1
|
|
+ unop
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL b5, a4, t2
|
|
+ unop
|
|
+
|
|
+ ADD c06, t3, c06
|
|
+ unop
|
|
+ MUL b2, a4, t3
|
|
+ unop
|
|
+
|
|
+ ADD c05, t4, c05
|
|
+ unop
|
|
+ MUL b4, a5, t4
|
|
+ unop
|
|
+
|
|
+/* 7 */
|
|
+ ADD c03, t1, c03
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+ MUL b3, a5, t1
|
|
+ unop
|
|
+
|
|
+ ADD c04, t2, c04
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD c08, t3, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, -3 * SIZE(AO)
|
|
+
|
|
+ ADD c13, t4, c13
|
|
+ unop
|
|
+ MUL b2, a6, t4
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+/* 8 */
|
|
+ ADD c09, t1, c09
|
|
+ unop
|
|
+ MUL b3, a6, t1
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c10, t2, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD c14, t3, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c07, t4, c07
|
|
+ MUL b4, a6, t4
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+ bgt L, $L12
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ ADD c11, t1, c11
|
|
+ MUL b1, a1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L17
|
|
+#else
|
|
+ blbs TMP1, $L17
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c12, t2, c12
|
|
+ MUL b1, a2, t2
|
|
+ ADD c16, t3, c16
|
|
+ MUL b2, a2, t3
|
|
+
|
|
+ ADD c15, t4, c15
|
|
+ MUL b2, a1, t4
|
|
+ ADD c01, t1, c01
|
|
+ MUL b1, a3, t1
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL b1, a4, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c06, t3, c06
|
|
+ MUL b2, a4, t3
|
|
+ ADD c05, t4, c05
|
|
+ MUL b4, a1, t4
|
|
+
|
|
+ ADD c03, t1, c03
|
|
+ unop
|
|
+ MUL b3, a1, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t2, c04
|
|
+ unop
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD c08, t3, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c13, t4, c13
|
|
+ unop
|
|
+ MUL b2, a3, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t1, c09
|
|
+ unop
|
|
+ MUL b3, a3, t1
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c10, t2, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c14, t3, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c07, t4, c07
|
|
+ unop
|
|
+ MUL b4, a3, t4
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c11, t1, c11
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL b1, a1, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L17:
|
|
+ ADD c12, t2, c12
|
|
+ MUL b1, a2, t2
|
|
+ ADD c16, t3, c16
|
|
+ MUL b2, a2, t3
|
|
+
|
|
+ ADD c15, t4, c15
|
|
+ MUL b2, a1, t4
|
|
+ ADD c01, t1, c01
|
|
+ MUL b1, a3, t1
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ MUL b1, a4, t2
|
|
+ ADD c06, t3, c06
|
|
+ MUL b2, a4, t3
|
|
+
|
|
+ ADD c05, t4, c05
|
|
+ MUL b4, a1, t4
|
|
+ ADD c03, t1, c03
|
|
+ MUL b3, a1, t1
|
|
+
|
|
+ ADD c04, t2, c04
|
|
+ MUL b3, a2, t2
|
|
+ ADD c08, t3, c08
|
|
+ MUL b4, a2, t3
|
|
+
|
|
+ ADD c13, t4, c13
|
|
+ MUL b2, a3, t4
|
|
+ ADD c09, t1, c09
|
|
+ MUL b3, a3, t1
|
|
+
|
|
+ ADD c10, t2, c10
|
|
+ MUL b3, a4, t2
|
|
+ ADD c14, t3, c14
|
|
+ MUL b4, a4, t3
|
|
+
|
|
+ ADD c07, t4, c07
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL b4, a3, t4
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c11, t1, c11
|
|
+ ADD c12, t2, c12
|
|
+ ADD c16, t3, c16
|
|
+ ADD c15, t4, c15
|
|
+ .align 4
|
|
+
|
|
+$L18:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 4, TMP1
|
|
+#else
|
|
+ subl KK, 4, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -4 * SIZE(AO)
|
|
+ ldi BO, -4 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+ LD b4, 7 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c05, c05
|
|
+ SUB a3, c09, c09
|
|
+ SUB a4, c13, c13
|
|
+
|
|
+ SUB b1, c02, c02
|
|
+ SUB b2, c06, c06
|
|
+ SUB b3, c10, c10
|
|
+ SUB b4, c14, c14
|
|
+
|
|
+ LD a1, 8 * SIZE(BO)
|
|
+ LD a2, 9 * SIZE(BO)
|
|
+ LD a3, 10 * SIZE(BO)
|
|
+ LD a4, 11 * SIZE(BO)
|
|
+
|
|
+ LD b1, 12 * SIZE(BO)
|
|
+ LD b2, 13 * SIZE(BO)
|
|
+ LD b3, 14 * SIZE(BO)
|
|
+ LD b4, 15 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c03, c03
|
|
+ SUB a2, c07, c07
|
|
+ SUB a3, c11, c11
|
|
+ SUB a4, c15, c15
|
|
+
|
|
+ SUB b1, c04, c04
|
|
+ SUB b2, c08, c08
|
|
+ SUB b3, c12, c12
|
|
+ SUB b4, c16, c16
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ LD b1, 4 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(AO)
|
|
+ LD b4, 7 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+ SUB a3, c03, c03
|
|
+ SUB a4, c04, c04
|
|
+
|
|
+ SUB b1, c05, c05
|
|
+ SUB b2, c06, c06
|
|
+ SUB b3, c07, c07
|
|
+ SUB b4, c08, c08
|
|
+
|
|
+ LD a1, 8 * SIZE(AO)
|
|
+ LD a2, 9 * SIZE(AO)
|
|
+ LD a3, 10 * SIZE(AO)
|
|
+ LD a4, 11 * SIZE(AO)
|
|
+
|
|
+ LD b1, 12 * SIZE(AO)
|
|
+ LD b2, 13 * SIZE(AO)
|
|
+ LD b3, 14 * SIZE(AO)
|
|
+ LD b4, 15 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c09, c09
|
|
+ SUB a2, c10, c10
|
|
+ SUB a3, c11, c11
|
|
+ SUB a4, c12, c12
|
|
+
|
|
+ SUB b1, c13, c13
|
|
+ SUB b2, c14, c14
|
|
+ SUB b3, c15, c15
|
|
+ SUB b4, c16, c16
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 15 * SIZE(AO)
|
|
+ LD a2, 14 * SIZE(AO)
|
|
+ LD a3, 13 * SIZE(AO)
|
|
+ LD a4, 12 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c04, c04
|
|
+ MUL a1, c08, c08
|
|
+ MUL a1, c12, c12
|
|
+ MUL a1, c16, c16
|
|
+
|
|
+ MUL a2, c04, t1
|
|
+ MUL a2, c08, t2
|
|
+ MUL a2, c12, t3
|
|
+ MUL a2, c16, t4
|
|
+
|
|
+ SUB c03, t1, c03
|
|
+ SUB c07, t2, c07
|
|
+ SUB c11, t3, c11
|
|
+ SUB c15, t4, c15
|
|
+
|
|
+ MUL a3, c04, t1
|
|
+ MUL a3, c08, t2
|
|
+ MUL a3, c12, t3
|
|
+ MUL a3, c16, t4
|
|
+
|
|
+ SUB c02, t1, c02
|
|
+ SUB c06, t2, c06
|
|
+ SUB c10, t3, c10
|
|
+ SUB c14, t4, c14
|
|
+
|
|
+ MUL a4, c04, t1
|
|
+ MUL a4, c08, t2
|
|
+ MUL a4, c12, t3
|
|
+ MUL a4, c16, t4
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c05, t2, c05
|
|
+ SUB c09, t3, c09
|
|
+ SUB c13, t4, c13
|
|
+
|
|
+ LD b1, 10 * SIZE(AO)
|
|
+ LD b2, 9 * SIZE(AO)
|
|
+ LD b3, 8 * SIZE(AO)
|
|
+
|
|
+ MUL b1, c03, c03
|
|
+ MUL b1, c07, c07
|
|
+ MUL b1, c11, c11
|
|
+ MUL b1, c15, c15
|
|
+
|
|
+ MUL b2, c03, t1
|
|
+ MUL b2, c07, t2
|
|
+ MUL b2, c11, t3
|
|
+ MUL b2, c15, t4
|
|
+
|
|
+ SUB c02, t1, c02
|
|
+ SUB c06, t2, c06
|
|
+ SUB c10, t3, c10
|
|
+ SUB c14, t4, c14
|
|
+
|
|
+ MUL b3, c03, t1
|
|
+ MUL b3, c07, t2
|
|
+ MUL b3, c11, t3
|
|
+ MUL b3, c15, t4
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c05, t2, c05
|
|
+ SUB c09, t3, c09
|
|
+ SUB c13, t4, c13
|
|
+
|
|
+ LD a1, 5 * SIZE(AO)
|
|
+ LD a2, 4 * SIZE(AO)
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c06, c06
|
|
+ MUL a1, c10, c10
|
|
+ MUL a1, c14, c14
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c06, t2
|
|
+ MUL a2, c10, t3
|
|
+ MUL a2, c14, t4
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c05, t2, c05
|
|
+ SUB c09, t3, c09
|
|
+ SUB c13, t4, c13
|
|
+
|
|
+ MUL a3, c01, c01
|
|
+ MUL a3, c05, c05
|
|
+ MUL a3, c09, c09
|
|
+ MUL a3, c13, c13
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c05, c05
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c13, c13
|
|
+
|
|
+ MUL a2, c01, t1
|
|
+ MUL a2, c05, t2
|
|
+ MUL a2, c09, t3
|
|
+ MUL a2, c13, t4
|
|
+
|
|
+ SUB c02, t1, c02
|
|
+ SUB c06, t2, c06
|
|
+ SUB c10, t3, c10
|
|
+ SUB c14, t4, c14
|
|
+
|
|
+ MUL a3, c01, t1
|
|
+ MUL a3, c05, t2
|
|
+ MUL a3, c09, t3
|
|
+ MUL a3, c13, t4
|
|
+
|
|
+ SUB c03, t1, c03
|
|
+ SUB c07, t2, c07
|
|
+ SUB c11, t3, c11
|
|
+ SUB c15, t4, c15
|
|
+
|
|
+ MUL a4, c01, t1
|
|
+ MUL a4, c05, t2
|
|
+ MUL a4, c09, t3
|
|
+ MUL a4, c13, t4
|
|
+
|
|
+ SUB c04, t1, c04
|
|
+ SUB c08, t2, c08
|
|
+ SUB c12, t3, c12
|
|
+ SUB c16, t4, c16
|
|
+
|
|
+ LD b1, 5 * SIZE(AO)
|
|
+ LD b2, 6 * SIZE(AO)
|
|
+ LD b3, 7 * SIZE(AO)
|
|
+
|
|
+ MUL b1, c02, c02
|
|
+ MUL b1, c06, c06
|
|
+ MUL b1, c10, c10
|
|
+ MUL b1, c14, c14
|
|
+
|
|
+ MUL b2, c02, t1
|
|
+ MUL b2, c06, t2
|
|
+ MUL b2, c10, t3
|
|
+ MUL b2, c14, t4
|
|
+
|
|
+ SUB c03, t1, c03
|
|
+ SUB c07, t2, c07
|
|
+ SUB c11, t3, c11
|
|
+ SUB c15, t4, c15
|
|
+
|
|
+ MUL b3, c02, t1
|
|
+ MUL b3, c06, t2
|
|
+ MUL b3, c10, t3
|
|
+ MUL b3, c14, t4
|
|
+
|
|
+ SUB c04, t1, c04
|
|
+ SUB c08, t2, c08
|
|
+ SUB c12, t3, c12
|
|
+ SUB c16, t4, c16
|
|
+
|
|
+ LD a1, 10 * SIZE(AO)
|
|
+ LD a2, 11 * SIZE(AO)
|
|
+ LD a3, 15 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c07, c07
|
|
+ MUL a1, c11, c11
|
|
+ MUL a1, c15, c15
|
|
+
|
|
+ MUL a2, c03, t1
|
|
+ MUL a2, c07, t2
|
|
+ MUL a2, c11, t3
|
|
+ MUL a2, c15, t4
|
|
+
|
|
+ SUB c04, t1, c04
|
|
+ SUB c08, t2, c08
|
|
+ SUB c12, t3, c12
|
|
+ SUB c16, t4, c16
|
|
+
|
|
+ MUL a3, c04, c04
|
|
+ MUL a3, c08, c08
|
|
+ MUL a3, c12, c12
|
|
+ MUL a3, c16, c16
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c04, c04
|
|
+
|
|
+ MUL a2, c01, t1
|
|
+ MUL a2, c02, t2
|
|
+ MUL a2, c03, t3
|
|
+ MUL a2, c04, t4
|
|
+
|
|
+ SUB c05, t1, c05
|
|
+ SUB c06, t2, c06
|
|
+ SUB c07, t3, c07
|
|
+ SUB c08, t4, c08
|
|
+
|
|
+ MUL a3, c01, t1
|
|
+ MUL a3, c02, t2
|
|
+ MUL a3, c03, t3
|
|
+ MUL a3, c04, t4
|
|
+
|
|
+ SUB c09, t1, c09
|
|
+ SUB c10, t2, c10
|
|
+ SUB c11, t3, c11
|
|
+ SUB c12, t4, c12
|
|
+
|
|
+ MUL a4, c01, t1
|
|
+ MUL a4, c02, t2
|
|
+ MUL a4, c03, t3
|
|
+ MUL a4, c04, t4
|
|
+
|
|
+ SUB c13, t1, c13
|
|
+ SUB c14, t2, c14
|
|
+ SUB c15, t3, c15
|
|
+ SUB c16, t4, c16
|
|
+
|
|
+ LD b1, 5 * SIZE(BO)
|
|
+ LD b2, 6 * SIZE(BO)
|
|
+ LD b3, 7 * SIZE(BO)
|
|
+
|
|
+ MUL b1, c05, c05
|
|
+ MUL b1, c06, c06
|
|
+ MUL b1, c07, c07
|
|
+ MUL b1, c08, c08
|
|
+
|
|
+ MUL b2, c05, t1
|
|
+ MUL b2, c06, t2
|
|
+ MUL b2, c07, t3
|
|
+ MUL b2, c08, t4
|
|
+
|
|
+ SUB c09, t1, c09
|
|
+ SUB c10, t2, c10
|
|
+ SUB c11, t3, c11
|
|
+ SUB c12, t4, c12
|
|
+
|
|
+ MUL b3, c05, t1
|
|
+ MUL b3, c06, t2
|
|
+ MUL b3, c07, t3
|
|
+ MUL b3, c08, t4
|
|
+
|
|
+ SUB c13, t1, c13
|
|
+ SUB c14, t2, c14
|
|
+ SUB c15, t3, c15
|
|
+ SUB c16, t4, c16
|
|
+
|
|
+ LD a1, 10 * SIZE(BO)
|
|
+ LD a2, 11 * SIZE(BO)
|
|
+ LD a3, 15 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c10, c10
|
|
+ MUL a1, c11, c11
|
|
+ MUL a1, c12, c12
|
|
+
|
|
+ MUL a2, c09, t1
|
|
+ MUL a2, c10, t2
|
|
+ MUL a2, c11, t3
|
|
+ MUL a2, c12, t4
|
|
+
|
|
+ SUB c13, t1, c13
|
|
+ SUB c14, t2, c14
|
|
+ SUB c15, t3, c15
|
|
+ SUB c16, t4, c16
|
|
+
|
|
+ MUL a3, c13, c13
|
|
+ MUL a3, c14, c14
|
|
+ MUL a3, c15, c15
|
|
+ MUL a3, c16, c16
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 15 * SIZE(BO)
|
|
+ LD a2, 14 * SIZE(BO)
|
|
+ LD a3, 13 * SIZE(BO)
|
|
+ LD a4, 12 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c13, c13
|
|
+ MUL a1, c14, c14
|
|
+ MUL a1, c15, c15
|
|
+ MUL a1, c16, c16
|
|
+
|
|
+ MUL a2, c13, t1
|
|
+ MUL a2, c14, t2
|
|
+ MUL a2, c15, t3
|
|
+ MUL a2, c16, t4
|
|
+
|
|
+ SUB c09, t1, c09
|
|
+ SUB c10, t2, c10
|
|
+ SUB c11, t3, c11
|
|
+ SUB c12, t4, c12
|
|
+
|
|
+ MUL a3, c13, t1
|
|
+ MUL a3, c14, t2
|
|
+ MUL a3, c15, t3
|
|
+ MUL a3, c16, t4
|
|
+
|
|
+ SUB c05, t1, c05
|
|
+ SUB c06, t2, c06
|
|
+ SUB c07, t3, c07
|
|
+ SUB c08, t4, c08
|
|
+
|
|
+ MUL a4, c13, t1
|
|
+ MUL a4, c14, t2
|
|
+ MUL a4, c15, t3
|
|
+ MUL a4, c16, t4
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c02, t2, c02
|
|
+ SUB c03, t3, c03
|
|
+ SUB c04, t4, c04
|
|
+
|
|
+ LD b1, 10 * SIZE(BO)
|
|
+ LD b2, 9 * SIZE(BO)
|
|
+ LD b3, 8 * SIZE(BO)
|
|
+
|
|
+ MUL b1, c09, c09
|
|
+ MUL b1, c10, c10
|
|
+ MUL b1, c11, c11
|
|
+ MUL b1, c12, c12
|
|
+
|
|
+ MUL b2, c09, t1
|
|
+ MUL b2, c10, t2
|
|
+ MUL b2, c11, t3
|
|
+ MUL b2, c12, t4
|
|
+
|
|
+ SUB c05, t1, c05
|
|
+ SUB c06, t2, c06
|
|
+ SUB c07, t3, c07
|
|
+ SUB c08, t4, c08
|
|
+
|
|
+ MUL b3, c09, t1
|
|
+ MUL b3, c10, t2
|
|
+ MUL b3, c11, t3
|
|
+ MUL b3, c12, t4
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c02, t2, c02
|
|
+ SUB c03, t3, c03
|
|
+ SUB c04, t4, c04
|
|
+
|
|
+ LD a1, 5 * SIZE(BO)
|
|
+ LD a2, 4 * SIZE(BO)
|
|
+ LD a3, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c05, c05
|
|
+ MUL a1, c06, c06
|
|
+ MUL a1, c07, c07
|
|
+ MUL a1, c08, c08
|
|
+
|
|
+ MUL a2, c05, t1
|
|
+ MUL a2, c06, t2
|
|
+ MUL a2, c07, t3
|
|
+ MUL a2, c08, t4
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c02, t2, c02
|
|
+ SUB c03, t3, c03
|
|
+ SUB c04, t4, c04
|
|
+
|
|
+ MUL a3, c01, c01
|
|
+ MUL a3, c02, c02
|
|
+ MUL a3, c03, c03
|
|
+ MUL a3, c04, c04
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c05, 1 * SIZE(BO)
|
|
+ ST c09, 2 * SIZE(BO)
|
|
+ ST c13, 3 * SIZE(BO)
|
|
+
|
|
+ ST c02, 4 * SIZE(BO)
|
|
+ ST c06, 5 * SIZE(BO)
|
|
+ ST c10, 6 * SIZE(BO)
|
|
+ ST c14, 7 * SIZE(BO)
|
|
+
|
|
+ ST c03, 8 * SIZE(BO)
|
|
+ ST c07, 9 * SIZE(BO)
|
|
+ ST c11, 10 * SIZE(BO)
|
|
+ ST c15, 11 * SIZE(BO)
|
|
+
|
|
+ ST c04, 12 * SIZE(BO)
|
|
+ ST c08, 13 * SIZE(BO)
|
|
+ ST c12, 14 * SIZE(BO)
|
|
+ ST c16, 15 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c03, 2 * SIZE(AO)
|
|
+ ST c04, 3 * SIZE(AO)
|
|
+
|
|
+ ST c05, 4 * SIZE(AO)
|
|
+ ST c06, 5 * SIZE(AO)
|
|
+ ST c07, 6 * SIZE(AO)
|
|
+ ST c08, 7 * SIZE(AO)
|
|
+
|
|
+ ST c09, 8 * SIZE(AO)
|
|
+ ST c10, 9 * SIZE(AO)
|
|
+ ST c11, 10 * SIZE(AO)
|
|
+ ST c12, 11 * SIZE(AO)
|
|
+
|
|
+ ST c13, 12 * SIZE(AO)
|
|
+ ST c14, 13 * SIZE(AO)
|
|
+ ST c15, 14 * SIZE(AO)
|
|
+ ST c16, 15 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -4 * SIZE(C1)
|
|
+ ldi C2, -4 * SIZE(C2)
|
|
+ ldi C3, -4 * SIZE(C3)
|
|
+ ldi C4, -4 * SIZE(C4)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c03, 2 * SIZE(C1)
|
|
+ ST c04, 3 * SIZE(C1)
|
|
+
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+ ST c06, 1 * SIZE(C2)
|
|
+ ST c07, 2 * SIZE(C2)
|
|
+ ST c08, 3 * SIZE(C2)
|
|
+
|
|
+ ST c09, 0 * SIZE(C3)
|
|
+ ST c10, 1 * SIZE(C3)
|
|
+ ST c11, 2 * SIZE(C3)
|
|
+ ST c12, 3 * SIZE(C3)
|
|
+
|
|
+ ST c13, 0 * SIZE(C4)
|
|
+ ST c14, 1 * SIZE(C4)
|
|
+ ST c15, 2 * SIZE(C4)
|
|
+ ST c16, 3 * SIZE(C4)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+ ldi C2, 4 * SIZE(C2)
|
|
+ ldi C3, 4 * SIZE(C3)
|
|
+ ldi C4, 4 * SIZE(C4)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 2 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ addl BO, TMP1, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ bgt I, $L11
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ and M, 2, I
|
|
+ ble I, $L30
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c10
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c14
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ ldi L, -2(KK)
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c05
|
|
+
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+ fclr c02
|
|
+ fclr c06
|
|
+ ble KK, $L28
|
|
+
|
|
+ ble L, $L25
|
|
+
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 1, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 2, TMP2
|
|
+ addl B, TMP2, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c10
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c14
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ ldi L, -2(TMP1)
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c05
|
|
+
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ fclr c02
|
|
+ fclr c06
|
|
+ ble TMP1, $L28
|
|
+
|
|
+ ble L, $L25
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ ADD c09, t1, c09
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD c10, t2, c10
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t3, c13
|
|
+ unop
|
|
+ MUL a1, b2, t3
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+
|
|
+ ADD c14, t4, c14
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD b2, -7 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ unop
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ LD b3, -6 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, c05
|
|
+ unop
|
|
+ MUL a1, b4, t3
|
|
+ LD a1, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, c06
|
|
+ MUL a2, b4, t4
|
|
+ LD b5, -5 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t1, c09
|
|
+ unop
|
|
+ MUL a3, b1, t1
|
|
+ LD a2, 3 * SIZE(AO)
|
|
+
|
|
+ ADD c10, t2, c10
|
|
+ unop
|
|
+ MUL a4, b1, t2
|
|
+ LD b1, -4 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t3, c13
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c14, t4, c14
|
|
+ MUL a4, b2, t4
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ ldi L, -2(L)
|
|
+ MUL a3, b3, t1
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a4, b3, t2
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, c05
|
|
+ unop
|
|
+ MUL a3, b5, t3
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, c06
|
|
+ MUL a4, b5, t4
|
|
+ LD a4, 1 * SIZE(AO)
|
|
+ bgt L, $L22
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ ADD c09, t1, c09
|
|
+ MUL a1, b1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L27
|
|
+#else
|
|
+ blbs TMP1, $L27
|
|
+#endif
|
|
+
|
|
+ ADD c10, t2, c10
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t3, c13
|
|
+ unop
|
|
+ MUL a1, b2, t3
|
|
+ unop
|
|
+
|
|
+ ADD c14, t4, c14
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, c05
|
|
+ unop
|
|
+ MUL a1, b4, t3
|
|
+ LD a1, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, c06
|
|
+ unop
|
|
+ MUL a2, b4, t4
|
|
+ LD a2, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c09, t1, c09
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L27:
|
|
+ ADD c10, t2, c10
|
|
+ MUL a2, b1, t2
|
|
+ ADD c13, t3, c13
|
|
+ MUL a1, b2, t3
|
|
+
|
|
+ ADD c14, t4, c14
|
|
+ MUL a2, b2, t4
|
|
+ ADD c01, t1, c01
|
|
+ MUL a1, b3, t1
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ MUL a2, b3, t2
|
|
+ ADD c05, t3, c05
|
|
+ MUL a1, b4, t3
|
|
+
|
|
+ ADD c06, t4, c06
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a2, b4, t4
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t1, c09
|
|
+ ADD c10, t2, c10
|
|
+ ADD c13, t3, c13
|
|
+ ADD c14, t4, c14
|
|
+ .align 4
|
|
+
|
|
+$L28:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 2, TMP1
|
|
+#else
|
|
+ subl KK, 4, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -2 * SIZE(AO)
|
|
+ ldi BO, -4 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+ LD b4, 7 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c05, c05
|
|
+ SUB a3, c09, c09
|
|
+ SUB a4, c13, c13
|
|
+
|
|
+ SUB b1, c02, c02
|
|
+ SUB b2, c06, c06
|
|
+ SUB b3, c10, c10
|
|
+ SUB b4, c14, c14
|
|
+
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ LD b1, 4 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(AO)
|
|
+ LD b4, 7 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+ SUB a3, c05, c05
|
|
+ SUB a4, c06, c06
|
|
+
|
|
+ SUB b1, c09, c09
|
|
+ SUB b2, c10, c10
|
|
+ SUB b3, c13, c13
|
|
+ SUB b4, c14, c14
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 3 * SIZE(AO)
|
|
+ LD a2, 2 * SIZE(AO)
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c06, c06
|
|
+ MUL a1, c10, c10
|
|
+ MUL a1, c14, c14
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c06, t2
|
|
+ MUL a2, c10, t3
|
|
+ MUL a2, c14, t4
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c05, t2, c05
|
|
+ SUB c09, t3, c09
|
|
+ SUB c13, t4, c13
|
|
+
|
|
+ MUL a3, c01, c01
|
|
+ MUL a3, c05, c05
|
|
+ MUL a3, c09, c09
|
|
+ MUL a3, c13, c13
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c05, c05
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c13, c13
|
|
+
|
|
+ MUL a2, c01, t1
|
|
+ MUL a2, c05, t2
|
|
+ MUL a2, c09, t3
|
|
+ MUL a2, c13, t4
|
|
+
|
|
+ SUB c02, t1, c02
|
|
+ SUB c06, t2, c06
|
|
+ SUB c10, t3, c10
|
|
+ SUB c14, t4, c14
|
|
+
|
|
+ MUL a3, c02, c02
|
|
+ MUL a3, c06, c06
|
|
+ MUL a3, c10, c10
|
|
+ MUL a3, c14, c14
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+
|
|
+ MUL a2, c01, t1
|
|
+ MUL a2, c02, t2
|
|
+
|
|
+ SUB c05, t1, c05
|
|
+ SUB c06, t2, c06
|
|
+
|
|
+ MUL a3, c01, t1
|
|
+ MUL a3, c02, t2
|
|
+
|
|
+ SUB c09, t1, c09
|
|
+ SUB c10, t2, c10
|
|
+
|
|
+ MUL a4, c01, t1
|
|
+ MUL a4, c02, t2
|
|
+
|
|
+ SUB c13, t1, c13
|
|
+ SUB c14, t2, c14
|
|
+
|
|
+ LD b1, 5 * SIZE(BO)
|
|
+ LD b2, 6 * SIZE(BO)
|
|
+ LD b3, 7 * SIZE(BO)
|
|
+
|
|
+ MUL b1, c05, c05
|
|
+ MUL b1, c06, c06
|
|
+
|
|
+ MUL b2, c05, t1
|
|
+ MUL b2, c06, t2
|
|
+
|
|
+ SUB c09, t1, c09
|
|
+ SUB c10, t2, c10
|
|
+
|
|
+ MUL b3, c05, t1
|
|
+ MUL b3, c06, t2
|
|
+
|
|
+ SUB c13, t1, c13
|
|
+ SUB c14, t2, c14
|
|
+
|
|
+ LD a1, 10 * SIZE(BO)
|
|
+ LD a2, 11 * SIZE(BO)
|
|
+ LD a3, 15 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c10, c10
|
|
+
|
|
+ MUL a2, c09, t1
|
|
+ MUL a2, c10, t2
|
|
+
|
|
+ SUB c13, t1, c13
|
|
+ SUB c14, t2, c14
|
|
+
|
|
+ MUL a3, c13, c13
|
|
+ MUL a3, c14, c14
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 15 * SIZE(BO)
|
|
+ LD a2, 14 * SIZE(BO)
|
|
+ LD a3, 13 * SIZE(BO)
|
|
+ LD a4, 12 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c13, c13
|
|
+ MUL a1, c14, c14
|
|
+
|
|
+ MUL a2, c13, t1
|
|
+ MUL a2, c14, t2
|
|
+
|
|
+ SUB c09, t1, c09
|
|
+ SUB c10, t2, c10
|
|
+
|
|
+ MUL a3, c13, t1
|
|
+ MUL a3, c14, t2
|
|
+
|
|
+ SUB c05, t1, c05
|
|
+ SUB c06, t2, c06
|
|
+
|
|
+ MUL a4, c13, t1
|
|
+ MUL a4, c14, t2
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c02, t2, c02
|
|
+
|
|
+ LD b1, 10 * SIZE(BO)
|
|
+ LD b2, 9 * SIZE(BO)
|
|
+ LD b3, 8 * SIZE(BO)
|
|
+
|
|
+ MUL b1, c09, c09
|
|
+ MUL b1, c10, c10
|
|
+
|
|
+ MUL b2, c09, t1
|
|
+ MUL b2, c10, t2
|
|
+
|
|
+ SUB c05, t1, c05
|
|
+ SUB c06, t2, c06
|
|
+
|
|
+ MUL b3, c09, t1
|
|
+ MUL b3, c10, t2
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c02, t2, c02
|
|
+
|
|
+ LD a1, 5 * SIZE(BO)
|
|
+ LD a2, 4 * SIZE(BO)
|
|
+ LD a3, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c05, c05
|
|
+ MUL a1, c06, c06
|
|
+
|
|
+ MUL a2, c05, t1
|
|
+ MUL a2, c06, t2
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c02, t2, c02
|
|
+
|
|
+ MUL a3, c01, c01
|
|
+ MUL a3, c02, c02
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c05, 1 * SIZE(BO)
|
|
+ ST c09, 2 * SIZE(BO)
|
|
+ ST c13, 3 * SIZE(BO)
|
|
+
|
|
+ ST c02, 4 * SIZE(BO)
|
|
+ ST c06, 5 * SIZE(BO)
|
|
+ ST c10, 6 * SIZE(BO)
|
|
+ ST c14, 7 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c05, 2 * SIZE(AO)
|
|
+ ST c06, 3 * SIZE(AO)
|
|
+
|
|
+ ST c09, 4 * SIZE(AO)
|
|
+ ST c10, 5 * SIZE(AO)
|
|
+ ST c13, 6 * SIZE(AO)
|
|
+ ST c14, 7 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -2 * SIZE(C1)
|
|
+ ldi C2, -2 * SIZE(C2)
|
|
+ ldi C3, -2 * SIZE(C3)
|
|
+ ldi C4, -2 * SIZE(C4)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+ ST c06, 1 * SIZE(C2)
|
|
+
|
|
+ ST c09, 0 * SIZE(C3)
|
|
+ ST c10, 1 * SIZE(C3)
|
|
+ ST c13, 0 * SIZE(C4)
|
|
+ ST c14, 1 * SIZE(C4)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 2 * SIZE(C1)
|
|
+ ldi C2, 2 * SIZE(C2)
|
|
+ ldi C3, 2 * SIZE(C3)
|
|
+ ldi C4, 2 * SIZE(C4)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 1 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L30:
|
|
+ and M, 1, I
|
|
+ ble I, $L39
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ ldi L, -2(KK)
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c09
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c13
|
|
+
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+ ble KK, $L38
|
|
+
|
|
+ ble L, $L35
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 0, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 2, TMP2
|
|
+ addl B, TMP2, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ ldi L, -2(TMP1)
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c09
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c13
|
|
+
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ ble TMP1, $L38
|
|
+
|
|
+ ble L, $L35
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L32:
|
|
+ ADD c01, t1, c01
|
|
+ ldi L, -2(L)
|
|
+ MUL a1, b1, t1
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t2, c05
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a1, b2, t2
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t3, c09
|
|
+ LD b5, 3 * SIZE(BO)
|
|
+ MUL a1, b3, t3
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t4, c13
|
|
+ MUL a1, b4, t4
|
|
+ LD a1, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ MUL a2, b1, t1
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t2, c05
|
|
+ MUL a2, b2, t2
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t3, c09
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+ MUL a2, b3, t3
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t4, c13
|
|
+ MUL a2, b5, t4
|
|
+ LD a2, 0 * SIZE(AO)
|
|
+ bgt L, $L32
|
|
+ .align 4
|
|
+
|
|
+$L35:
|
|
+ ADD c01, t1, c01
|
|
+ MUL a1, b1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L37
|
|
+#else
|
|
+ blbs TMP1, $L37
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c05, t2, c05
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ MUL a1, b2, t2
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t3, c09
|
|
+ MUL a1, b3, t3
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t4, c13
|
|
+ MUL a1, b4, t4
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L37:
|
|
+ ADD c05, t2, c05
|
|
+ MUL a1, b2, t2
|
|
+ ADD c09, t3, c09
|
|
+ MUL a1, b3, t3
|
|
+
|
|
+ ADD c13, t4, c13
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ MUL a1, b4, t4
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ ADD c05, t2, c05
|
|
+ ADD c09, t3, c09
|
|
+ ADD c13, t4, c13
|
|
+
|
|
+$L38:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 1, TMP1
|
|
+#else
|
|
+ subl KK, 4, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -1 * SIZE(AO)
|
|
+ ldi BO, -4 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c05, c05
|
|
+ SUB a3, c09, c09
|
|
+ SUB a4, c13, c13
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c05, c05
|
|
+ SUB a3, c09, c09
|
|
+ SUB a4, c13, c13
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c05, c05
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c13, c13
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a2, c01, t1
|
|
+ SUB c05, t1, c05
|
|
+ MUL a3, c01, t1
|
|
+ SUB c09, t1, c09
|
|
+ MUL a4, c01, t1
|
|
+ SUB c13, t1, c13
|
|
+
|
|
+ LD b1, 5 * SIZE(BO)
|
|
+ LD b2, 6 * SIZE(BO)
|
|
+ LD b3, 7 * SIZE(BO)
|
|
+
|
|
+ MUL b1, c05, c05
|
|
+ MUL b2, c05, t1
|
|
+ SUB c09, t1, c09
|
|
+ MUL b3, c05, t1
|
|
+ SUB c13, t1, c13
|
|
+
|
|
+ LD a1, 10 * SIZE(BO)
|
|
+ LD a2, 11 * SIZE(BO)
|
|
+ LD a3, 15 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c09, c09
|
|
+ MUL a2, c09, t1
|
|
+ SUB c13, t1, c13
|
|
+ MUL a3, c13, c13
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 15 * SIZE(BO)
|
|
+ LD a2, 14 * SIZE(BO)
|
|
+ LD a3, 13 * SIZE(BO)
|
|
+ LD a4, 12 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c13, c13
|
|
+ MUL a2, c13, t1
|
|
+ SUB c09, t1, c09
|
|
+ MUL a3, c13, t1
|
|
+ SUB c05, t1, c05
|
|
+ MUL a4, c13, t1
|
|
+ SUB c01, t1, c01
|
|
+
|
|
+ LD b1, 10 * SIZE(BO)
|
|
+ LD b2, 9 * SIZE(BO)
|
|
+ LD b3, 8 * SIZE(BO)
|
|
+
|
|
+ MUL b1, c09, c09
|
|
+ MUL b2, c09, t1
|
|
+ SUB c05, t1, c05
|
|
+ MUL b3, c09, t1
|
|
+ SUB c01, t1, c01
|
|
+
|
|
+ LD a1, 5 * SIZE(BO)
|
|
+ LD a2, 4 * SIZE(BO)
|
|
+ LD a3, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c05, c05
|
|
+ MUL a2, c05, t1
|
|
+ SUB c01, t1, c01
|
|
+ MUL a3, c01, c01
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c05, 1 * SIZE(BO)
|
|
+ ST c09, 2 * SIZE(BO)
|
|
+ ST c13, 3 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c05, 1 * SIZE(AO)
|
|
+ ST c09, 2 * SIZE(AO)
|
|
+ ST c13, 3 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -1 * SIZE(C1)
|
|
+ ldi C2, -1 * SIZE(C2)
|
|
+ ldi C3, -1 * SIZE(C3)
|
|
+ ldi C4, -1 * SIZE(C4)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+ ST c09, 0 * SIZE(C3)
|
|
+ ST c13, 0 * SIZE(C4)
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 0 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L39:
|
|
+#ifdef LN
|
|
+ sll K, 2 + BASE_SHIFT, TMP1
|
|
+ addl B, TMP1, B
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ mov BO, B
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ addl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ subl KK, 4, KK
|
|
+#endif
|
|
+ ldi J, -1(J)
|
|
+ bgt J, $L01
|
|
+ .align 4
|
|
+
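+/* Column tail: when N & 2 is set, solve the remaining pair of columns here. */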
|
|
+$L40:
|
|
+ and N, 2, J
|
|
+ ble J, $L80
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 1 + BASE_SHIFT, TMP1
|
|
+ subl B, TMP1, B
|
|
+
|
|
+ addl LDC, LDC, TMP1
|
|
+ subl C, TMP1, C
|
|
+#endif
|
|
+
|
|
+ mov C, C1
|
|
+ addl C, LDC, C2
|
|
+ fclr t1
|
|
+#ifndef RT
|
|
+ addl C2, LDC, C
|
|
+#endif
|
|
+ fclr t2
|
|
+
|
|
+#ifdef LN
|
|
+ addl M, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+ mov A, AORIG
|
|
+#else
|
|
+ mov A, AO
|
|
+#endif
|
|
+
|
|
+ sra M, 2, I
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+ ble I, $L60
|
|
+ .align 4
|
|
+
|
|
+$L51:
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c03
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c07
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c04
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c08
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c05
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c06
|
|
+
|
|
+ ldi L, -2(KK)
|
|
+
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ble KK, $L58
|
|
+
|
|
+ ble L, $L55
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 2, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 2, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c03
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c07
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c04
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c08
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c05
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c06
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ble TMP1, $L58
|
|
+
|
|
+ ble L, $L55
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L52:
|
|
+ ADD c05, t1, c05
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD c06, t2, c06
|
|
+ ldi L, -2(L)
|
|
+ MUL a2, b1, t2
|
|
+ unop
|
|
+
|
|
+ ADD c07, t3, c07
|
|
+ unop
|
|
+ MUL a3, b1, t3
|
|
+ unop
|
|
+
|
|
+ ADD c08, t4, c08
|
|
+ unop
|
|
+ MUL a4, b1, t4
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b2, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ MUL a2, b2, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ unop
|
|
+ MUL a4, b2, t4
|
|
+ LD a5, 3 * SIZE(AO)
|
|
+
|
|
+ ADD c05, t1, c05
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ LD b2, -1 * SIZE(BO)
|
|
+
|
|
+ ADD c06, t2, c06
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ unop
|
|
+
|
|
+ ADD c07, t3, c07
|
|
+ unop
|
|
+ MUL a3, b3, t3
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+
|
|
+ ADD c08, t4, c08
|
|
+ unop
|
|
+ MUL a5, b3, t4
|
|
+ LD b3, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b4, t1
|
|
+ LD a1, -4 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b4, t2
|
|
+ LD a2, -3 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+ MUL a3, b4, t3
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a5, b4, t4
|
|
+ LD b4, 1 * SIZE(BO)
|
|
+ bgt L, $L52
|
|
+ .align 4
|
|
+
|
|
+$L55:
|
|
+ ADD c05, t1, c05
|
|
+ MUL a1, b1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L57
|
|
+#else
|
|
+ blbs TMP1, $L57
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c06, t2, c06
|
|
+ MUL a2, b1, t2
|
|
+ ADD c07, t3, c07
|
|
+ MUL a3, b1, t3
|
|
+
|
|
+ ADD c08, t4, c08
|
|
+ unop
|
|
+ MUL a4, b1, t4
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b2, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b2, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a4, b2, t4
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c05, t1, c05
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L57:
|
|
+ ADD c06, t2, c06
|
|
+ MUL a2, b1, t2
|
|
+ ADD c07, t3, c07
|
|
+ MUL a3, b1, t3
|
|
+
|
|
+ ADD c08, t4, c08
|
|
+ MUL a4, b1, t4
|
|
+ ADD c01, t1, c01
|
|
+ MUL a1, b2, t1
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ MUL a2, b2, t2
|
|
+ ADD c03, t3, c03
|
|
+ MUL a3, b2, t3
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL a4, b2, t4
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t1, c05
|
|
+ ADD c06, t2, c06
|
|
+ ADD c07, t3, c07
|
|
+ ADD c08, t4, c08
|
|
+ .align 4
|
|
+
|
|
+$L58:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 4, TMP1
|
|
+#else
|
|
+ subl KK, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -4 * SIZE(AO)
|
|
+ ldi BO, -2 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+ LD b4, 7 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c05, c05
|
|
+ SUB a3, c02, c02
|
|
+ SUB a4, c06, c06
|
|
+
|
|
+ SUB b1, c03, c03
|
|
+ SUB b2, c07, c07
|
|
+ SUB b3, c04, c04
|
|
+ SUB b4, c08, c08
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ LD b1, 4 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(AO)
|
|
+ LD b4, 7 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+ SUB a3, c03, c03
|
|
+ SUB a4, c04, c04
|
|
+
|
|
+ SUB b1, c05, c05
|
|
+ SUB b2, c06, c06
|
|
+ SUB b3, c07, c07
|
|
+ SUB b4, c08, c08
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 15 * SIZE(AO)
|
|
+ LD a2, 14 * SIZE(AO)
|
|
+ LD a3, 13 * SIZE(AO)
|
|
+ LD a4, 12 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c04, c04
|
|
+ MUL a1, c08, c08
|
|
+
|
|
+ MUL a2, c04, t1
|
|
+ MUL a2, c08, t2
|
|
+
|
|
+ SUB c03, t1, c03
|
|
+ SUB c07, t2, c07
|
|
+
|
|
+ MUL a3, c04, t1
|
|
+ MUL a3, c08, t2
|
|
+
|
|
+ SUB c02, t1, c02
|
|
+ SUB c06, t2, c06
|
|
+
|
|
+ MUL a4, c04, t1
|
|
+ MUL a4, c08, t2
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c05, t2, c05
|
|
+
|
|
+ LD b1, 10 * SIZE(AO)
|
|
+ LD b2, 9 * SIZE(AO)
|
|
+ LD b3, 8 * SIZE(AO)
|
|
+
|
|
+ MUL b1, c03, c03
|
|
+ MUL b1, c07, c07
|
|
+
|
|
+ MUL b2, c03, t1
|
|
+ MUL b2, c07, t2
|
|
+
|
|
+ SUB c02, t1, c02
|
|
+ SUB c06, t2, c06
|
|
+
|
|
+ MUL b3, c03, t1
|
|
+ MUL b3, c07, t2
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c05, t2, c05
|
|
+
|
|
+ LD a1, 5 * SIZE(AO)
|
|
+ LD a2, 4 * SIZE(AO)
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c06, c06
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c06, t2
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c05, t2, c05
|
|
+
|
|
+ MUL a3, c01, c01
|
|
+ MUL a3, c05, c05
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c05, c05
|
|
+
|
|
+ MUL a2, c01, t1
|
|
+ MUL a2, c05, t2
|
|
+
|
|
+ SUB c02, t1, c02
|
|
+ SUB c06, t2, c06
|
|
+
|
|
+ MUL a3, c01, t1
|
|
+ MUL a3, c05, t2
|
|
+
|
|
+ SUB c03, t1, c03
|
|
+ SUB c07, t2, c07
|
|
+
|
|
+ MUL a4, c01, t1
|
|
+ MUL a4, c05, t2
|
|
+
|
|
+ SUB c04, t1, c04
|
|
+ SUB c08, t2, c08
|
|
+
|
|
+ LD b1, 5 * SIZE(AO)
|
|
+ LD b2, 6 * SIZE(AO)
|
|
+ LD b3, 7 * SIZE(AO)
|
|
+
|
|
+ MUL b1, c02, c02
|
|
+ MUL b1, c06, c06
|
|
+
|
|
+ MUL b2, c02, t1
|
|
+ MUL b2, c06, t2
|
|
+
|
|
+ SUB c03, t1, c03
|
|
+ SUB c07, t2, c07
|
|
+
|
|
+ MUL b3, c02, t1
|
|
+ MUL b3, c06, t2
|
|
+
|
|
+ SUB c04, t1, c04
|
|
+ SUB c08, t2, c08
|
|
+
|
|
+ LD a1, 10 * SIZE(AO)
|
|
+ LD a2, 11 * SIZE(AO)
|
|
+ LD a3, 15 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c07, c07
|
|
+
|
|
+ MUL a2, c03, t1
|
|
+ MUL a2, c07, t2
|
|
+
|
|
+ SUB c04, t1, c04
|
|
+ SUB c08, t2, c08
|
|
+
|
|
+ MUL a3, c04, c04
|
|
+ MUL a3, c08, c08
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c04, c04
|
|
+
|
|
+ MUL a2, c01, t1
|
|
+ MUL a2, c02, t2
|
|
+ MUL a2, c03, t3
|
|
+ MUL a2, c04, t4
|
|
+
|
|
+ SUB c05, t1, c05
|
|
+ SUB c06, t2, c06
|
|
+ SUB c07, t3, c07
|
|
+ SUB c08, t4, c08
|
|
+
|
|
+ MUL a3, c05, c05
|
|
+ MUL a3, c06, c06
|
|
+ MUL a3, c07, c07
|
|
+ MUL a3, c08, c08
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 3 * SIZE(BO)
|
|
+ LD a2, 2 * SIZE(BO)
|
|
+ LD a3, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c05, c05
|
|
+ MUL a1, c06, c06
|
|
+ MUL a1, c07, c07
|
|
+ MUL a1, c08, c08
|
|
+
|
|
+ MUL a2, c05, t1
|
|
+ MUL a2, c06, t2
|
|
+ MUL a2, c07, t3
|
|
+ MUL a2, c08, t4
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c02, t2, c02
|
|
+ SUB c03, t3, c03
|
|
+ SUB c04, t4, c04
|
|
+
|
|
+ MUL a3, c01, c01
|
|
+ MUL a3, c02, c02
|
|
+ MUL a3, c03, c03
|
|
+ MUL a3, c04, c04
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c05, 1 * SIZE(BO)
|
|
+ ST c02, 2 * SIZE(BO)
|
|
+ ST c06, 3 * SIZE(BO)
|
|
+
|
|
+ ST c03, 4 * SIZE(BO)
|
|
+ ST c07, 5 * SIZE(BO)
|
|
+ ST c04, 6 * SIZE(BO)
|
|
+ ST c08, 7 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c03, 2 * SIZE(AO)
|
|
+ ST c04, 3 * SIZE(AO)
|
|
+
|
|
+ ST c05, 4 * SIZE(AO)
|
|
+ ST c06, 5 * SIZE(AO)
|
|
+ ST c07, 6 * SIZE(AO)
|
|
+ ST c08, 7 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -4 * SIZE(C1)
|
|
+ ldi C2, -4 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c03, 2 * SIZE(C1)
|
|
+ ST c04, 3 * SIZE(C1)
|
|
+
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+ ST c06, 1 * SIZE(C2)
|
|
+ ST c07, 2 * SIZE(C2)
|
|
+ ST c08, 3 * SIZE(C2)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+ ldi C2, 4 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 2 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ bgt I, $L51
|
|
+ .align 4
|
|
+
|
|
+$L60:
|
|
+ and M, 2, I
|
|
+ ble I, $L70
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ ldi L, -2(KK)
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+
|
|
+ ble KK, $L68
|
|
+
|
|
+ ble L, $L65
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 1, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ ldi L, -2(TMP1)
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ble TMP1, $L68
|
|
+
|
|
+ ble L, $L65
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L62:
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, c05
|
|
+ ldi L, -2(L)
|
|
+ MUL a1, b2, t3
|
|
+ LD a1, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, c06
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD a2, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ LD b2, 3 * SIZE(BO)
|
|
+ MUL a3, b3, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a4, b3, t2
|
|
+ LD b3, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, c05
|
|
+ unop
|
|
+ MUL a3, b4, t3
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, c06
|
|
+ MUL a4, b4, t4
|
|
+ LD b4, 1 * SIZE(BO)
|
|
+ unop
|
|
+
|
|
+ LD a4, 1 * SIZE(AO)
|
|
+ unop
|
|
+ unop
|
|
+ bgt L, $L62
|
|
+ .align 4
|
|
+
|
|
+$L65:
|
|
+ ADD c01, t1, c01
|
|
+ MUL a1, b1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L67
|
|
+#else
|
|
+ blbs TMP1, $L67
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, c05
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ MUL a1, b2, t3
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, c06
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ LD b2, -1 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ .align 4
|
|
+
|
|
+$L67:
|
|
+ ADD c02, t2, c02
|
|
+ MUL a2, b1, t2
|
|
+ ADD c05, t3, c05
|
|
+ MUL a1, b2, t3
|
|
+
|
|
+ ADD c06, t4, c06
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a2, b2, t4
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ ADD c02, t2, c02
|
|
+ ADD c05, t3, c05
|
|
+ ADD c06, t4, c06
|
|
+ .align 4
|
|
+
|
|
+$L68:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 2, TMP1
|
|
+#else
|
|
+ subl KK, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -2 * SIZE(AO)
|
|
+ ldi BO, -2 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c05, c05
|
|
+ SUB a3, c02, c02
|
|
+ SUB a4, c06, c06
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+ SUB a3, c05, c05
|
|
+ SUB a4, c06, c06
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 3 * SIZE(AO)
|
|
+ LD a2, 2 * SIZE(AO)
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c06, c06
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c06, t2
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c05, t2, c05
|
|
+
|
|
+ MUL a3, c01, c01
|
|
+ MUL a3, c05, c05
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c05, c05
|
|
+
|
|
+ MUL a2, c01, t1
|
|
+ MUL a2, c05, t2
|
|
+
|
|
+ SUB c02, t1, c02
|
|
+ SUB c06, t2, c06
|
|
+
|
|
+ MUL a3, c02, c02
|
|
+ MUL a3, c06, c06
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+
|
|
+ MUL a2, c01, t1
|
|
+ MUL a2, c02, t2
|
|
+
|
|
+ SUB c05, t1, c05
|
|
+ SUB c06, t2, c06
|
|
+
|
|
+ MUL a3, c05, c05
|
|
+ MUL a3, c06, c06
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 3 * SIZE(BO)
|
|
+ LD a2, 2 * SIZE(BO)
|
|
+ LD a3, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c05, c05
|
|
+ MUL a1, c06, c06
|
|
+
|
|
+ MUL a2, c05, t1
|
|
+ MUL a2, c06, t2
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c02, t2, c02
|
|
+
|
|
+ MUL a3, c01, c01
|
|
+ MUL a3, c02, c02
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c05, 1 * SIZE(BO)
|
|
+ ST c02, 2 * SIZE(BO)
|
|
+ ST c06, 3 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c05, 2 * SIZE(AO)
|
|
+ ST c06, 3 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -2 * SIZE(C1)
|
|
+ ldi C2, -2 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+ ST c06, 1 * SIZE(C2)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 2 * SIZE(C1)
|
|
+ ldi C2, 2 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 1 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L70:
|
|
+ and M, 1, I
|
|
+ ble I, $L79
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c06
|
|
+
|
|
+ ldi L, -2(KK)
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+
|
|
+ ble KK, $L78
|
|
+
|
|
+ ble L, $L75
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 0, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c06
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ble TMP1, $L78
|
|
+
|
|
+ ble L, $L75
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L72:
|
|
+ ADD c01, t1, c01
|
|
+ ldi L, -2(L)
|
|
+ MUL a1, b1, t1
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t2, c05
|
|
+ MUL a1, b2, t2
|
|
+ LD a1, 1 * SIZE(AO)
|
|
+ LD b2, 3 * SIZE(BO)
|
|
+
|
|
+ ADD c02, t3, c02
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a2, b3, t3
|
|
+ LD b3, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c06, t4, c06
|
|
+ MUL a2, b4, t4
|
|
+ LD a2, 0 * SIZE(AO)
|
|
+ LD b4, 5 * SIZE(BO)
|
|
+
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ unop
|
|
+ unop
|
|
+ bgt L, $L72
|
|
+ .align 4
|
|
+
|
|
+$L75:
|
|
+ ADD c01, t1, c01
|
|
+ MUL a1, b1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L77
|
|
+#else
|
|
+ blbs TMP1, $L77
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c05, t2, c05
|
|
+ MUL a1, b2, t2
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L77:
|
|
+ ADD c05, t2, c05
|
|
+ MUL a1, b2, t2
|
|
+ ADD c02, t3, c02
|
|
+ ADD c06, t4, c06
|
|
+
|
|
+ ADD c01, c02, c01
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ ADD c05, c06, c05
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ ADD c05, t2, c05
|
|
+
|
|
+ .align 4
|
|
+
|
|
+$L78:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 1, TMP1
|
|
+#else
|
|
+ subl KK, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -1 * SIZE(AO)
|
|
+ ldi BO, -2 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c05, c05
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c05, c05
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c05, c05
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a2, c01, t1
|
|
+ SUB c05, t1, c05
|
|
+ MUL a3, c05, c05
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 3 * SIZE(BO)
|
|
+ LD a2, 2 * SIZE(BO)
|
|
+ LD a3, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c05, c05
|
|
+ MUL a2, c05, t1
|
|
+ SUB c01, t1, c01
|
|
+ MUL a3, c01, c01
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c05, 1 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c05, 1 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -1 * SIZE(C1)
|
|
+ ldi C2, -1 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 0 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L79:
|
|
+#ifdef LN
|
|
+ sll K, 1 + BASE_SHIFT, TMP1
|
|
+ addl B, TMP1, B
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ mov BO, B
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
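+/* Column tail: when N & 1 is set, solve the last single column. */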
|
|
+$L80:
|
|
+ and N, 1, J
|
|
+ ble J, $L999
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, BASE_SHIFT, TMP1
|
|
+ subl B, TMP1, B
|
|
+
|
|
+ subl C, LDC, C
|
|
+#endif
|
|
+
|
|
+ mov C, C1
|
|
+#ifndef RT
|
|
+ addl C, LDC, C
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ addl M, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+ mov A, AORIG
|
|
+#else
|
|
+ mov A, AO
|
|
+#endif
|
|
+
|
|
+ sra M, 2, I
|
|
+ ble I, $L100
|
|
+ .align 4
|
|
+
|
|
+$L91:
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c04
|
|
+
|
|
+ sra KK, 2, L
|
|
+ mov B, BO
|
|
+ ble L, $L95
|
|
+
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 2, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 2, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c04
|
|
+
|
|
+ sra TMP1, 2, L
|
|
+ unop
|
|
+ ble L, $L95
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L92:
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ ldi L, -1(L)
|
|
+ MUL a2, b1, t2
|
|
+ LD a2, 5 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ unop
|
|
+ MUL a3, b1, t3
|
|
+ LD a3, 6 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a4, b1, t4
|
|
+ LD a4, 7 * SIZE(AO)
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b2, t1
|
|
+ LD a1, 8 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b2, t2
|
|
+ LD a2, 9 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ LD a3, 10 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a4, b2, t4
|
|
+ LD a4, 11 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ LD a1, 12 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ LD a2, 13 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ unop
|
|
+ MUL a3, b3, t3
|
|
+ LD a3, 14 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a4, b3, t4
|
|
+ LD a5, 15 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ MUL a1, b4, t1
|
|
+ LD a1, 16 * SIZE(AO)
|
|
+ ldi AO, 16 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ MUL a2, b4, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ MUL a3, b4, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a5, b4, t4
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ bgt L, $L92
|
|
+ .align 4
|
|
+
|
|
+$L95:
|
|
+#if defined(LT) || defined(RN)
|
|
+ and KK, 3, L
|
|
+#else
|
|
+ and TMP1, 3, L
|
|
+#endif
|
|
+ unop
|
|
+ ble L, $L98
|
|
+ .align 4
|
|
+
|
|
+$L96:
|
|
+ ADD c01, t1, c01
|
|
+ ldi L, -1(L)
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ ldi BO, 1 * SIZE(BO)
|
|
+ MUL a2, b1, t2
|
|
+ LD a2, 5 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ unop
|
|
+ MUL a3, b1, t3
|
|
+ LD a3, 6 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a4, b1, t4
|
|
+ LD a4, 7 * SIZE(AO)
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ bgt L, $L96
|
|
+ .align 4
|
|
+
|
|
+$L98:
|
|
+ ADD c01, t1, c01
|
|
+ ADD c02, t2, c02
|
|
+ ADD c03, t3, c03
|
|
+ ADD c04, t4, c04
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 4, TMP1
|
|
+#else
|
|
+ subl KK, 1, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+ SUB a3, c03, c03
|
|
+ SUB a4, c04, c04
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+ SUB a3, c03, c03
|
|
+ SUB a4, c04, c04
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 15 * SIZE(AO)
|
|
+ LD a2, 14 * SIZE(AO)
|
|
+ LD a3, 13 * SIZE(AO)
|
|
+ LD a4, 12 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c04, c04
|
|
+ MUL a2, c04, t1
|
|
+ SUB c03, t1, c03
|
|
+ MUL a3, c04, t1
|
|
+ SUB c02, t1, c02
|
|
+ MUL a4, c04, t1
|
|
+ SUB c01, t1, c01
|
|
+
|
|
+ LD b1, 10 * SIZE(AO)
|
|
+ LD b2, 9 * SIZE(AO)
|
|
+ LD b3, 8 * SIZE(AO)
|
|
+
|
|
+ MUL b1, c03, c03
|
|
+ MUL b2, c03, t1
|
|
+ SUB c02, t1, c02
|
|
+ MUL b3, c03, t1
|
|
+ SUB c01, t1, c01
|
|
+
|
|
+ LD a1, 5 * SIZE(AO)
|
|
+ LD a2, 4 * SIZE(AO)
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c02, c02
|
|
+ MUL a2, c02, t1
|
|
+ SUB c01, t1, c01
|
|
+ MUL a3, c01, c01
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a2, c01, t1
|
|
+ SUB c02, t1, c02
|
|
+ MUL a3, c01, t1
|
|
+ SUB c03, t1, c03
|
|
+ MUL a4, c01, t1
|
|
+ SUB c04, t1, c04
|
|
+
|
|
+ LD b1, 5 * SIZE(AO)
|
|
+ LD b2, 6 * SIZE(AO)
|
|
+ LD b3, 7 * SIZE(AO)
|
|
+
|
|
+ MUL b1, c02, c02
|
|
+ MUL b2, c02, t1
|
|
+ SUB c03, t1, c03
|
|
+ MUL b3, c02, t1
|
|
+ SUB c04, t1, c04
|
|
+
|
|
+ LD a1, 10 * SIZE(AO)
|
|
+ LD a2, 11 * SIZE(AO)
|
|
+ LD a3, 15 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c03, c03
|
|
+ MUL a2, c03, t1
|
|
+ SUB c04, t1, c04
|
|
+ MUL a3, c04, c04
|
|
+#endif
|
|
+
|
|
+#if defined(RN) || defined(RT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c04, c04
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c02, 1 * SIZE(BO)
|
|
+ ST c03, 2 * SIZE(BO)
|
|
+ ST c04, 3 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c03, 2 * SIZE(AO)
|
|
+ ST c04, 3 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -4 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c03, 2 * SIZE(C1)
|
|
+ ST c04, 3 * SIZE(C1)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 2 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L91
|
|
+ .align 4
|
|
+
|
|
+$L100:
|
|
+ and M, 2, I
|
|
+ ble I, $L110
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c04
|
|
+
|
|
+ sra KK, 2, L
|
|
+ mov B, BO
|
|
+ ble L, $L105
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 1, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c04
|
|
+
|
|
+ sra TMP1, 2, L
|
|
+ ble L, $L105
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L102:
|
|
+ ADD c01, t1, c01
|
|
+ ldi L, -1(L)
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ MUL a2, b1, t2
|
|
+ LD a2, 5 * SIZE(AO)
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ MUL a3, b2, t3
|
|
+ LD a3, 6 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a4, b2, t4
|
|
+ LD a5, 7 * SIZE(AO)
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ MUL a1, b3, t1
|
|
+ LD a1, 8 * SIZE(AO)
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ MUL a2, b3, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ MUL a3, b4, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a5, b4, t4
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ bgt L, $L102
|
|
+ .align 4
|
|
+
|
|
+$L105:
|
|
+#if defined(LT) || defined(RN)
|
|
+ and KK, 3, L
|
|
+#else
|
|
+ and TMP1, 3, L
|
|
+#endif
|
|
+ ble L, $L108
|
|
+ .align 4
|
|
+
|
|
+$L106:
|
|
+ ADD c01, t1, c01
|
|
+ ldi L, -1(L)
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ MUL a2, b1, t2
|
|
+ LD a2, 3 * SIZE(AO)
|
|
+ LD b1, 1 * SIZE(BO)
|
|
+
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ unop
|
|
+ ldi BO, 1 * SIZE(BO)
|
|
+ bgt L, $L106
|
|
+ .align 4
|
|
+
|
|
+$L108:
|
|
+ ADD c01, t1, c01
|
|
+ ADD c02, t2, c02
|
|
+ ADD c03, t3, c03
|
|
+ ADD c04, t4, c04
|
|
+
|
|
+ ADD c01, c03, c01
|
|
+ ADD c02, c04, c02
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 2, TMP1
|
|
+#else
|
|
+ subl KK, 1, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 3 * SIZE(AO)
|
|
+ LD a2, 2 * SIZE(AO)
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c02, c02
|
|
+ MUL a2, c02, t1
|
|
+ SUB c01, t1, c01
|
|
+ MUL a3, c01, c01
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a2, c01, t1
|
|
+ SUB c02, t1, c02
|
|
+ MUL a3, c02, c02
|
|
+#endif
|
|
+
|
|
+#if defined(RN) || defined(RT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c02, 1 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -2 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 2 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 1 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L110:
|
|
+ and M, 1, I
|
|
+ ble I, $L119
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c04
|
|
+
|
|
+ sra KK, 2, L
|
|
+ mov B, BO
|
|
+ unop
|
|
+ ble L, $L115
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 0, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c04
|
|
+
|
|
+ sra TMP1, 2, L
|
|
+ unop
|
|
+ ble L, $L115
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L112:
|
|
+ ADD c01, t1, c01
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ MUL a2, b2, t2
|
|
+ LD a2, 5 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ MUL a3, b3, t3
|
|
+ LD a3, 6 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a4, b4, t4
|
|
+ LD a4, 7 * SIZE(AO)
|
|
+ LD b4, 7 * SIZE(BO)
|
|
+
|
|
+ ldi L, -1(L)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ bgt L, $L112
|
|
+ .align 4
|
|
+
|
|
+$L115:
|
|
+#if defined(LT) || defined(RN)
|
|
+ and KK, 3, L
|
|
+#else
|
|
+ and TMP1, 3, L
|
|
+#endif
|
|
+ ble L, $L118
|
|
+ .align 4
|
|
+
|
|
+$L116:
|
|
+ ADD c01, t1, c01
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, 1 * SIZE(AO)
|
|
+ LD b1, 1 * SIZE(BO)
|
|
+
|
|
+ ldi L, -1(L)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ ldi BO, 1 * SIZE(BO)
|
|
+ bgt L, $L116
|
|
+ .align 4
|
|
+
|
|
+$L118:
|
|
+ ADD c01, t1, c01
|
|
+ ADD c02, t2, c02
|
|
+ ADD c03, t3, c03
|
|
+ ADD c04, t4, c04
|
|
+
|
|
+ ADD c01, c02, c01
|
|
+ ADD c03, c04, c03
|
|
+ ADD c01, c03, c01
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+ subl KK, 1, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ addl B, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+#endif
|
|
+
|
|
+#if defined(RN) || defined(RT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -1 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 1 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ SXADDQ K, AORIG, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L119:
|
|
+#ifdef LN
|
|
+ SXADDQ K, B, B
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ mov BO, B
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
+$L999:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+ clr $0
+ ldi $sp, STACKSIZE($sp)
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/trsm_kernel_4x4_RT.S b/kernel/sw_64/trsm_kernel_4x4_RT.S
new file mode 100644
index 0000000..b9a1975
--- /dev/null
+++ b/kernel/sw_64/trsm_kernel_4x4_RT.S
@@ -0,0 +1,5148 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#if !defined(EV4) && !defined(EV5) && !defined(SW6)
+#error "Architecture is not specified."
+#endif
+
+#ifdef SW6
+#define PREFETCHSIZE 56
+#define UNOP unop
+#endif
+
+#ifdef EV5
+#define PREFETCHSIZE 56
+#define UNOP
+#endif
+
+#ifdef EV4
+#define UNOP
+#endif
+
+#define STACKSIZE 88
+
+#define M $16
+#define N $17
+#define K $18
+#define A $20
+#define B $21
+#define C $22
+#define LDC $23
+
+#define C1 $19
+#define C2 $24
+#define C3 $25
+#define C4 $27
+
+#define AO $at
+#define BO $5
+#define I $6
+#define J $7
+#define L $8
+
+#define a1 $f16
+#define a2 $f17
+#define a3 $f18
+#define a4 $f19
+
+#define b1 $f20
+#define b2 $f21
+#define b3 $f22
+#define b4 $f23
+
+#define t1 $f24
+#define t2 $f25
+#define t3 $f26
+#define t4 $f27
+
+#define a5 $f28
+#define a6 $f30
+#define b5 $f29
+
+#define alpha $f30
+
+#define c01 $f0
+#define c02 $f1
+#define c03 $f2
+#define c04 $f3
+
+#define c05 $f4
+#define c06 $f5
+#define c07 $f6
+#define c08 $f7
+
+#define c09 $f8
+#define c10 $f9
+#define c11 $f10
+#define c12 $f11
+
+#define c13 $f12
+#define c14 $f13
+#define c15 $f14
+#define c16 $f15
+
+#define tmp $9
+
+#define TMP1 $0
+#define TMP2 $1
+#define KK $2
+#define AORIG $3
+#define OFFSET $4
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, STACKSIZE, $26, 0
+
+ ldi $sp, -STACKSIZE($sp)
+
+ ldl C, 0 + STACKSIZE($sp)
+ ldl LDC, 8 + STACKSIZE($sp)
+ ldl OFFSET, 16 + STACKSIZE($sp)
+
+ SXADDQ LDC, 0, LDC
+
+ fstd $f2, 0($sp)
+ fstd $f3, 8($sp)
+ fstd $f4, 16($sp)
+ fstd $f5, 24($sp)
+ fstd $f6, 32($sp)
+ fstd $f7, 40($sp)
+ fstd $f8, 48($sp)
+ fstd $f9, 56($sp)
+ stl $9, 64($sp)
+
+ cmple M, 0, $0
+ cmple N, 0, $1
+ cmple K, 0, $2
+
+ or $0, $1, $0
+ or $0, $2, $0
+ bne $0, $L999
+
+#ifdef LN
+ mulq M, K, TMP1
+ SXADDQ TMP1, A, A
+ SXADDQ M, C, C
+#endif
+
+#ifdef RN
+ negq OFFSET, KK
+#endif
+
+#ifdef RT
+ mull N, K, TMP1
+ SXADDQ TMP1, B, B
+
+ mull N, LDC, TMP1
+ addl TMP1, C, C
+
+ subl N, OFFSET, KK
+#endif
+
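+/* The RT variant walks the columns of C backwards, so the single tail column (N & 1) is solved first; the wider column blocks follow at $L40. */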
|
|
+ and N, 1, J
|
|
+ ble J, $L40
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, BASE_SHIFT, TMP1
|
|
+ subl B, TMP1, B
|
|
+
|
|
+ subl C, LDC, C
|
|
+#endif
|
|
+
|
|
+ mov C, C1
|
|
+#ifndef RT
|
|
+ addl C, LDC, C
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ addl M, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+ mov A, AORIG
|
|
+#else
|
|
+ mov A, AO
|
|
+#endif
|
|
+
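+/* Rows are blocked by four: $L91 solves 4-row blocks, with the narrower M & 2 and M & 1 tails handled from $L100 onwards. */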
|
|
+ sra M, 2, I
|
|
+ ble I, $L100
|
|
+ .align 4
|
|
+
|
|
+$L91:
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c04
|
|
+
|
|
+ sra KK, 2, L
|
|
+ mov B, BO
|
|
+ ble L, $L95
|
|
+
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 2, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 2, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c04
|
|
+
|
|
+ sra TMP1, 2, L
|
|
+ unop
|
|
+ ble L, $L95
|
|
+#endif
|
|
+ .align 5
|
|
+
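+/* Main accumulation loop: each ADD/MUL result is staged through b5 and copied back with fmov before the source registers are reloaded. */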
|
|
+$L92:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ldi L, -1(L)
|
|
+ MUL a2, b1, b5
|
|
+ fmov b5, t2
|
|
+ LD a2, 5 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL a3, b1, b5
|
|
+ fmov b5, t3
|
|
+ LD a3, 6 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a4, b1, b5
|
|
+ fmov b5, t4
|
|
+ LD a4, 7 * SIZE(AO)
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 8 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b2, b5
|
|
+ fmov b5, t2
|
|
+ LD a2, 9 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL a3, b2, b5
|
|
+ fmov b5, t3
|
|
+ LD a3, 10 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a4, b2, b5
|
|
+ fmov b5, t4
|
|
+ LD a4, 11 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b3, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 12 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b3, b5
|
|
+ fmov b5, t2
|
|
+ LD a2, 13 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL a3, b3, b5
|
|
+ fmov b5, t3
|
|
+ LD a3, 14 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a4, b3, b5
|
|
+ fmov b5, t4
|
|
+ LD a5, 15 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b4, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 16 * SIZE(AO)
|
|
+ ldi AO, 16 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ MUL a2, b4, b5
|
|
+ fmov b5, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ MUL a3, b4, b5
|
|
+ fmov b5, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a5, b4, b5
|
|
+ fmov b5, t4
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ bgt L, $L92
|
|
+ .align 4
|
|
+
|
|
+$L95:
|
|
+#if defined(LT) || defined(RN)
|
|
+ and KK, 3, L
|
|
+#else
|
|
+ and TMP1, 3, L
|
|
+#endif
|
|
+ unop
|
|
+ ble L, $L98
|
|
+ .align 4
|
|
+
|
|
+$L96:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ldi L, -1(L)
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ldi BO, 1 * SIZE(BO)
|
|
+ MUL a2, b1, b5
|
|
+ fmov b5, t2
|
|
+ LD a2, 5 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL a3, b1, b5
|
|
+ fmov b5, t3
|
|
+ LD a3, 6 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a4, b1, b5
|
|
+ fmov b5, t4
|
|
+ LD a4, 7 * SIZE(AO)
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ bgt L, $L96
|
|
+ .align 4
|
|
+
|
|
+$L98:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 4, TMP1
|
|
+#else
|
|
+ subl KK, 1, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a3, c03, b5
|
|
+ fmov b5, c03
|
|
+ SUB a4, c04, b5
|
|
+ fmov b5, c04
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a3, c03, b5
|
|
+ fmov b5, c03
|
|
+ SUB a4, c04, b5
|
|
+ fmov b5, c04
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 15 * SIZE(AO)
|
|
+ LD a2, 14 * SIZE(AO)
|
|
+ LD a3, 13 * SIZE(AO)
|
|
+ LD a4, 12 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c04, b5
|
|
+ fmov b5, c04
|
|
+ MUL a2, c04, b5
|
|
+ fmov b5, t1
|
|
+ SUB c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ MUL a3, c04, b5
|
|
+ fmov b5, t1
|
|
+ SUB c02, t1, b5
|
|
+ fmov b5, c02
|
|
+ MUL a4, c04, b5
|
|
+ fmov b5, t1
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+
|
|
+ LD b1, 10 * SIZE(AO)
|
|
+ LD b2, 9 * SIZE(AO)
|
|
+ LD b3, 8 * SIZE(AO)
|
|
+
|
|
+ MUL b1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL b2, c03, b5
|
|
+ fmov b5, t1
|
|
+ SUB c02, t1, b5
|
|
+ fmov b5, c02
|
|
+ MUL b3, c03, b5
|
|
+ fmov b5, t1
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+
|
|
+ LD a1, 5 * SIZE(AO)
|
|
+ LD a2, 4 * SIZE(AO)
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, c02, b5
|
|
+ fmov b5, t1
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, c01
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a2, c01, b5
|
|
+ fmov b5, t1
|
|
+ SUB c02, t1, b5
|
|
+ fmov b5, c02
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, t1
|
|
+ SUB c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ MUL a4, c01, b5
|
|
+ fmov b5, t1
|
|
+ SUB c04, t1, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ LD b1, 5 * SIZE(AO)
|
|
+ LD b2, 6 * SIZE(AO)
|
|
+ LD b3, 7 * SIZE(AO)
|
|
+
|
|
+ MUL b1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL b2, c02, b5
|
|
+ fmov b5, t1
|
|
+ SUB c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ MUL b3, c02, b5
|
|
+ fmov b5, t1
|
|
+ SUB c04, t1, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ LD a1, 10 * SIZE(AO)
|
|
+ LD a2, 11 * SIZE(AO)
|
|
+ LD a3, 15 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a2, c03, b5
|
|
+ fmov b5, t1
|
|
+ SUB c04, t1, b5
|
|
+ fmov b5, c04
|
|
+ MUL a3, c04, b5
|
|
+ fmov b5, c04
|
|
+#endif
|
|
+
|
|
+#if defined(RN) || defined(RT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a1, c04, b5
|
|
+ fmov b5, c04
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c02, 1 * SIZE(BO)
|
|
+ ST c03, 2 * SIZE(BO)
|
|
+ ST c04, 3 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c03, 2 * SIZE(AO)
|
|
+ ST c04, 3 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -4 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c03, 2 * SIZE(C1)
|
|
+ ST c04, 3 * SIZE(C1)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 2 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L91
|
|
+ .align 4
|
|
+
|
|
+$L100:
|
|
+ and M, 2, I
|
|
+ ble I, $L110
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c04
|
|
+
|
|
+ sra KK, 2, L
|
|
+ mov B, BO
|
|
+ ble L, $L105
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 1, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c04
|
|
+
|
|
+ sra TMP1, 2, L
|
|
+ ble L, $L105
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L102:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ldi L, -1(L)
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, b1, b5
|
|
+ fmov b5, t2
|
|
+ LD a2, 5 * SIZE(AO)
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ MUL a3, b2, b5
|
|
+ fmov b5, t3
|
|
+ LD a3, 6 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a4, b2, b5
|
|
+ fmov b5, t4
|
|
+ LD a5, 7 * SIZE(AO)
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b3, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 8 * SIZE(AO)
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, b3, b5
|
|
+ fmov b5, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ MUL a3, b4, b5
|
|
+ fmov b5, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a5, b4, b5
|
|
+ fmov b5, t4
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ bgt L, $L102
|
|
+ .align 4
|
|
+
|
|
+$L105:
|
|
+#if defined(LT) || defined(RN)
|
|
+ and KK, 3, L
|
|
+#else
|
|
+ and TMP1, 3, L
|
|
+#endif
|
|
+ ble L, $L108
|
|
+ .align 4
|
|
+
|
|
+$L106:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ldi L, -1(L)
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, b1, b5
|
|
+ fmov b5, t2
|
|
+ LD a2, 3 * SIZE(AO)
|
|
+ LD b1, 1 * SIZE(BO)
|
|
+
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ unop
|
|
+ ldi BO, 1 * SIZE(BO)
|
|
+ bgt L, $L106
|
|
+ .align 4
|
|
+
|
|
+$L108:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ ADD c01, c03, b5
|
|
+ fmov b5, c01
|
|
+ ADD c02, c04, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 2, TMP1
|
|
+#else
|
|
+ subl KK, 1, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 3 * SIZE(AO)
|
|
+ LD a2, 2 * SIZE(AO)
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, c02, b5
|
|
+ fmov b5, t1
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, c01
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a2, c01, b5
|
|
+ fmov b5, t1
|
|
+ SUB c02, t1, b5
|
|
+ fmov b5, c02
|
|
+ MUL a3, c02, b5
|
|
+ fmov b5, c02
|
|
+#endif
|
|
+
|
|
+#if defined(RN) || defined(RT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c02, 1 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -2 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 2 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 1 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L110:
|
|
+ and M, 1, I
|
|
+ ble I, $L119
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c04
|
|
+
|
|
+ sra KK, 2, L
|
|
+ mov B, BO
|
|
+ unop
|
|
+ ble L, $L115
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 0, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c04
|
|
+
|
|
+ sra TMP1, 2, L
|
|
+ unop
|
|
+ ble L, $L115
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L112:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, b2, b5
|
|
+ fmov b5, t2
|
|
+ LD a2, 5 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ MUL a3, b3, b5
|
|
+ fmov b5, t3
|
|
+ LD a3, 6 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a4, b4, b5
|
|
+ fmov b5, t4
|
|
+ LD a4, 7 * SIZE(AO)
|
|
+ LD b4, 7 * SIZE(BO)
|
|
+
|
|
+ ldi L, -1(L)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ bgt L, $L112
|
|
+ .align 4
|
|
+
|
|
+$L115:
|
|
+#if defined(LT) || defined(RN)
|
|
+ and KK, 3, L
|
|
+#else
|
|
+ and TMP1, 3, L
|
|
+#endif
|
|
+ ble L, $L118
|
|
+ .align 4
|
|
+
|
|
+$L116:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 1 * SIZE(AO)
|
|
+ LD b1, 1 * SIZE(BO)
|
|
+
|
|
+ ldi L, -1(L)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ ldi BO, 1 * SIZE(BO)
|
|
+ bgt L, $L116
|
|
+ .align 4
|
|
+
|
|
+$L118:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ ADD c01, c02, b5
|
|
+ fmov b5, c01
|
|
+ ADD c03, c04, b5
|
|
+ fmov b5, c03
|
|
+ ADD c01, c03, b5
|
|
+ fmov b5, c01
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+ subl KK, 1, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ addl B, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+#endif
|
|
+
|
|
+#if defined(RN) || defined(RT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -1 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 1 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ SXADDQ K, AORIG, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L119:
|
|
+#ifdef LN
|
|
+ SXADDQ K, B, B
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ mov BO, B
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L40:
|
|
+ and N, 2, J
|
|
+ ble J, $L80
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 1 + BASE_SHIFT, TMP1
|
|
+ subl B, TMP1, B
|
|
+
|
|
+ addl LDC, LDC, TMP1
|
|
+ subl C, TMP1, C
|
|
+#endif
|
|
+
|
|
+ mov C, C1
|
|
+ addl C, LDC, C2
|
|
+ fclr t1
|
|
+#ifndef RT
|
|
+ addl C2, LDC, C
|
|
+#endif
|
|
+ fclr t2
|
|
+
|
|
+#ifdef LN
|
|
+ addl M, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+ mov A, AORIG
|
|
+#else
|
|
+ mov A, AO
|
|
+#endif
|
|
+
|
|
+ sra M, 2, I
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+ ble I, $L60
|
|
+ .align 4
|
|
+
|
|
+$L51:
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c03
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c07
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c04
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c08
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c05
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c06
|
|
+
|
|
+ ldi L, -2(KK)
|
|
+
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ble KK, $L58
|
|
+
|
|
+ ble L, $L55
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 2, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 2, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c03
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c07
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c04
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c08
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c05
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c06
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ble TMP1, $L58
|
|
+
|
|
+ ble L, $L55
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L52:
|
|
+ ADD c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ unop
|
|
+
|
|
+ ADD c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ ldi L, -2(L)
|
|
+ MUL a2, b1, b5
|
|
+ fmov b5, t2
|
|
+ unop
|
|
+
|
|
+ ADD c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ unop
|
|
+ MUL a3, b1, b5
|
|
+ fmov b5, t3
|
|
+ unop
|
|
+
|
|
+ ADD c08, t4, b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL a4, b1, b5
|
|
+ fmov b5, t4
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ MUL a2, b2, b5
|
|
+ fmov b5, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL a3, b2, b5
|
|
+ fmov b5, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ unop
|
|
+ MUL a4, b2, b5
|
|
+ fmov b5, t4
|
|
+ LD a5, 3 * SIZE(AO)
|
|
+
|
|
+ ADD c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL a1, b3, b5
|
|
+ fmov b5, t1
|
|
+ LD b2, -1 * SIZE(BO)
|
|
+
|
|
+ ADD c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL a2, b3, b5
|
|
+ fmov b5, t2
|
|
+ unop
|
|
+
|
|
+ ADD c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ unop
|
|
+ MUL a3, b3, b5
|
|
+ fmov b5, t3
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+
|
|
+ ADD c08, t4, b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL a5, b3, b5
|
|
+ fmov b5, t4
|
|
+ LD b3, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b4, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, -4 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b4, b5
|
|
+ fmov b5, t2
|
|
+ LD a2, -3 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+ MUL a3, b4, b5
|
|
+ fmov b5, t3
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a5, b4, b5
|
|
+ fmov b5, t4
|
|
+ LD b4, 1 * SIZE(BO)
|
|
+ bgt L, $L52
|
|
+ .align 4
|
|
+
|
|
+$L55:
|
|
+ ADD c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L57
|
|
+#else
|
|
+ blbs TMP1, $L57
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ MUL a2, b1, b5
|
|
+ fmov b5, t2
|
|
+ ADD c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ MUL a3, b1, b5
|
|
+ fmov b5, t3
|
|
+
|
|
+ ADD c08, t4, b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL a4, b1, b5
|
|
+ fmov b5, t4
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b2, b5
|
|
+ fmov b5, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL a3, b2, b5
|
|
+ fmov b5, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a4, b2, b5
|
|
+ fmov b5, t4
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L57:
|
|
+ ADD c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ MUL a2, b1, b5
|
|
+ fmov b5, t2
|
|
+ ADD c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ MUL a3, b1, b5
|
|
+ fmov b5, t3
|
|
+
|
|
+ ADD c08, t4, b5
|
|
+ fmov b5, c08
|
|
+ MUL a4, b1, b5
|
|
+ fmov b5, t4
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t1
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, b2, b5
|
|
+ fmov b5, t2
|
|
+ ADD c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ MUL a3, b2, b5
|
|
+ fmov b5, t3
|
|
+
|
|
+ ADD c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL a4, b2, b5
|
|
+ fmov b5, t4
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ ADD c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ ADD c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ ADD c08, t4, b5
|
|
+ fmov b5, c08
|
|
+ .align 4
|
|
+
|
|
+$L58:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 4, TMP1
|
|
+#else
|
|
+ subl KK, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -4 * SIZE(AO)
|
|
+ ldi BO, -2 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+ LD b4, 7 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c05, b5
|
|
+ fmov b5, c05
|
|
+ SUB a3, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a4, c06, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ SUB b1, c03, b5
|
|
+ fmov b5, c03
|
|
+ SUB b2, c07, b5
|
|
+ fmov b5, c07
|
|
+ SUB b3, c04, b5
|
|
+ fmov b5, c04
|
|
+ SUB b4, c08, b5
|
|
+ fmov b5, c08
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ LD b1, 4 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(AO)
|
|
+ LD b4, 7 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a3, c03, b5
|
|
+ fmov b5, c03
|
|
+ SUB a4, c04, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ SUB b1, c05, b5
|
|
+ fmov b5, c05
|
|
+ SUB b2, c06, b5
|
|
+ fmov b5, c06
|
|
+ SUB b3, c07, b5
|
|
+ fmov b5, c07
|
|
+ SUB b4, c08, b5
|
|
+ fmov b5, c08
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 15 * SIZE(AO)
|
|
+ LD a2, 14 * SIZE(AO)
|
|
+ LD a3, 13 * SIZE(AO)
|
|
+ LD a4, 12 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c04, b5
|
|
+ fmov b5, c04
|
|
+ MUL a1, c08, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ MUL a2, c04, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c08, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ SUB c07, t2, b5
|
|
+ fmov b5, c07
|
|
+
|
|
+ MUL a3, c04, b5
|
|
+ fmov b5, t1
|
|
+ MUL a3, c08, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c02, t1, b5
|
|
+ fmov b5, c02
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL a4, c04, b5
|
|
+ fmov b5, t1
|
|
+ MUL a4, c08, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c05, t2, b5
|
|
+ fmov b5, c05
|
|
+
|
|
+ LD b1, 10 * SIZE(AO)
|
|
+ LD b2, 9 * SIZE(AO)
|
|
+ LD b3, 8 * SIZE(AO)
|
|
+
|
|
+ MUL b1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL b1, c07, b5
|
|
+ fmov b5, c07
|
|
+
|
|
+ MUL b2, c03, b5
|
|
+ fmov b5, t1
|
|
+ MUL b2, c07, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c02, t1, b5
|
|
+ fmov b5, c02
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL b3, c03, b5
|
|
+ fmov b5, t1
|
|
+ MUL b3, c07, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c05, t2, b5
|
|
+ fmov b5, c05
|
|
+
|
|
+ LD a1, 5 * SIZE(AO)
|
|
+ LD a2, 4 * SIZE(AO)
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a1, c06, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL a2, c02, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c06, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c05, t2, b5
|
|
+ fmov b5, c05
|
|
+
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a3, c05, b5
|
|
+ fmov b5, c05
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c05, b5
|
|
+ fmov b5, c05
|
|
+
|
|
+ MUL a2, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c05, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c02, t1, b5
|
|
+ fmov b5, c02
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a3, c05, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ SUB c07, t2, b5
|
|
+ fmov b5, c07
|
|
+
|
|
+ MUL a4, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a4, c05, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c04, t1, b5
|
|
+ fmov b5, c04
|
|
+ SUB c08, t2, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ LD b1, 5 * SIZE(AO)
|
|
+ LD b2, 6 * SIZE(AO)
|
|
+ LD b3, 7 * SIZE(AO)
|
|
+
|
|
+ MUL b1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL b1, c06, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL b2, c02, b5
|
|
+ fmov b5, t1
|
|
+ MUL b2, c06, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ SUB c07, t2, b5
|
|
+ fmov b5, c07
|
|
+
|
|
+ MUL b3, c02, b5
|
|
+ fmov b5, t1
|
|
+ MUL b3, c06, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c04, t1, b5
|
|
+ fmov b5, c04
|
|
+ SUB c08, t2, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ LD a1, 10 * SIZE(AO)
|
|
+ LD a2, 11 * SIZE(AO)
|
|
+ LD a3, 15 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a1, c07, b5
|
|
+ fmov b5, c07
|
|
+
|
|
+ MUL a2, c03, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c07, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c04, t1, b5
|
|
+ fmov b5, c04
|
|
+ SUB c08, t2, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ MUL a3, c04, b5
|
|
+ fmov b5, c04
|
|
+ MUL a3, c08, b5
|
|
+ fmov b5, c08
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a1, c04, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ MUL a2, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c02, b5
|
|
+ fmov b5, t2
|
|
+ MUL a2, c03, b5
|
|
+ fmov b5, t3
|
|
+ MUL a2, c04, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ SUB c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ SUB c08, t4, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ MUL a3, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a3, c06, b5
|
|
+ fmov b5, c06
|
|
+ MUL a3, c07, b5
|
|
+ fmov b5, c07
|
|
+ MUL a3, c08, b5
|
|
+ fmov b5, c08
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 3 * SIZE(BO)
|
|
+ LD a2, 2 * SIZE(BO)
|
|
+ LD a3, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, c06, b5
|
|
+ fmov b5, c06
|
|
+ MUL a1, c07, b5
|
|
+ fmov b5, c07
|
|
+ MUL a1, c08, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ MUL a2, c05, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c06, b5
|
|
+ fmov b5, t2
|
|
+ MUL a2, c07, b5
|
|
+ fmov b5, t3
|
|
+ MUL a2, c08, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ SUB c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ SUB c04, t4, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a3, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a3, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a3, c04, b5
|
|
+ fmov b5, c04
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c05, 1 * SIZE(BO)
|
|
+ ST c02, 2 * SIZE(BO)
|
|
+ ST c06, 3 * SIZE(BO)
|
|
+
|
|
+ ST c03, 4 * SIZE(BO)
|
|
+ ST c07, 5 * SIZE(BO)
|
|
+ ST c04, 6 * SIZE(BO)
|
|
+ ST c08, 7 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c03, 2 * SIZE(AO)
|
|
+ ST c04, 3 * SIZE(AO)
|
|
+
|
|
+ ST c05, 4 * SIZE(AO)
|
|
+ ST c06, 5 * SIZE(AO)
|
|
+ ST c07, 6 * SIZE(AO)
|
|
+ ST c08, 7 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -4 * SIZE(C1)
|
|
+ ldi C2, -4 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c03, 2 * SIZE(C1)
|
|
+ ST c04, 3 * SIZE(C1)
|
|
+
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+ ST c06, 1 * SIZE(C2)
|
|
+ ST c07, 2 * SIZE(C2)
|
|
+ ST c08, 3 * SIZE(C2)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+ ldi C2, 4 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 2 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ bgt I, $L51
|
|
+ .align 4
|
|
+
|
|
+$L60:
|
|
+ and M, 2, I
|
|
+ ble I, $L70
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ ldi L, -2(KK)
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+
|
|
+ ble KK, $L68
|
|
+
|
|
+ ble L, $L65
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 1, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ ldi L, -2(TMP1)
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ble TMP1, $L68
|
|
+
|
|
+ ble L, $L65
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L62:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ unop
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL a2, b1, b5
|
|
+ fmov b5, t2
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ ldi L, -2(L)
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t3
|
|
+ LD a1, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL a2, b2, b5
|
|
+ fmov b5, t4
|
|
+ LD a2, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ LD b2, 3 * SIZE(BO)
|
|
+ MUL a3, b3, b5
|
|
+ fmov b5, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a4, b3, b5
|
|
+ fmov b5, t2
|
|
+ LD b3, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL a3, b4, b5
|
|
+ fmov b5, t3
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ MUL a4, b4, b5
|
|
+ fmov b5, t4
|
|
+ LD b4, 1 * SIZE(BO)
|
|
+ unop
|
|
+
|
|
+ LD a4, 1 * SIZE(AO)
|
|
+ unop
|
|
+ unop
|
|
+ bgt L, $L62
|
|
+ .align 4
|
|
+
|
|
+$L65:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L67
|
|
+#else
|
|
+ blbs TMP1, $L67
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b1, b5
|
|
+ fmov b5, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t3
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL a2, b2, b5
|
|
+ fmov b5, t4
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ LD b2, -1 * SIZE(BO)
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ .align 4
|
|
+
|
|
+$L67:
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, b1, b5
|
|
+ fmov b5, t2
|
|
+ ADD c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t3
|
|
+
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a2, b2, b5
|
|
+ fmov b5, t4
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ .align 4
|
|
+
|
|
+$L68:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 2, TMP1
|
|
+#else
|
|
+ subl KK, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -2 * SIZE(AO)
|
|
+ ldi BO, -2 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c05, b5
|
|
+ fmov b5, c05
|
|
+ SUB a3, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a4, c06, b5
|
|
+ fmov b5, c06
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a3, c05, b5
|
|
+ fmov b5, c05
|
|
+ SUB a4, c06, b5
|
|
+ fmov b5, c06
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 3 * SIZE(AO)
|
|
+ LD a2, 2 * SIZE(AO)
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a1, c06, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL a2, c02, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c06, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c05, t2, b5
|
|
+ fmov b5, c05
|
|
+
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a3, c05, b5
|
|
+ fmov b5, c05
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c05, b5
|
|
+ fmov b5, c05
|
|
+
|
|
+ MUL a2, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c05, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c02, t1, b5
|
|
+ fmov b5, c02
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL a3, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a3, c06, b5
|
|
+ fmov b5, c06
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ MUL a2, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c02, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL a3, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a3, c06, b5
|
|
+ fmov b5, c06
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 3 * SIZE(BO)
|
|
+ LD a2, 2 * SIZE(BO)
|
|
+ LD a3, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, c06, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL a2, c05, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c06, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c02, t2, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a3, c02, b5
|
|
+ fmov b5, c02
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c05, 1 * SIZE(BO)
|
|
+ ST c02, 2 * SIZE(BO)
|
|
+ ST c06, 3 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c05, 2 * SIZE(AO)
|
|
+ ST c06, 3 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -2 * SIZE(C1)
|
|
+ ldi C2, -2 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+ ST c06, 1 * SIZE(C2)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 2 * SIZE(C1)
|
|
+ ldi C2, 2 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 1 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L70:
|
|
+ and M, 1, I
|
|
+ ble I, $L79
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c06
|
|
+
|
|
+ ldi L, -2(KK)
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+
|
|
+ ble KK, $L78
|
|
+
|
|
+ ble L, $L75
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 0, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c06
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ble TMP1, $L78
|
|
+
|
|
+ ble L, $L75
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L72:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ldi L, -2(L)
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t2
|
|
+ LD a1, 1 * SIZE(AO)
|
|
+ LD b2, 3 * SIZE(BO)
|
|
+
|
|
+ ADD c02, t3, b5
|
|
+ fmov b5, c02
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a2, b3, b5
|
|
+ fmov b5, t3
|
|
+ LD b3, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ MUL a2, b4, b5
|
|
+ fmov b5, t4
|
|
+ LD a2, 0 * SIZE(AO)
|
|
+ LD b4, 5 * SIZE(BO)
|
|
+
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ unop
|
|
+ unop
|
|
+ bgt L, $L72
|
|
+ .align 4
|
|
+
|
|
+$L75:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L77
|
|
+#else
|
|
+ blbs TMP1, $L77
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t2
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L77:
|
|
+ ADD c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t2
|
|
+ ADD c02, t3, b5
|
|
+ fmov b5, c02
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ ADD c01, c02, b5
|
|
+ fmov b5, c01
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ ADD c05, c06, b5
|
|
+ fmov b5, c05
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD c05, t2, b5
|
|
+ fmov b5, c05
|
|
+
|
|
+ .align 4
|
|
+
|
|
+$L78:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 1, TMP1
|
|
+#else
|
|
+ subl KK, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -1 * SIZE(AO)
|
|
+ ldi BO, -2 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c05, b5
|
|
+ fmov b5, c05
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c05, b5
|
|
+ fmov b5, c05
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c05, b5
|
|
+ fmov b5, c05
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a2, c01, b5
|
|
+ fmov b5, t1
|
|
+ SUB c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ MUL a3, c05, b5
|
|
+ fmov b5, c05
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 3 * SIZE(BO)
|
|
+ LD a2, 2 * SIZE(BO)
|
|
+ LD a3, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a2, c05, b5
|
|
+ fmov b5, t1
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, c01
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c05, 1 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c05, 1 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -1 * SIZE(C1)
|
|
+ ldi C2, -1 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 0 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L79:
|
|
+#ifdef LN
|
|
+ sll K, 1 + BASE_SHIFT, TMP1
|
|
+ addl B, TMP1, B
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ mov BO, B
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L80:
|
|
+ sra N, 2, J
|
|
+ ble J, $L999
|
|
+ .align 4
|
|
+
|
|
+$L01:
|
|
+#ifdef RT
|
|
+ sll K, 2 + BASE_SHIFT, TMP1
|
|
+ subl B, TMP1, B
|
|
+
|
|
+ s4addl LDC, 0, TMP1
|
|
+ subl C, TMP1, C
|
|
+#endif
|
|
+
|
|
+ mov C, C1
|
|
+ addl C, LDC, C2
|
|
+ addl C2, LDC, C3
|
|
+#ifndef RT
|
|
+ s4addl LDC, C, C
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ addl C3, LDC, C4
|
|
+ fclr t2
|
|
+
|
|
+#ifdef LN
|
|
+ addl M, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+ mov A, AORIG
|
|
+#else
|
|
+ mov A, AO
|
|
+#endif
|
|
+
|
|
+ sra M, 2, I
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+ ble I, $L20
|
|
+ .align 4
|
|
+
|
|
+$L11:
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c11
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c12
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c16
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c15
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c02
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c06
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c05
|
|
+
|
|
+ fillcs 4 * SIZE(C1)
|
|
+ fclr c03
|
|
+ ldi L, -2(KK)
|
|
+ fclr c04
|
|
+
|
|
+ fillcs 7 * SIZE(C2)
|
|
+ flds $f31, 7 * SIZE(C2)
|
|
+ fclr c08
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+ fclr c13
|
|
+
|
|
+ fillcs 4 * SIZE(C3)
|
|
+ fclr c09
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c10
|
|
+
|
|
+ fillcs 7 * SIZE(C4)
|
|
+ fclr c14
|
|
+ fclr c07
|
|
+ ble KK, $L18
|
|
+#else
|
|
+
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 2, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 2, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c11
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c12
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c16
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c15
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c02
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c06
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c05
|
|
+
|
|
+ fillcs 4 * SIZE(C1)
|
|
+ fclr c03
|
|
+ ldi L, -2(TMP1)
|
|
+ fclr c04
|
|
+
|
|
+ fillcs 7 * SIZE(C2)
|
|
+ fclr c08
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ fclr c13
|
|
+
|
|
+ fillcs 4 * SIZE(C3)
|
|
+ fclr c09
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c10
|
|
+
|
|
+ fillcs 7 * SIZE(C4)
|
|
+ fclr c14
|
|
+ fclr c07
|
|
+ ble TMP1, $L18
|
|
+#endif
|
|
+
|
|
+ ble L, $L15
|
|
+ .align 5
|
|
+
|
|
+$L12:
|
|
+/* 1 */
|
|
+ ADD c11, t1, b5
|
|
+ fmov b5, c11
|
|
+#ifndef EV4
|
|
+ fillcs PREFETCHSIZE * SIZE(AO)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+ MUL b1, a1, b5
|
|
+ fmov b5, t1
|
|
+#ifndef EV4
|
|
+ fillcs PREFETCHSIZE * SIZE(BO)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c12, t2, b5
|
|
+ fmov b5, c12
|
|
+ unop
|
|
+ MUL b1, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD c16, t3, b5
|
|
+ fmov b5, c16
|
|
+ unop
|
|
+ MUL b2, a2, t3
|
|
+ LD a5, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c15, t4, b5
|
|
+ fmov b5, c15
|
|
+ unop
|
|
+ MUL b2, a1, t4
|
|
+ LD b5, 0 * SIZE(BO)
|
|
+ FIMOVD b5, tmp
|
|
+
|
|
+/* 2 */
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ UNOP
|
|
+ MUL b1, a3, t1
|
|
+ UNOP
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ UNOP
|
|
+ MUL b1, a4, t2
|
|
+ UNOP
|
|
+
|
|
+ ADD c06, t3, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL b2, a4, t3
|
|
+ unop
|
|
+
|
|
+ ADD c05, t4, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL b4, a1, t4
|
|
+ unop
|
|
+
|
|
+/* 3 */
|
|
+ ADD c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL b3, a1, t1
|
|
+ unop
|
|
+
|
|
+ ADD c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ unop
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD c08, t3, b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL b2, a3, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+/* 4 */
|
|
+ ADD c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ unop
|
|
+ MUL b3, a3, t1
|
|
+ LD a6, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c14, t3, b5
|
|
+ fmov b5, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ ADD c07, t4, b5
|
|
+ fmov b5, c07
|
|
+ unop
|
|
+ MUL b4, a3, t4
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+
|
|
+/* 5 */
|
|
+ ADD c11, t1, b5
|
|
+ fmov b5, c11
|
|
+ unop
|
|
+ IFMOVD tmp, b5
|
|
+ MUL b5, a5, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c12, t2, b5
|
|
+ fmov b5, c12
|
|
+ ldi L, -2(L)
|
|
+ IFMOVD tmp, b5
|
|
+ MUL b5, a2, t2
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c16, t3, b5
|
|
+ fmov b5, c16
|
|
+ unop
|
|
+ MUL b2, a2, t3
|
|
+ unop
|
|
+
|
|
+ ADD c15, t4, b5
|
|
+ fmov b5, c15
|
|
+ unop
|
|
+ MUL b2, a5, t4
|
|
+ unop
|
|
+
|
|
+/* 6 */
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ IFMOVD tmp, b5
|
|
+ MUL b5, a6, t1
|
|
+ unop
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ IFMOVD tmp, b5
|
|
+ MUL b5, a4, t2
|
|
+ unop
|
|
+
|
|
+ ADD c06, t3, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL b2, a4, t3
|
|
+ unop
|
|
+
|
|
+ ADD c05, t4, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL b4, a5, t4
|
|
+ unop
|
|
+
|
|
+/* 7 */
|
|
+ ADD c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+ MUL b3, a5, t1
|
|
+ unop
|
|
+
|
|
+ ADD c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD c08, t3, b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, -3 * SIZE(AO)
|
|
+
|
|
+ ADD c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL b2, a6, t4
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+/* 8 */
|
|
+ ADD c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ unop
|
|
+ MUL b3, a6, t1
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD c14, t3, b5
|
|
+ fmov b5, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c07, t4, b5
|
|
+ fmov b5, c07
|
|
+ MUL b4, a6, t4
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+ bgt L, $L12
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ ADD c11, t1, b5
|
|
+ fmov b5, c11
|
|
+ MUL b1, a1, b5
|
|
+ fmov b5, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L17
|
|
+#else
|
|
+ blbs TMP1, $L17
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c12, t2, b5
|
|
+ fmov b5, c12
|
|
+ MUL b1, a2, b5
|
|
+ fmov b5, t2
|
|
+ ADD c16, t3, b5
|
|
+ fmov b5, c16
|
|
+ MUL b2, a2, b5
|
|
+ fmov b5, t3
|
|
+
|
|
+ ADD c15, t4, b5
|
|
+ fmov b5, c15
|
|
+ MUL b2, a1, b5
|
|
+ fmov b5, t4
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL b1, a3, b5
|
|
+ fmov b5, t1
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL b1, a4, b5
|
|
+ fmov b5, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c06, t3, b5
|
|
+ fmov b5, c06
|
|
+ MUL b2, a4, b5
|
|
+ fmov b5, t3
|
|
+ ADD c05, t4, b5
|
|
+ fmov b5, c05
|
|
+ MUL b4, a1, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ ADD c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL b3, a1, b5
|
|
+ fmov b5, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ unop
|
|
+ MUL b3, a2, b5
|
|
+ fmov b5, t2
|
|
+ unop
|
|
+
|
|
+ ADD c08, t3, b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL b4, a2, b5
|
|
+ fmov b5, t3
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL b2, a3, b5
|
|
+ fmov b5, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ unop
|
|
+ MUL b3, a3, b5
|
|
+ fmov b5, t1
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL b3, a4, b5
|
|
+ fmov b5, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c14, t3, b5
|
|
+ fmov b5, c14
|
|
+ unop
|
|
+ MUL b4, a4, b5
|
|
+ fmov b5, t3
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c07, t4, b5
|
|
+ fmov b5, c07
|
|
+ unop
|
|
+ MUL b4, a3, b5
|
|
+ fmov b5, t4
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c11, t1, b5
|
|
+ fmov b5, c11
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL b1, a1, b5
|
|
+ fmov b5, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L17:
|
|
+ ADD c12, t2, b5
|
|
+ fmov b5, c12
|
|
+ MUL b1, a2, b5
|
|
+ fmov b5, t2
|
|
+ ADD c16, t3, b5
|
|
+ fmov b5, c16
|
|
+ MUL b2, a2, b5
|
|
+ fmov b5, t3
|
|
+
|
|
+ ADD c15, t4, b5
|
|
+ fmov b5, c15
|
|
+ MUL b2, a1, b5
|
|
+ fmov b5, t4
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL b1, a3, b5
|
|
+ fmov b5, t1
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL b1, a4, b5
|
|
+ fmov b5, t2
|
|
+ ADD c06, t3, b5
|
|
+ fmov b5, c06
|
|
+ MUL b2, a4, b5
|
|
+ fmov b5, t3
|
|
+
|
|
+ ADD c05, t4, b5
|
|
+ fmov b5, c05
|
|
+ MUL b4, a1, b5
|
|
+ fmov b5, t4
|
|
+ ADD c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ MUL b3, a1, b5
|
|
+ fmov b5, t1
|
|
+
|
|
+ ADD c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ MUL b3, a2, b5
|
|
+ fmov b5, t2
|
|
+ ADD c08, t3, b5
|
|
+ fmov b5, c08
|
|
+ MUL b4, a2, b5
|
|
+ fmov b5, t3
|
|
+
|
|
+ ADD c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ MUL b2, a3, b5
|
|
+ fmov b5, t4
|
|
+ ADD c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ MUL b3, a3, b5
|
|
+ fmov b5, t1
|
|
+
|
|
+ ADD c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ MUL b3, a4, b5
|
|
+ fmov b5, t2
|
|
+ ADD c14, t3, b5
|
|
+ fmov b5, c14
|
|
+ MUL b4, a4, b5
|
|
+ fmov b5, t3
|
|
+
|
|
+ ADD c07, t4, b5
|
|
+ fmov b5, c07
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL b4, a3, b5
|
|
+ fmov b5, t4
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c11, t1, b5
|
|
+ fmov b5, c11
|
|
+ ADD c12, t2, b5
|
|
+ fmov b5, c12
|
|
+ ADD c16, t3, b5
|
|
+ fmov b5, c16
|
|
+ ADD c15, t4, b5
|
|
+ fmov b5, c15
|
|
+ .align 4
|
|
+
|
|
+$L18:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 4, TMP1
|
|
+#else
|
|
+ subl KK, 4, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -4 * SIZE(AO)
|
|
+ ldi BO, -4 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+ LD b4, 7 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c05, b5
|
|
+ fmov b5, c05
|
|
+ SUB a3, c09, b5
|
|
+ fmov b5, c09
|
|
+ SUB a4, c13, b5
|
|
+ fmov b5, c13
|
|
+
|
|
+ SUB b1, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB b2, c06, b5
|
|
+ fmov b5, c06
|
|
+ SUB b3, c10, b5
|
|
+ fmov b5, c10
|
|
+ SUB b4, c14, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ LD a1, 8 * SIZE(BO)
|
|
+ LD a2, 9 * SIZE(BO)
|
|
+ LD a3, 10 * SIZE(BO)
|
|
+ LD a4, 11 * SIZE(BO)
|
|
+
|
|
+ LD b1, 12 * SIZE(BO)
|
|
+ LD b2, 13 * SIZE(BO)
|
|
+ LD b3, 14 * SIZE(BO)
|
|
+ LD b4, 15 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c03, b5
|
|
+ fmov b5, c03
|
|
+ SUB a2, c07, b5
|
|
+ fmov b5, c07
|
|
+ SUB a3, c11, b5
|
|
+ fmov b5, c11
|
|
+ SUB a4, c15, b5
|
|
+ fmov b5, c15
|
|
+
|
|
+ SUB b1, c04, b5
|
|
+ fmov b5, c04
|
|
+ SUB b2, c08, b5
|
|
+ fmov b5, c08
|
|
+ SUB b3, c12, b5
|
|
+ fmov b5, c12
|
|
+ SUB b4, c16, b5
|
|
+ fmov b5, c16
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ LD b1, 4 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(AO)
|
|
+ LD b4, 7 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a3, c03, b5
|
|
+ fmov b5, c03
|
|
+ SUB a4, c04, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ SUB b1, c05, b5
|
|
+ fmov b5, c05
|
|
+ SUB b2, c06, b5
|
|
+ fmov b5, c06
|
|
+ SUB b3, c07, b5
|
|
+ fmov b5, c07
|
|
+ SUB b4, c08, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ LD a1, 8 * SIZE(AO)
|
|
+ LD a2, 9 * SIZE(AO)
|
|
+ LD a3, 10 * SIZE(AO)
|
|
+ LD a4, 11 * SIZE(AO)
|
|
+
|
|
+ LD b1, 12 * SIZE(AO)
|
|
+ LD b2, 13 * SIZE(AO)
|
|
+ LD b3, 14 * SIZE(AO)
|
|
+ LD b4, 15 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c09, b5
|
|
+ fmov b5, c09
|
|
+ SUB a2, c10, b5
|
|
+ fmov b5, c10
|
|
+ SUB a3, c11, b5
|
|
+ fmov b5, c11
|
|
+ SUB a4, c12, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ SUB b1, c13, b5
|
|
+ fmov b5, c13
|
|
+ SUB b2, c14, b5
|
|
+ fmov b5, c14
|
|
+ SUB b3, c15, b5
|
|
+ fmov b5, c15
|
|
+ SUB b4, c16, b5
|
|
+ fmov b5, c16
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 15 * SIZE(AO)
|
|
+ LD a2, 14 * SIZE(AO)
|
|
+ LD a3, 13 * SIZE(AO)
|
|
+ LD a4, 12 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c04, b5
|
|
+ fmov b5, c04
|
|
+ MUL a1, c08, b5
|
|
+ fmov b5, c08
|
|
+ MUL a1, c12, b5
|
|
+ fmov b5, c12
|
|
+ MUL a1, c16, b5
|
|
+ fmov b5, c16
|
|
+
|
|
+ MUL a2, c04, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c08, b5
|
|
+ fmov b5, t2
|
|
+ MUL a2, c12, b5
|
|
+ fmov b5, t3
|
|
+ MUL a2, c16, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ SUB c07, t2, b5
|
|
+ fmov b5, c07
|
|
+ SUB c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ SUB c15, t4, b5
|
|
+ fmov b5, c15
|
|
+
|
|
+ MUL a3, c04, b5
|
|
+ fmov b5, t1
|
|
+ MUL a3, c08, b5
|
|
+ fmov b5, t2
|
|
+ MUL a3, c12, b5
|
|
+ fmov b5, t3
|
|
+ MUL a3, c16, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c02, t1, b5
|
|
+ fmov b5, c02
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ SUB c10, t3, b5
|
|
+ fmov b5, c10
|
|
+ SUB c14, t4, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ MUL a4, c04, b5
|
|
+ fmov b5, t1
|
|
+ MUL a4, c08, b5
|
|
+ fmov b5, t2
|
|
+ MUL a4, c12, b5
|
|
+ fmov b5, t3
|
|
+ MUL a4, c16, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ SUB c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ SUB c13, t4, b5
|
|
+ fmov b5, c13
|
|
+
|
|
+ LD b1, 10 * SIZE(AO)
|
|
+ LD b2, 9 * SIZE(AO)
|
|
+ LD b3, 8 * SIZE(AO)
|
|
+
|
|
+ MUL b1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL b1, c07, b5
|
|
+ fmov b5, c07
|
|
+ MUL b1, c11, b5
|
|
+ fmov b5, c11
|
|
+ MUL b1, c15, b5
|
|
+ fmov b5, c15
|
|
+
|
|
+ MUL b2, c03, b5
|
|
+ fmov b5, t1
|
|
+ MUL b2, c07, b5
|
|
+ fmov b5, t2
|
|
+ MUL b2, c11, b5
|
|
+ fmov b5, t3
|
|
+ MUL b2, c15, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c02, t1, b5
|
|
+ fmov b5, c02
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ SUB c10, t3, b5
|
|
+ fmov b5, c10
|
|
+ SUB c14, t4, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ MUL b3, c03, b5
|
|
+ fmov b5, t1
|
|
+ MUL b3, c07, b5
|
|
+ fmov b5, t2
|
|
+ MUL b3, c11, b5
|
|
+ fmov b5, t3
|
|
+ MUL b3, c15, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ SUB c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ SUB c13, t4, b5
|
|
+ fmov b5, c13
|
|
+
|
|
+ LD a1, 5 * SIZE(AO)
|
|
+ LD a2, 4 * SIZE(AO)
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a1, c06, b5
|
|
+ fmov b5, c06
|
|
+ MUL a1, c10, b5
|
|
+ fmov b5, c10
|
|
+ MUL a1, c14, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ MUL a2, c02, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c06, b5
|
|
+ fmov b5, t2
|
|
+ MUL a2, c10, b5
|
|
+ fmov b5, t3
|
|
+ MUL a2, c14, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ SUB c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ SUB c13, t4, b5
|
|
+ fmov b5, c13
|
|
+
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a3, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a3, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a3, c13, b5
|
|
+ fmov b5, c13
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, c13, b5
|
|
+ fmov b5, c13
|
|
+
|
|
+ MUL a2, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c05, b5
|
|
+ fmov b5, t2
|
|
+ MUL a2, c09, b5
|
|
+ fmov b5, t3
|
|
+ MUL a2, c13, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c02, t1, b5
|
|
+ fmov b5, c02
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ SUB c10, t3, b5
|
|
+ fmov b5, c10
|
|
+ SUB c14, t4, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a3, c05, b5
|
|
+ fmov b5, t2
|
|
+ MUL a3, c09, b5
|
|
+ fmov b5, t3
|
|
+ MUL a3, c13, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ SUB c07, t2, b5
|
|
+ fmov b5, c07
|
|
+ SUB c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ SUB c15, t4, b5
|
|
+ fmov b5, c15
|
|
+
|
|
+ MUL a4, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a4, c05, b5
|
|
+ fmov b5, t2
|
|
+ MUL a4, c09, b5
|
|
+ fmov b5, t3
|
|
+ MUL a4, c13, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c04, t1, b5
|
|
+ fmov b5, c04
|
|
+ SUB c08, t2, b5
|
|
+ fmov b5, c08
|
|
+ SUB c12, t3, b5
|
|
+ fmov b5, c12
|
|
+ SUB c16, t4, b5
|
|
+ fmov b5, c16
|
|
+
|
|
+ LD b1, 5 * SIZE(AO)
|
|
+ LD b2, 6 * SIZE(AO)
|
|
+ LD b3, 7 * SIZE(AO)
|
|
+
|
|
+ MUL b1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL b1, c06, b5
|
|
+ fmov b5, c06
|
|
+ MUL b1, c10, b5
|
|
+ fmov b5, c10
|
|
+ MUL b1, c14, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ MUL b2, c02, b5
|
|
+ fmov b5, t1
|
|
+ MUL b2, c06, b5
|
|
+ fmov b5, t2
|
|
+ MUL b2, c10, b5
|
|
+ fmov b5, t3
|
|
+ MUL b2, c14, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ SUB c07, t2, b5
|
|
+ fmov b5, c07
|
|
+ SUB c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ SUB c15, t4, b5
|
|
+ fmov b5, c15
|
|
+
|
|
+ MUL b3, c02, b5
|
|
+ fmov b5, t1
|
|
+ MUL b3, c06, b5
|
|
+ fmov b5, t2
|
|
+ MUL b3, c10, b5
|
|
+ fmov b5, t3
|
|
+ MUL b3, c14, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c04, t1, b5
|
|
+ fmov b5, c04
|
|
+ SUB c08, t2, b5
|
|
+ fmov b5, c08
|
|
+ SUB c12, t3, b5
|
|
+ fmov b5, c12
|
|
+ SUB c16, t4, b5
|
|
+ fmov b5, c16
|
|
+
|
|
+ LD a1, 10 * SIZE(AO)
|
|
+ LD a2, 11 * SIZE(AO)
|
|
+ LD a3, 15 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a1, c07, b5
|
|
+ fmov b5, c07
|
|
+ MUL a1, c11, b5
|
|
+ fmov b5, c11
|
|
+ MUL a1, c15, b5
|
|
+ fmov b5, c15
|
|
+
|
|
+ MUL a2, c03, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c07, b5
|
|
+ fmov b5, t2
|
|
+ MUL a2, c11, b5
|
|
+ fmov b5, t3
|
|
+ MUL a2, c15, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c04, t1, b5
|
|
+ fmov b5, c04
|
|
+ SUB c08, t2, b5
|
|
+ fmov b5, c08
|
|
+ SUB c12, t3, b5
|
|
+ fmov b5, c12
|
|
+ SUB c16, t4, b5
|
|
+ fmov b5, c16
|
|
+
|
|
+ MUL a3, c04, b5
|
|
+ fmov b5, c04
|
|
+ MUL a3, c08, b5
|
|
+ fmov b5, c08
|
|
+ MUL a3, c12, b5
|
|
+ fmov b5, c12
|
|
+ MUL a3, c16, b5
|
|
+ fmov b5, c16
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a1, c04, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ MUL a2, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c02, b5
|
|
+ fmov b5, t2
|
|
+ MUL a2, c03, b5
|
|
+ fmov b5, t3
|
|
+ MUL a2, c04, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ SUB c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ SUB c08, t4, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a3, c02, b5
|
|
+ fmov b5, t2
|
|
+ MUL a3, c03, b5
|
|
+ fmov b5, t3
|
|
+ MUL a3, c04, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ SUB c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ SUB c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ SUB c12, t4, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ MUL a4, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a4, c02, b5
|
|
+ fmov b5, t2
|
|
+ MUL a4, c03, b5
|
|
+ fmov b5, t3
|
|
+ MUL a4, c04, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c13, t1, b5
|
|
+ fmov b5, c13
|
|
+ SUB c14, t2, b5
|
|
+ fmov b5, c14
|
|
+ SUB c15, t3, b5
|
|
+ fmov b5, c15
|
|
+ SUB c16, t4, b5
|
|
+ fmov b5, c16
|
|
+
|
|
+ LD b1, 5 * SIZE(BO)
|
|
+ LD b2, 6 * SIZE(BO)
|
|
+ LD b3, 7 * SIZE(BO)
|
|
+
|
|
+ MUL b1, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL b1, c06, b5
|
|
+ fmov b5, c06
|
|
+ MUL b1, c07, b5
|
|
+ fmov b5, c07
|
|
+ MUL b1, c08, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ MUL b2, c05, b5
|
|
+ fmov b5, t1
|
|
+ MUL b2, c06, b5
|
|
+ fmov b5, t2
|
|
+ MUL b2, c07, b5
|
|
+ fmov b5, t3
|
|
+ MUL b2, c08, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ SUB c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ SUB c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ SUB c12, t4, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ MUL b3, c05, b5
|
|
+ fmov b5, t1
|
|
+ MUL b3, c06, b5
|
|
+ fmov b5, t2
|
|
+ MUL b3, c07, b5
|
|
+ fmov b5, t3
|
|
+ MUL b3, c08, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c13, t1, b5
|
|
+ fmov b5, c13
|
|
+ SUB c14, t2, b5
|
|
+ fmov b5, c14
|
|
+ SUB c15, t3, b5
|
|
+ fmov b5, c15
|
|
+ SUB c16, t4, b5
|
|
+ fmov b5, c16
|
|
+
|
|
+ LD a1, 10 * SIZE(BO)
|
|
+ LD a2, 11 * SIZE(BO)
|
|
+ LD a3, 15 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, c10, b5
|
|
+ fmov b5, c10
|
|
+ MUL a1, c11, b5
|
|
+ fmov b5, c11
|
|
+ MUL a1, c12, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ MUL a2, c09, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c10, b5
|
|
+ fmov b5, t2
|
|
+ MUL a2, c11, b5
|
|
+ fmov b5, t3
|
|
+ MUL a2, c12, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c13, t1, b5
|
|
+ fmov b5, c13
|
|
+ SUB c14, t2, b5
|
|
+ fmov b5, c14
|
|
+ SUB c15, t3, b5
|
|
+ fmov b5, c15
|
|
+ SUB c16, t4, b5
|
|
+ fmov b5, c16
|
|
+
|
|
+ MUL a3, c13, b5
|
|
+ fmov b5, c13
|
|
+ MUL a3, c14, b5
|
|
+ fmov b5, c14
|
|
+ MUL a3, c15, b5
|
|
+ fmov b5, c15
|
|
+ MUL a3, c16, b5
|
|
+ fmov b5, c16
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 15 * SIZE(BO)
|
|
+ LD a2, 14 * SIZE(BO)
|
|
+ LD a3, 13 * SIZE(BO)
|
|
+ LD a4, 12 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c13, b5
|
|
+ fmov b5, c13
|
|
+ MUL a1, c14, b5
|
|
+ fmov b5, c14
|
|
+ MUL a1, c15, b5
|
|
+ fmov b5, c15
|
|
+ MUL a1, c16, b5
|
|
+ fmov b5, c16
|
|
+
|
|
+ MUL a2, c13, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c14, b5
|
|
+ fmov b5, t2
|
|
+ MUL a2, c15, b5
|
|
+ fmov b5, t3
|
|
+ MUL a2, c16, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ SUB c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ SUB c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ SUB c12, t4, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ MUL a3, c13, b5
|
|
+ fmov b5, t1
|
|
+ MUL a3, c14, b5
|
|
+ fmov b5, t2
|
|
+ MUL a3, c15, b5
|
|
+ fmov b5, t3
|
|
+ MUL a3, c16, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ SUB c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ SUB c08, t4, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ MUL a4, c13, b5
|
|
+ fmov b5, t1
|
|
+ MUL a4, c14, b5
|
|
+ fmov b5, t2
|
|
+ MUL a4, c15, b5
|
|
+ fmov b5, t3
|
|
+ MUL a4, c16, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ SUB c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ SUB c04, t4, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ LD b1, 10 * SIZE(BO)
|
|
+ LD b2, 9 * SIZE(BO)
|
|
+ LD b3, 8 * SIZE(BO)
|
|
+
|
|
+ MUL b1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL b1, c10, b5
|
|
+ fmov b5, c10
|
|
+ MUL b1, c11, b5
|
|
+ fmov b5, c11
|
|
+ MUL b1, c12, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ MUL b2, c09, b5
|
|
+ fmov b5, t1
|
|
+ MUL b2, c10, b5
|
|
+ fmov b5, t2
|
|
+ MUL b2, c11, b5
|
|
+ fmov b5, t3
|
|
+ MUL b2, c12, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ SUB c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ SUB c08, t4, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ MUL b3, c09, b5
|
|
+ fmov b5, t1
|
|
+ MUL b3, c10, b5
|
|
+ fmov b5, t2
|
|
+ MUL b3, c11, b5
|
|
+ fmov b5, t3
|
|
+ MUL b3, c12, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ SUB c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ SUB c04, t4, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ LD a1, 5 * SIZE(BO)
|
|
+ LD a2, 4 * SIZE(BO)
|
|
+ LD a3, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, c06, b5
|
|
+ fmov b5, c06
|
|
+ MUL a1, c07, b5
|
|
+ fmov b5, c07
|
|
+ MUL a1, c08, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ MUL a2, c05, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c06, b5
|
|
+ fmov b5, t2
|
|
+ MUL a2, c07, b5
|
|
+ fmov b5, t3
|
|
+ MUL a2, c08, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ SUB c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ SUB c04, t4, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a3, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a3, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a3, c04, b5
|
|
+ fmov b5, c04
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c05, 1 * SIZE(BO)
|
|
+ ST c09, 2 * SIZE(BO)
|
|
+ ST c13, 3 * SIZE(BO)
|
|
+
|
|
+ ST c02, 4 * SIZE(BO)
|
|
+ ST c06, 5 * SIZE(BO)
|
|
+ ST c10, 6 * SIZE(BO)
|
|
+ ST c14, 7 * SIZE(BO)
|
|
+
|
|
+ ST c03, 8 * SIZE(BO)
|
|
+ ST c07, 9 * SIZE(BO)
|
|
+ ST c11, 10 * SIZE(BO)
|
|
+ ST c15, 11 * SIZE(BO)
|
|
+
|
|
+ ST c04, 12 * SIZE(BO)
|
|
+ ST c08, 13 * SIZE(BO)
|
|
+ ST c12, 14 * SIZE(BO)
|
|
+ ST c16, 15 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c03, 2 * SIZE(AO)
|
|
+ ST c04, 3 * SIZE(AO)
|
|
+
|
|
+ ST c05, 4 * SIZE(AO)
|
|
+ ST c06, 5 * SIZE(AO)
|
|
+ ST c07, 6 * SIZE(AO)
|
|
+ ST c08, 7 * SIZE(AO)
|
|
+
|
|
+ ST c09, 8 * SIZE(AO)
|
|
+ ST c10, 9 * SIZE(AO)
|
|
+ ST c11, 10 * SIZE(AO)
|
|
+ ST c12, 11 * SIZE(AO)
|
|
+
|
|
+ ST c13, 12 * SIZE(AO)
|
|
+ ST c14, 13 * SIZE(AO)
|
|
+ ST c15, 14 * SIZE(AO)
|
|
+ ST c16, 15 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -4 * SIZE(C1)
|
|
+ ldi C2, -4 * SIZE(C2)
|
|
+ ldi C3, -4 * SIZE(C3)
|
|
+ ldi C4, -4 * SIZE(C4)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c03, 2 * SIZE(C1)
|
|
+ ST c04, 3 * SIZE(C1)
|
|
+
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+ ST c06, 1 * SIZE(C2)
|
|
+ ST c07, 2 * SIZE(C2)
|
|
+ ST c08, 3 * SIZE(C2)
|
|
+
|
|
+ ST c09, 0 * SIZE(C3)
|
|
+ ST c10, 1 * SIZE(C3)
|
|
+ ST c11, 2 * SIZE(C3)
|
|
+ ST c12, 3 * SIZE(C3)
|
|
+
|
|
+ ST c13, 0 * SIZE(C4)
|
|
+ ST c14, 1 * SIZE(C4)
|
|
+ ST c15, 2 * SIZE(C4)
|
|
+ ST c16, 3 * SIZE(C4)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+ ldi C2, 4 * SIZE(C2)
|
|
+ ldi C3, 4 * SIZE(C3)
|
|
+ ldi C4, 4 * SIZE(C4)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 2 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ addl BO, TMP1, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ bgt I, $L11
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ and M, 2, I
|
|
+ ble I, $L30
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c10
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c14
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ ldi L, -2(KK)
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c05
|
|
+
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+ fclr c02
|
|
+ fclr c06
|
|
+ ble KK, $L28
|
|
+
|
|
+ ble L, $L25
|
|
+
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 1, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 2, TMP2
|
|
+ addl B, TMP2, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c10
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c14
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ ldi L, -2(TMP1)
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c05
|
|
+
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ fclr c02
|
|
+ fclr c06
|
|
+ ble TMP1, $L28
|
|
+
|
|
+ ble L, $L25
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ ADD c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t3, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL a1, b2, t3
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+
|
|
+ ADD c14, t4, b5
|
|
+ fmov b5, c14
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD b2, -7 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ unop
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ LD b3, -6 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL a1, b4, t3
|
|
+ LD a1, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ MUL a2, b4, t4
|
|
+ LD b5, -5 * SIZE(BO)
|
|
+ FIMOVD b5, tmp
|
|
+
|
|
+ ADD c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ unop
|
|
+ MUL a3, b1, t1
|
|
+ LD a2, 3 * SIZE(AO)
|
|
+
|
|
+ ADD c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL a4, b1, t2
|
|
+ LD b1, -4 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t3, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c14, t4, b5
|
|
+ fmov b5, c14
|
|
+ MUL a4, b2, t4
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ldi L, -2(L)
|
|
+ MUL a3, b3, t1
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a4, b3, t2
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ IFMOVD tmp, b5
|
|
+ MUL a3, b5, t3
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ IFMOVD tmp, b5
|
|
+ MUL a4, b5, t4
|
|
+ LD a4, 1 * SIZE(AO)
|
|
+ bgt L, $L22
|
|
+ .align 4
|
|
+
|
|
+
|
|
+$L25:
|
|
+ ADD c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L27
|
|
+#else
|
|
+ blbs TMP1, $L27
|
|
+#endif
|
|
+
|
|
+ ADD c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL a2, b1, b5
|
|
+ fmov b5, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t3, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t3
|
|
+ unop
|
|
+
|
|
+ ADD c14, t4, b5
|
|
+ fmov b5, c14
|
|
+ unop
|
|
+ MUL a2, b2, b5
|
|
+ fmov b5, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b3, b5
|
|
+ fmov b5, t1
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b3, b5
|
|
+ fmov b5, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL a1, b4, b5
|
|
+ fmov b5, t3
|
|
+ LD a1, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL a2, b4, b5
|
|
+ fmov b5, t4
|
|
+ LD a2, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L27:
|
|
+ ADD c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ MUL a2, b1, b5
|
|
+ fmov b5, t2
|
|
+ ADD c13, t3, b5
|
|
+ fmov b5, c13
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t3
|
|
+
|
|
+ ADD c14, t4, b5
|
|
+ fmov b5, c14
|
|
+ MUL a2, b2, b5
|
|
+ fmov b5, t4
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b3, b5
|
|
+ fmov b5, t1
|
|
+
|
|
+ ADD c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, b3, b5
|
|
+ fmov b5, t2
|
|
+ ADD c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, b4, b5
|
|
+ fmov b5, t3
|
|
+
|
|
+ ADD c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a2, b4, b5
|
|
+ fmov b5, t4
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ ADD c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ ADD c13, t3, b5
|
|
+ fmov b5, c13
|
|
+ ADD c14, t4, b5
|
|
+ fmov b5, c14
|
|
+ .align 4
|
|
+
|
|
+$L28:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 2, TMP1
|
|
+#else
|
|
+ subl KK, 4, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -2 * SIZE(AO)
|
|
+ ldi BO, -4 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+ LD b4, 7 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c05, b5
|
|
+ fmov b5, c05
|
|
+ SUB a3, c09, b5
|
|
+ fmov b5, c09
|
|
+ SUB a4, c13, b5
|
|
+ fmov b5, c13
|
|
+
|
|
+ SUB b1, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB b2, c06, b5
|
|
+ fmov b5, c06
|
|
+ SUB b3, c10, b5
|
|
+ fmov b5, c10
|
|
+ SUB b4, c14, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ LD b1, 4 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(AO)
|
|
+ LD b4, 7 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a3, c05, b5
|
|
+ fmov b5, c05
|
|
+ SUB a4, c06, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ SUB b1, c09, b5
|
|
+ fmov b5, c09
|
|
+ SUB b2, c10, b5
|
|
+ fmov b5, c10
|
|
+ SUB b3, c13, b5
|
|
+ fmov b5, c13
|
|
+ SUB b4, c14, b5
|
|
+ fmov b5, c14
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 3 * SIZE(AO)
|
|
+ LD a2, 2 * SIZE(AO)
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a1, c06, b5
|
|
+ fmov b5, c06
|
|
+ MUL a1, c10, b5
|
|
+ fmov b5, c10
|
|
+ MUL a1, c14, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ MUL a2, c02, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c06, b5
|
|
+ fmov b5, t2
|
|
+ MUL a2, c10, b5
|
|
+ fmov b5, t3
|
|
+ MUL a2, c14, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ SUB c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ SUB c13, t4, b5
|
|
+ fmov b5, c13
|
|
+
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a3, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a3, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a3, c13, b5
|
|
+ fmov b5, c13
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, c13, b5
|
|
+ fmov b5, c13
|
|
+
|
|
+ MUL a2, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c05, b5
|
|
+ fmov b5, t2
|
|
+ MUL a2, c09, b5
|
|
+ fmov b5, t3
|
|
+ MUL a2, c13, b5
|
|
+ fmov b5, t4
|
|
+
|
|
+ SUB c02, t1, b5
|
|
+ fmov b5, c02
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ SUB c10, t3, b5
|
|
+ fmov b5, c10
|
|
+ SUB c14, t4, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ MUL a3, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a3, c06, b5
|
|
+ fmov b5, c06
|
|
+ MUL a3, c10, b5
|
|
+ fmov b5, c10
|
|
+ MUL a3, c14, b5
|
|
+ fmov b5, c14
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ MUL a2, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c02, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a3, c02, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ SUB c10, t2, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ MUL a4, c01, b5
|
|
+ fmov b5, t1
|
|
+ MUL a4, c02, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c13, t1, b5
|
|
+ fmov b5, c13
|
|
+ SUB c14, t2, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ LD b1, 5 * SIZE(BO)
|
|
+ LD b2, 6 * SIZE(BO)
|
|
+ LD b3, 7 * SIZE(BO)
|
|
+
|
|
+ MUL b1, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL b1, c06, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL b2, c05, b5
|
|
+ fmov b5, t1
|
|
+ MUL b2, c06, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ SUB c10, t2, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ MUL b3, c05, b5
|
|
+ fmov b5, t1
|
|
+ MUL b3, c06, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c13, t1, b5
|
|
+ fmov b5, c13
|
|
+ SUB c14, t2, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ LD a1, 10 * SIZE(BO)
|
|
+ LD a2, 11 * SIZE(BO)
|
|
+ LD a3, 15 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, c10, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ MUL a2, c09, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c10, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c13, t1, b5
|
|
+ fmov b5, c13
|
|
+ SUB c14, t2, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ MUL a3, c13, b5
|
|
+ fmov b5, c13
|
|
+ MUL a3, c14, b5
|
|
+ fmov b5, c14
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 15 * SIZE(BO)
|
|
+ LD a2, 14 * SIZE(BO)
|
|
+ LD a3, 13 * SIZE(BO)
|
|
+ LD a4, 12 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c13, b5
|
|
+ fmov b5, c13
|
|
+ MUL a1, c14, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ MUL a2, c13, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c14, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ SUB c10, t2, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ MUL a3, c13, b5
|
|
+ fmov b5, t1
|
|
+ MUL a3, c14, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL a4, c13, b5
|
|
+ fmov b5, t1
|
|
+ MUL a4, c14, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c02, t2, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ LD b1, 10 * SIZE(BO)
|
|
+ LD b2, 9 * SIZE(BO)
|
|
+ LD b3, 8 * SIZE(BO)
|
|
+
|
|
+ MUL b1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL b1, c10, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ MUL b2, c09, b5
|
|
+ fmov b5, t1
|
|
+ MUL b2, c10, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ SUB c06, t2, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL b3, c09, b5
|
|
+ fmov b5, t1
|
|
+ MUL b3, c10, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c02, t2, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ LD a1, 5 * SIZE(BO)
|
|
+ LD a2, 4 * SIZE(BO)
|
|
+ LD a3, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, c06, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ MUL a2, c05, b5
|
|
+ fmov b5, t1
|
|
+ MUL a2, c06, b5
|
|
+ fmov b5, t2
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c02, t2, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a3, c02, b5
|
|
+ fmov b5, c02
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c05, 1 * SIZE(BO)
|
|
+ ST c09, 2 * SIZE(BO)
|
|
+ ST c13, 3 * SIZE(BO)
|
|
+
|
|
+ ST c02, 4 * SIZE(BO)
|
|
+ ST c06, 5 * SIZE(BO)
|
|
+ ST c10, 6 * SIZE(BO)
|
|
+ ST c14, 7 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c05, 2 * SIZE(AO)
|
|
+ ST c06, 3 * SIZE(AO)
|
|
+
|
|
+ ST c09, 4 * SIZE(AO)
|
|
+ ST c10, 5 * SIZE(AO)
|
|
+ ST c13, 6 * SIZE(AO)
|
|
+ ST c14, 7 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -2 * SIZE(C1)
|
|
+ ldi C2, -2 * SIZE(C2)
|
|
+ ldi C3, -2 * SIZE(C3)
|
|
+ ldi C4, -2 * SIZE(C4)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+ ST c06, 1 * SIZE(C2)
|
|
+
|
|
+ ST c09, 0 * SIZE(C3)
|
|
+ ST c10, 1 * SIZE(C3)
|
|
+ ST c13, 0 * SIZE(C4)
|
|
+ ST c14, 1 * SIZE(C4)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 2 * SIZE(C1)
|
|
+ ldi C2, 2 * SIZE(C2)
|
|
+ ldi C3, 2 * SIZE(C3)
|
|
+ ldi C4, 2 * SIZE(C4)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 1 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L30:
|
|
+ and M, 1, I
|
|
+ ble I, $L39
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ ldi L, -2(KK)
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c09
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c13
|
|
+
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+ ble KK, $L38
|
|
+
|
|
+ ble L, $L35
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 0, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 2, TMP2
|
|
+ addl B, TMP2, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ ldi L, -2(TMP1)
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c09
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c13
|
|
+
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ ble TMP1, $L38
|
|
+
|
|
+ ble L, $L35
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L32:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ldi L, -2(L)
|
|
+ MUL a1, b1, t1
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a1, b2, t2
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ LD b5, 3 * SIZE(BO)
|
|
+ FIMOVD b5, tmp
|
|
+ MUL a1, b3, t3
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ MUL a1, b4, t4
|
|
+ LD a1, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a2, b1, t1
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ MUL a2, b2, t2
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+ MUL a2, b3, t3
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ IFMOVD tmp, b5
|
|
+ MUL a2, b5, t4
|
|
+ LD a2, 0 * SIZE(AO)
|
|
+ bgt L, $L32
|
|
+ .align 4
|
|
+
|
|
+$L35:
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L37
|
|
+#else
|
|
+ blbs TMP1, $L37
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t2
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, b3, b5
|
|
+ fmov b5, t3
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ MUL a1, b4, b5
|
|
+ fmov b5, t4
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL a1, b1, b5
|
|
+ fmov b5, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L37:
|
|
+ ADD c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, b2, b5
|
|
+ fmov b5, t2
|
|
+ ADD c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, b3, b5
|
|
+ fmov b5, t3
|
|
+
|
|
+ ADD c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ MUL a1, b4, b5
|
|
+ fmov b5, t4
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD c05, t2, b5
|
|
+ fmov b5, c05
|
|
+ ADD c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ ADD c13, t4, b5
|
|
+ fmov b5, c13
|
|
+
|
|
+$L38:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 1, TMP1
|
|
+#else
|
|
+ subl KK, 4, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -1 * SIZE(AO)
|
|
+ ldi BO, -4 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c05, b5
|
|
+ fmov b5, c05
|
|
+ SUB a3, c09, b5
|
|
+ fmov b5, c09
|
|
+ SUB a4, c13, b5
|
|
+ fmov b5, c13
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c05, b5
|
|
+ fmov b5, c05
|
|
+ SUB a3, c09, b5
|
|
+ fmov b5, c09
|
|
+ SUB a4, c13, b5
|
|
+ fmov b5, c13
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, c13, b5
|
|
+ fmov b5, c13
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a2, c01, b5
|
|
+ fmov b5, t1
|
|
+ SUB c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, t1
|
|
+ SUB c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ MUL a4, c01, b5
|
|
+ fmov b5, t1
|
|
+ SUB c13, t1, b5
|
|
+ fmov b5, c13
|
|
+
|
|
+ LD b1, 5 * SIZE(BO)
|
|
+ LD b2, 6 * SIZE(BO)
|
|
+ LD b3, 7 * SIZE(BO)
|
|
+
|
|
+ MUL b1, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL b2, c05, b5
|
|
+ fmov b5, t1
|
|
+ SUB c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ MUL b3, c05, b5
|
|
+ fmov b5, t1
|
|
+ SUB c13, t1, b5
|
|
+ fmov b5, c13
|
|
+
|
|
+ LD a1, 10 * SIZE(BO)
|
|
+ LD a2, 11 * SIZE(BO)
|
|
+ LD a3, 15 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a2, c09, b5
|
|
+ fmov b5, t1
|
|
+ SUB c13, t1, b5
|
|
+ fmov b5, c13
|
|
+ MUL a3, c13, b5
|
|
+ fmov b5, c13
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 15 * SIZE(BO)
|
|
+ LD a2, 14 * SIZE(BO)
|
|
+ LD a3, 13 * SIZE(BO)
|
|
+ LD a4, 12 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c13, b5
|
|
+ fmov b5, c13
|
|
+ MUL a2, c13, b5
|
|
+ fmov b5, t1
|
|
+ SUB c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ MUL a3, c13, b5
|
|
+ fmov b5, t1
|
|
+ SUB c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ MUL a4, c13, b5
|
|
+ fmov b5, t1
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+
|
|
+ LD b1, 10 * SIZE(BO)
|
|
+ LD b2, 9 * SIZE(BO)
|
|
+ LD b3, 8 * SIZE(BO)
|
|
+
|
|
+ MUL b1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL b2, c09, b5
|
|
+ fmov b5, t1
|
|
+ SUB c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ MUL b3, c09, b5
|
|
+ fmov b5, t1
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+
|
|
+ LD a1, 5 * SIZE(BO)
|
|
+ LD a2, 4 * SIZE(BO)
|
|
+ LD a3, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c05, b5
|
|
+ fmov b5, c05
|
|
+ MUL a2, c05, b5
|
|
+ fmov b5, t1
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a3, c01, b5
|
|
+ fmov b5, c01
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c05, 1 * SIZE(BO)
|
|
+ ST c09, 2 * SIZE(BO)
|
|
+ ST c13, 3 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c05, 1 * SIZE(AO)
|
|
+ ST c09, 2 * SIZE(AO)
|
|
+ ST c13, 3 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -1 * SIZE(C1)
|
|
+ ldi C2, -1 * SIZE(C2)
|
|
+ ldi C3, -1 * SIZE(C3)
|
|
+ ldi C4, -1 * SIZE(C4)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+ ST c09, 0 * SIZE(C3)
|
|
+ ST c13, 0 * SIZE(C4)
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 0 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L39:
|
|
+#ifdef LN
|
|
+ sll K, 2 + BASE_SHIFT, TMP1
|
|
+ addl B, TMP1, B
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ mov BO, B
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ addl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ subl KK, 4, KK
|
|
+#endif
|
|
+ ldi J, -1(J)
|
|
+ bgt J, $L01
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 8($sp)
|
|
+ fldd $f4, 16($sp)
|
|
+ fldd $f5, 24($sp)
|
|
+ fldd $f6, 32($sp)
|
|
+ fldd $f7, 40($sp)
|
|
+ fldd $f8, 48($sp)
|
|
+ fldd $f9, 56($sp)
|
|
+ ldl $9, 64($sp)
|
|
+ clr $0
|
|
+ ldi $sp, STACKSIZE($sp)
|
|
+ ret
|
|
+ EPILOGUE
diff --git a/kernel/sw_64/trsm_kernel_4x4_RT.S.bak b/kernel/sw_64/trsm_kernel_4x4_RT.S.bak
new file mode 100644
index 0000000..af57279
--- /dev/null
+++ b/kernel/sw_64/trsm_kernel_4x4_RT.S.bak
@@ -0,0 +1,4072 @@
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+
|
|
+#if !defined(SW2B)
|
|
+#error "Architecture is not specified."
|
|
+#endif
|
|
+
|
|
+#ifdef SW2B
|
|
+#define PREFETCHSIZE 56
|
|
+#define UNOP nop
|
|
+#endif
|
|
+
|
|
+#ifdef EV6
|
|
+#define PREFETCHSIZE 56
|
|
+#define UNOP unop
|
|
+#endif
|
|
+
|
|
+#ifdef EV5
|
|
+#define PREFETCHSIZE 56
|
|
+#define UNOP
|
|
+#endif
|
|
+
|
|
+#ifdef EV4
|
|
+#define UNOP
|
|
+#endif
|
|
+
|
|
+#define STACKSIZE 80
|
|
+
|
|
+#define M $16
|
|
+#define N $17
|
|
+#define K $18
|
|
+#define A $20
|
|
+#define B $21
|
|
+#define C $22
|
|
+#define LDC $23
|
|
+
|
|
+#define C1 $19
|
|
+#define C2 $24
|
|
+#define C3 $25
|
|
+#define C4 $27
|
|
+
|
|
+#define AO $at
|
|
+#define BO $5
|
|
+#define I $6
|
|
+#define J $7
|
|
+#define L $8
|
|
+
|
|
+#define a1 $f16
|
|
+#define a2 $f17
|
|
+#define a3 $f18
|
|
+#define a4 $f19
|
|
+
|
|
+#define b1 $f20
|
|
+#define b2 $f21
|
|
+#define b3 $f22
|
|
+#define b4 $f23
|
|
+
|
|
+#define t1 $f24
|
|
+#define t2 $f25
|
|
+#define t3 $f26
|
|
+#define t4 $f27
|
|
+
|
|
+#define a5 $f28
|
|
+#define a6 $f30
|
|
+#define b5 $f29
|
|
+
|
|
+#define alpha $f30
|
|
+
|
|
+#define c01 $f0
|
|
+#define c02 $f1
|
|
+#define c03 $f2
|
|
+#define c04 $f3
|
|
+
|
|
+#define c05 $f4
|
|
+#define c06 $f5
|
|
+#define c07 $f6
|
|
+#define c08 $f7
|
|
+
|
|
+#define c09 $f8
|
|
+#define c10 $f9
|
|
+#define c11 $f10
|
|
+#define c12 $f11
|
|
+
|
|
+#define c13 $f12
|
|
+#define c14 $f13
|
|
+#define c15 $f14
|
|
+#define c16 $f15
|
|
+
|
|
+#define TMP1 $0
|
|
+#define TMP2 $1
|
|
+#define KK $2
|
|
+#define AORIG $3
|
|
+#define OFFSET $4
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+ .frame $sp, STACKSIZE, $26, 0
|
|
+
|
|
+ ldi $sp, -STACKSIZE($sp)
|
|
+
|
|
+ ldl C, 0 + STACKSIZE($sp)
|
|
+ ldl LDC, 8 + STACKSIZE($sp)
|
|
+ ldl OFFSET, 16 + STACKSIZE($sp)
|
|
+
|
|
+ SXADDQ LDC, 0, LDC
|
|
+
|
|
+ fstd $f2, 0($sp)
|
|
+ fstd $f3, 8($sp)
|
|
+ fstd $f4, 16($sp)
|
|
+ fstd $f5, 24($sp)
|
|
+ fstd $f6, 32($sp)
|
|
+ fstd $f7, 40($sp)
|
|
+ fstd $f8, 48($sp)
|
|
+ fstd $f9, 56($sp)
|
|
+
|
|
+ cmple M, 0, $0
|
|
+ cmple N, 0, $1
|
|
+ cmple K, 0, $2
|
|
+
|
|
+ or $0, $1, $0
|
|
+ or $0, $2, $0
|
|
+ bne $0, $L999
|
|
+
|
|
+#ifdef LN
|
|
+ mull M, K, TMP1
|
|
+ SXADDQ TMP1, A, A
|
|
+ SXADDQ M, C, C
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ negq OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ mull N, K, TMP1
|
|
+ SXADDQ TMP1, B, B
|
|
+
|
|
+ mull N, LDC, TMP1
|
|
+ addl TMP1, C, C
|
|
+
|
|
+ subl N, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+ and N, 1, J
|
|
+ ble J, $L40
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, BASE_SHIFT, TMP1
|
|
+ subl B, TMP1, B
|
|
+
|
|
+ subl C, LDC, C
|
|
+#endif
|
|
+
|
|
+ mov C, C1
|
|
+#ifndef RT
|
|
+ addl C, LDC, C
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ addl M, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+ mov A, AORIG
|
|
+#else
|
|
+ mov A, AO
|
|
+#endif
|
|
+
|
|
+ sra M, 2, I
|
|
+ ble I, $L100
|
|
+ .align 4
|
|
+
|
|
+$L91:
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c04
|
|
+
|
|
+ sra KK, 2, L
|
|
+ mov B, BO
|
|
+ ble L, $L95
|
|
+
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 2, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 2, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c04
|
|
+
|
|
+ sra TMP1, 2, L
|
|
+ unop
|
|
+ ble L, $L95
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L92:
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ ldi L, -1(L)
|
|
+ MUL a2, b1, t2
|
|
+ LD a2, 5 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ unop
|
|
+ MUL a3, b1, t3
|
|
+ LD a3, 6 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a4, b1, t4
|
|
+ LD a4, 7 * SIZE(AO)
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b2, t1
|
|
+ LD a1, 8 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b2, t2
|
|
+ LD a2, 9 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ LD a3, 10 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a4, b2, t4
|
|
+ LD a4, 11 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ LD a1, 12 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ LD a2, 13 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ unop
|
|
+ MUL a3, b3, t3
|
|
+ LD a3, 14 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a4, b3, t4
|
|
+ LD a5, 15 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ MUL a1, b4, t1
|
|
+ LD a1, 16 * SIZE(AO)
|
|
+ ldi AO, 16 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ MUL a2, b4, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ MUL a3, b4, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a5, b4, t4
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ bgt L, $L92
|
|
+ .align 4
|
|
+
|
|
+$L95:
|
|
+#if defined(LT) || defined(RN)
|
|
+ and KK, 3, L
|
|
+#else
|
|
+ and TMP1, 3, L
|
|
+#endif
|
|
+ unop
|
|
+ ble L, $L98
|
|
+ .align 4
|
|
+
|
|
+$L96:
|
|
+ ADD c01, t1, c01
|
|
+ ldi L, -1(L)
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ ldi BO, 1 * SIZE(BO)
|
|
+ MUL a2, b1, t2
|
|
+ LD a2, 5 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ unop
|
|
+ MUL a3, b1, t3
|
|
+ LD a3, 6 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a4, b1, t4
|
|
+ LD a4, 7 * SIZE(AO)
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ bgt L, $L96
|
|
+ .align 4
|
|
+
|
|
+$L98:
|
|
+ ADD c01, t1, c01
|
|
+ ADD c02, t2, c02
|
|
+ ADD c03, t3, c03
|
|
+ ADD c04, t4, c04
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 4, TMP1
|
|
+#else
|
|
+ subl KK, 1, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+ SUB a3, c03, c03
|
|
+ SUB a4, c04, c04
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+ SUB a3, c03, c03
|
|
+ SUB a4, c04, c04
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 15 * SIZE(AO)
|
|
+ LD a2, 14 * SIZE(AO)
|
|
+ LD a3, 13 * SIZE(AO)
|
|
+ LD a4, 12 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c04, c04
|
|
+ MUL a2, c04, t1
|
|
+ SUB c03, t1, c03
|
|
+ MUL a3, c04, t1
|
|
+ SUB c02, t1, c02
|
|
+ MUL a4, c04, t1
|
|
+ SUB c01, t1, c01
|
|
+
|
|
+ LD b1, 10 * SIZE(AO)
|
|
+ LD b2, 9 * SIZE(AO)
|
|
+ LD b3, 8 * SIZE(AO)
|
|
+
|
|
+ MUL b1, c03, c03
|
|
+ MUL b2, c03, t1
|
|
+ SUB c02, t1, c02
|
|
+ MUL b3, c03, t1
|
|
+ SUB c01, t1, c01
|
|
+
|
|
+ LD a1, 5 * SIZE(AO)
|
|
+ LD a2, 4 * SIZE(AO)
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c02, c02
|
|
+ MUL a2, c02, t1
|
|
+ SUB c01, t1, c01
|
|
+ MUL a3, c01, c01
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a2, c01, t1
|
|
+ SUB c02, t1, c02
|
|
+ MUL a3, c01, t1
|
|
+ SUB c03, t1, c03
|
|
+ MUL a4, c01, t1
|
|
+ SUB c04, t1, c04
|
|
+
|
|
+ LD b1, 5 * SIZE(AO)
|
|
+ LD b2, 6 * SIZE(AO)
|
|
+ LD b3, 7 * SIZE(AO)
|
|
+
|
|
+ MUL b1, c02, c02
|
|
+ MUL b2, c02, t1
|
|
+ SUB c03, t1, c03
|
|
+ MUL b3, c02, t1
|
|
+ SUB c04, t1, c04
|
|
+
|
|
+ LD a1, 10 * SIZE(AO)
|
|
+ LD a2, 11 * SIZE(AO)
|
|
+ LD a3, 15 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c03, c03
|
|
+ MUL a2, c03, t1
|
|
+ SUB c04, t1, c04
|
|
+ MUL a3, c04, c04
|
|
+#endif
|
|
+
|
|
+#if defined(RN) || defined(RT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c04, c04
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c02, 1 * SIZE(BO)
|
|
+ ST c03, 2 * SIZE(BO)
|
|
+ ST c04, 3 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c03, 2 * SIZE(AO)
|
|
+ ST c04, 3 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -4 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c03, 2 * SIZE(C1)
|
|
+ ST c04, 3 * SIZE(C1)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 2 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L91
|
|
+ .align 4
|
|
+
|
|
+$L100:
|
|
+ and M, 2, I
|
|
+ ble I, $L110
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c04
|
|
+
|
|
+ sra KK, 2, L
|
|
+ mov B, BO
|
|
+ ble L, $L105
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 1, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c04
|
|
+
|
|
+ sra TMP1, 2, L
|
|
+ ble L, $L105
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L102:
|
|
+ ADD c01, t1, c01
|
|
+ ldi L, -1(L)
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ MUL a2, b1, t2
|
|
+ LD a2, 5 * SIZE(AO)
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ MUL a3, b2, t3
|
|
+ LD a3, 6 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a4, b2, t4
|
|
+ LD a5, 7 * SIZE(AO)
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ MUL a1, b3, t1
|
|
+ LD a1, 8 * SIZE(AO)
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ MUL a2, b3, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ MUL a3, b4, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a5, b4, t4
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ bgt L, $L102
|
|
+ .align 4
|
|
+
|
|
+$L105:
|
|
+#if defined(LT) || defined(RN)
|
|
+ and KK, 3, L
|
|
+#else
|
|
+ and TMP1, 3, L
|
|
+#endif
|
|
+ ble L, $L108
|
|
+ .align 4
|
|
+
|
|
+$L106:
|
|
+ ADD c01, t1, c01
|
|
+ ldi L, -1(L)
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ MUL a2, b1, t2
|
|
+ LD a2, 3 * SIZE(AO)
|
|
+ LD b1, 1 * SIZE(BO)
|
|
+
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ unop
|
|
+ ldi BO, 1 * SIZE(BO)
|
|
+ bgt L, $L106
|
|
+ .align 4
|
|
+
|
|
+$L108:
|
|
+ ADD c01, t1, c01
|
|
+ ADD c02, t2, c02
|
|
+ ADD c03, t3, c03
|
|
+ ADD c04, t4, c04
|
|
+
|
|
+ ADD c01, c03, c01
|
|
+ ADD c02, c04, c02
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 2, TMP1
|
|
+#else
|
|
+ subl KK, 1, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 3 * SIZE(AO)
|
|
+ LD a2, 2 * SIZE(AO)
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c02, c02
|
|
+ MUL a2, c02, t1
|
|
+ SUB c01, t1, c01
|
|
+ MUL a3, c01, c01
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a2, c01, t1
|
|
+ SUB c02, t1, c02
|
|
+ MUL a3, c02, c02
|
|
+#endif
|
|
+
|
|
+#if defined(RN) || defined(RT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c02, 1 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -2 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 2 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 1 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L110:
|
|
+ and M, 1, I
|
|
+ ble I, $L119
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c04
|
|
+
|
|
+ sra KK, 2, L
|
|
+ mov B, BO
|
|
+ unop
|
|
+ ble L, $L115
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 0, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c04
|
|
+
|
|
+ sra TMP1, 2, L
|
|
+ unop
|
|
+ ble L, $L115
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L112:
|
|
+ ADD c01, t1, c01
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ MUL a2, b2, t2
|
|
+ LD a2, 5 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ MUL a3, b3, t3
|
|
+ LD a3, 6 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a4, b4, t4
|
|
+ LD a4, 7 * SIZE(AO)
|
|
+ LD b4, 7 * SIZE(BO)
|
|
+
|
|
+ ldi L, -1(L)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ bgt L, $L112
|
|
+ .align 4
|
|
+
|
|
+$L115:
|
|
+#if defined(LT) || defined(RN)
|
|
+ and KK, 3, L
|
|
+#else
|
|
+ and TMP1, 3, L
|
|
+#endif
|
|
+ ble L, $L118
|
|
+ .align 4
|
|
+
|
|
+$L116:
|
|
+ ADD c01, t1, c01
|
|
+ MUL a1, b1, t1
|
|
+ LD a1, 1 * SIZE(AO)
|
|
+ LD b1, 1 * SIZE(BO)
|
|
+
|
|
+ ldi L, -1(L)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ ldi BO, 1 * SIZE(BO)
|
|
+ bgt L, $L116
|
|
+ .align 4
|
|
+
|
|
+$L118:
|
|
+ ADD c01, t1, c01
|
|
+ ADD c02, t2, c02
|
|
+ ADD c03, t3, c03
|
|
+ ADD c04, t4, c04
|
|
+
|
|
+ ADD c01, c02, c01
|
|
+ ADD c03, c04, c03
|
|
+ ADD c01, c03, c01
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+ subl KK, 1, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ addl B, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+#endif
|
|
+
|
|
+#if defined(RN) || defined(RT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -1 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 1 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ SXADDQ K, AORIG, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L119:
|
|
+#ifdef LN
|
|
+ SXADDQ K, B, B
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ mov BO, B
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L40:
|
|
+ and N, 2, J
|
|
+ ble J, $L80
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 1 + BASE_SHIFT, TMP1
|
|
+ subl B, TMP1, B
|
|
+
|
|
+ addl LDC, LDC, TMP1
|
|
+ subl C, TMP1, C
|
|
+#endif
|
|
+
|
|
+ mov C, C1
|
|
+ addl C, LDC, C2
|
|
+ fclr t1
|
|
+#ifndef RT
|
|
+ addl C2, LDC, C
|
|
+#endif
|
|
+ fclr t2
|
|
+
|
|
+#ifdef LN
|
|
+ addl M, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+ mov A, AORIG
|
|
+#else
|
|
+ mov A, AO
|
|
+#endif
|
|
+
|
|
+ sra M, 2, I
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+ ble I, $L60
|
|
+ .align 4
|
|
+
|
|
+$L51:
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c03
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c07
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c04
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c08
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c05
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c06
|
|
+
|
|
+ ldi L, -2(KK)
|
|
+
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ble KK, $L58
|
|
+
|
|
+ ble L, $L55
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 2, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 2, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c03
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c07
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c04
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c08
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c05
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c06
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ble TMP1, $L58
|
|
+
|
|
+ ble L, $L55
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L52:
|
|
+ ADD c05, t1, c05
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD c06, t2, c06
|
|
+ ldi L, -2(L)
|
|
+ MUL a2, b1, t2
|
|
+ unop
|
|
+
|
|
+ ADD c07, t3, c07
|
|
+ unop
|
|
+ MUL a3, b1, t3
|
|
+ unop
|
|
+
|
|
+ ADD c08, t4, c08
|
|
+ unop
|
|
+ MUL a4, b1, t4
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b2, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ MUL a2, b2, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ unop
|
|
+ MUL a4, b2, t4
|
|
+ LD a5, 3 * SIZE(AO)
|
|
+
|
|
+ ADD c05, t1, c05
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ LD b2, -1 * SIZE(BO)
|
|
+
|
|
+ ADD c06, t2, c06
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ unop
|
|
+
|
|
+ ADD c07, t3, c07
|
|
+ unop
|
|
+ MUL a3, b3, t3
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+
|
|
+ ADD c08, t4, c08
|
|
+ unop
|
|
+ MUL a5, b3, t4
|
|
+ LD b3, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b4, t1
|
|
+ LD a1, -4 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b4, t2
|
|
+ LD a2, -3 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+ MUL a3, b4, t3
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a5, b4, t4
|
|
+ LD b4, 1 * SIZE(BO)
|
|
+ bgt L, $L52
|
|
+ .align 4
|
|
+
|
|
+$L55:
|
|
+ ADD c05, t1, c05
|
|
+ MUL a1, b1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L57
|
|
+#else
|
|
+ blbs TMP1, $L57
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c06, t2, c06
|
|
+ MUL a2, b1, t2
|
|
+ ADD c07, t3, c07
|
|
+ MUL a3, b1, t3
|
|
+
|
|
+ ADD c08, t4, c08
|
|
+ unop
|
|
+ MUL a4, b1, t4
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b2, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b2, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c03, t3, c03
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ MUL a4, b2, t4
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c05, t1, c05
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L57:
|
|
+ ADD c06, t2, c06
|
|
+ MUL a2, b1, t2
|
|
+ ADD c07, t3, c07
|
|
+ MUL a3, b1, t3
|
|
+
|
|
+ ADD c08, t4, c08
|
|
+ MUL a4, b1, t4
|
|
+ ADD c01, t1, c01
|
|
+ MUL a1, b2, t1
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ MUL a2, b2, t2
|
|
+ ADD c03, t3, c03
|
|
+ MUL a3, b2, t3
|
|
+
|
|
+ ADD c04, t4, c04
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL a4, b2, t4
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t1, c05
|
|
+ ADD c06, t2, c06
|
|
+ ADD c07, t3, c07
|
|
+ ADD c08, t4, c08
|
|
+ .align 4
|
|
+
|
|
+$L58:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 4, TMP1
|
|
+#else
|
|
+ subl KK, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -4 * SIZE(AO)
|
|
+ ldi BO, -2 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+ LD b4, 7 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c05, c05
|
|
+ SUB a3, c02, c02
|
|
+ SUB a4, c06, c06
|
|
+
|
|
+ SUB b1, c03, c03
|
|
+ SUB b2, c07, c07
|
|
+ SUB b3, c04, c04
|
|
+ SUB b4, c08, c08
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ LD b1, 4 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(AO)
|
|
+ LD b4, 7 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+ SUB a3, c03, c03
|
|
+ SUB a4, c04, c04
|
|
+
|
|
+ SUB b1, c05, c05
|
|
+ SUB b2, c06, c06
|
|
+ SUB b3, c07, c07
|
|
+ SUB b4, c08, c08
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 15 * SIZE(AO)
|
|
+ LD a2, 14 * SIZE(AO)
|
|
+ LD a3, 13 * SIZE(AO)
|
|
+ LD a4, 12 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c04, c04
|
|
+ MUL a1, c08, c08
|
|
+
|
|
+ MUL a2, c04, t1
|
|
+ MUL a2, c08, t2
|
|
+
|
|
+ SUB c03, t1, c03
|
|
+ SUB c07, t2, c07
|
|
+
|
|
+ MUL a3, c04, t1
|
|
+ MUL a3, c08, t2
|
|
+
|
|
+ SUB c02, t1, c02
|
|
+ SUB c06, t2, c06
|
|
+
|
|
+ MUL a4, c04, t1
|
|
+ MUL a4, c08, t2
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c05, t2, c05
|
|
+
|
|
+ LD b1, 10 * SIZE(AO)
|
|
+ LD b2, 9 * SIZE(AO)
|
|
+ LD b3, 8 * SIZE(AO)
|
|
+
|
|
+ MUL b1, c03, c03
|
|
+ MUL b1, c07, c07
|
|
+
|
|
+ MUL b2, c03, t1
|
|
+ MUL b2, c07, t2
|
|
+
|
|
+ SUB c02, t1, c02
|
|
+ SUB c06, t2, c06
|
|
+
|
|
+ MUL b3, c03, t1
|
|
+ MUL b3, c07, t2
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c05, t2, c05
|
|
+
|
|
+ LD a1, 5 * SIZE(AO)
|
|
+ LD a2, 4 * SIZE(AO)
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c06, c06
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c06, t2
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c05, t2, c05
|
|
+
|
|
+ MUL a3, c01, c01
|
|
+ MUL a3, c05, c05
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c05, c05
|
|
+
|
|
+ MUL a2, c01, t1
|
|
+ MUL a2, c05, t2
|
|
+
|
|
+ SUB c02, t1, c02
|
|
+ SUB c06, t2, c06
|
|
+
|
|
+ MUL a3, c01, t1
|
|
+ MUL a3, c05, t2
|
|
+
|
|
+ SUB c03, t1, c03
|
|
+ SUB c07, t2, c07
|
|
+
|
|
+ MUL a4, c01, t1
|
|
+ MUL a4, c05, t2
|
|
+
|
|
+ SUB c04, t1, c04
|
|
+ SUB c08, t2, c08
|
|
+
|
|
+ LD b1, 5 * SIZE(AO)
|
|
+ LD b2, 6 * SIZE(AO)
|
|
+ LD b3, 7 * SIZE(AO)
|
|
+
|
|
+ MUL b1, c02, c02
|
|
+ MUL b1, c06, c06
|
|
+
|
|
+ MUL b2, c02, t1
|
|
+ MUL b2, c06, t2
|
|
+
|
|
+ SUB c03, t1, c03
|
|
+ SUB c07, t2, c07
|
|
+
|
|
+ MUL b3, c02, t1
|
|
+ MUL b3, c06, t2
|
|
+
|
|
+ SUB c04, t1, c04
|
|
+ SUB c08, t2, c08
|
|
+
|
|
+ LD a1, 10 * SIZE(AO)
|
|
+ LD a2, 11 * SIZE(AO)
|
|
+ LD a3, 15 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c07, c07
|
|
+
|
|
+ MUL a2, c03, t1
|
|
+ MUL a2, c07, t2
|
|
+
|
|
+ SUB c04, t1, c04
|
|
+ SUB c08, t2, c08
|
|
+
|
|
+ MUL a3, c04, c04
|
|
+ MUL a3, c08, c08
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c04, c04
|
|
+
|
|
+ MUL a2, c01, t1
|
|
+ MUL a2, c02, t2
|
|
+ MUL a2, c03, t3
|
|
+ MUL a2, c04, t4
|
|
+
|
|
+ SUB c05, t1, c05
|
|
+ SUB c06, t2, c06
|
|
+ SUB c07, t3, c07
|
|
+ SUB c08, t4, c08
|
|
+
|
|
+ MUL a3, c05, c05
|
|
+ MUL a3, c06, c06
|
|
+ MUL a3, c07, c07
|
|
+ MUL a3, c08, c08
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 3 * SIZE(BO)
|
|
+ LD a2, 2 * SIZE(BO)
|
|
+ LD a3, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c05, c05
|
|
+ MUL a1, c06, c06
|
|
+ MUL a1, c07, c07
|
|
+ MUL a1, c08, c08
|
|
+
|
|
+ MUL a2, c05, t1
|
|
+ MUL a2, c06, t2
|
|
+ MUL a2, c07, t3
|
|
+ MUL a2, c08, t4
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c02, t2, c02
|
|
+ SUB c03, t3, c03
|
|
+ SUB c04, t4, c04
|
|
+
|
|
+ MUL a3, c01, c01
|
|
+ MUL a3, c02, c02
|
|
+ MUL a3, c03, c03
|
|
+ MUL a3, c04, c04
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c05, 1 * SIZE(BO)
|
|
+ ST c02, 2 * SIZE(BO)
|
|
+ ST c06, 3 * SIZE(BO)
|
|
+
|
|
+ ST c03, 4 * SIZE(BO)
|
|
+ ST c07, 5 * SIZE(BO)
|
|
+ ST c04, 6 * SIZE(BO)
|
|
+ ST c08, 7 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c03, 2 * SIZE(AO)
|
|
+ ST c04, 3 * SIZE(AO)
|
|
+
|
|
+ ST c05, 4 * SIZE(AO)
|
|
+ ST c06, 5 * SIZE(AO)
|
|
+ ST c07, 6 * SIZE(AO)
|
|
+ ST c08, 7 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -4 * SIZE(C1)
|
|
+ ldi C2, -4 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c03, 2 * SIZE(C1)
|
|
+ ST c04, 3 * SIZE(C1)
|
|
+
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+ ST c06, 1 * SIZE(C2)
|
|
+ ST c07, 2 * SIZE(C2)
|
|
+ ST c08, 3 * SIZE(C2)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+ ldi C2, 4 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 2 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+	ldi	I, -1(I)
+
+	bgt	I, $L51
+	.align 4
+
+$L60:
+	and	M, 2, I
+	ble	I, $L70
+
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ ldi L, -2(KK)
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+
|
|
+ ble KK, $L68
|
|
+
|
|
+ ble L, $L65
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 1, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ ldi L, -2(TMP1)
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ble TMP1, $L68
|
|
+
|
|
+ ble L, $L65
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L62:
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, c05
|
|
+ ldi L, -2(L)
|
|
+ MUL a1, b2, t3
|
|
+ LD a1, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, c06
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD a2, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ LD b2, 3 * SIZE(BO)
|
|
+ MUL a3, b3, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a4, b3, t2
|
|
+ LD b3, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, c05
|
|
+ unop
|
|
+ MUL a3, b4, t3
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, c06
|
|
+ MUL a4, b4, t4
|
|
+ LD b4, 1 * SIZE(BO)
|
|
+ unop
|
|
+
|
|
+ LD a4, 1 * SIZE(AO)
|
|
+ unop
|
|
+ unop
|
|
+ bgt L, $L62
|
|
+ .align 4
|
|
+
|
|
+$L65:
|
|
+ ADD c01, t1, c01
|
|
+ MUL a1, b1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L67
|
|
+#else
|
|
+ blbs TMP1, $L67
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, c05
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ MUL a1, b2, t3
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, c06
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ LD b2, -1 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ .align 4
|
|
+
|
|
+$L67:
|
|
+ ADD c02, t2, c02
|
|
+ MUL a2, b1, t2
|
|
+ ADD c05, t3, c05
|
|
+ MUL a1, b2, t3
|
|
+
|
|
+ ADD c06, t4, c06
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a2, b2, t4
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ ADD c02, t2, c02
|
|
+ ADD c05, t3, c05
|
|
+ ADD c06, t4, c06
|
|
+ .align 4
|
|
+
|
|
+$L68:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 2, TMP1
|
|
+#else
|
|
+ subl KK, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -2 * SIZE(AO)
|
|
+ ldi BO, -2 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c05, c05
|
|
+ SUB a3, c02, c02
|
|
+ SUB a4, c06, c06
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+ SUB a3, c05, c05
|
|
+ SUB a4, c06, c06
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 3 * SIZE(AO)
|
|
+ LD a2, 2 * SIZE(AO)
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c06, c06
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c06, t2
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c05, t2, c05
|
|
+
|
|
+ MUL a3, c01, c01
|
|
+ MUL a3, c05, c05
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c05, c05
|
|
+
|
|
+ MUL a2, c01, t1
|
|
+ MUL a2, c05, t2
|
|
+
|
|
+ SUB c02, t1, c02
|
|
+ SUB c06, t2, c06
|
|
+
|
|
+ MUL a3, c02, c02
|
|
+ MUL a3, c06, c06
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+
|
|
+ MUL a2, c01, t1
|
|
+ MUL a2, c02, t2
|
|
+
|
|
+ SUB c05, t1, c05
|
|
+ SUB c06, t2, c06
|
|
+
|
|
+ MUL a3, c05, c05
|
|
+ MUL a3, c06, c06
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 3 * SIZE(BO)
|
|
+ LD a2, 2 * SIZE(BO)
|
|
+ LD a3, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c05, c05
|
|
+ MUL a1, c06, c06
|
|
+
|
|
+ MUL a2, c05, t1
|
|
+ MUL a2, c06, t2
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c02, t2, c02
|
|
+
|
|
+ MUL a3, c01, c01
|
|
+ MUL a3, c02, c02
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c05, 1 * SIZE(BO)
|
|
+ ST c02, 2 * SIZE(BO)
|
|
+ ST c06, 3 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c05, 2 * SIZE(AO)
|
|
+ ST c06, 3 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -2 * SIZE(C1)
|
|
+ ldi C2, -2 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+ ST c06, 1 * SIZE(C2)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 2 * SIZE(C1)
|
|
+ ldi C2, 2 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 1 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L70:
|
|
+ and M, 1, I
|
|
+ ble I, $L79
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c06
|
|
+
|
|
+ ldi L, -2(KK)
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+
|
|
+ ble KK, $L78
|
|
+
|
|
+ ble L, $L75
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 0, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c06
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ble TMP1, $L78
|
|
+
|
|
+ ble L, $L75
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L72:
|
|
+ ADD c01, t1, c01
|
|
+ ldi L, -2(L)
|
|
+ MUL a1, b1, t1
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t2, c05
|
|
+ MUL a1, b2, t2
|
|
+ LD a1, 1 * SIZE(AO)
|
|
+ LD b2, 3 * SIZE(BO)
|
|
+
|
|
+ ADD c02, t3, c02
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a2, b3, t3
|
|
+ LD b3, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c06, t4, c06
|
|
+ MUL a2, b4, t4
|
|
+ LD a2, 0 * SIZE(AO)
|
|
+ LD b4, 5 * SIZE(BO)
|
|
+
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ unop
|
|
+ unop
|
|
+ bgt L, $L72
|
|
+ .align 4
|
|
+
|
|
+$L75:
|
|
+ ADD c01, t1, c01
|
|
+ MUL a1, b1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L77
|
|
+#else
|
|
+ blbs TMP1, $L77
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c05, t2, c05
|
|
+ MUL a1, b2, t2
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L77:
|
|
+ ADD c05, t2, c05
|
|
+ MUL a1, b2, t2
|
|
+ ADD c02, t3, c02
|
|
+ ADD c06, t4, c06
|
|
+
|
|
+ ADD c01, c02, c01
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ ADD c05, c06, c05
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ ADD c05, t2, c05
|
|
+
|
|
+ .align 4
|
|
+
|
|
+$L78:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 1, TMP1
|
|
+#else
|
|
+ subl KK, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -1 * SIZE(AO)
|
|
+ ldi BO, -2 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c05, c05
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c05, c05
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c05, c05
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a2, c01, t1
|
|
+ SUB c05, t1, c05
|
|
+ MUL a3, c05, c05
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 3 * SIZE(BO)
|
|
+ LD a2, 2 * SIZE(BO)
|
|
+ LD a3, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c05, c05
|
|
+ MUL a2, c05, t1
|
|
+ SUB c01, t1, c01
|
|
+ MUL a3, c01, c01
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c05, 1 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c05, 1 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -1 * SIZE(C1)
|
|
+ ldi C2, -1 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 0 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L79:
|
|
+#ifdef LN
|
|
+ sll K, 1 + BASE_SHIFT, TMP1
|
|
+ addl B, TMP1, B
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ mov BO, B
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L80:
+	sra	N, 2, J
+	ble	J, $L999
+	.align 4
+
+$L01:
+#ifdef RT
+	sll	K, 2 + BASE_SHIFT, TMP1
+	subl	B, TMP1, B
+
+	s4addl	LDC, 0, TMP1
+	subl	C, TMP1, C
+#endif
+
+	mov	C, C1
+	addl	C, LDC, C2
+	addl	C2, LDC, C3
+#ifndef RT
+	s4addl	LDC, C, C
+#endif
+
+ fclr t1
|
|
+ addl C3, LDC, C4
|
|
+ fclr t2
|
|
+
|
|
+#ifdef LN
|
|
+ addl M, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+ mov A, AORIG
|
|
+#else
|
|
+ mov A, AO
|
|
+#endif
|
|
+
|
|
+ sra M, 2, I
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+ ble I, $L20
|
|
+ .align 4
|
|
+
|
|
+$L11:
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c11
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c12
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c16
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c15
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c02
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c06
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c05
|
|
+
|
|
+ fillcs 4 * SIZE(C1)
|
|
+ fclr c03
|
|
+ ldi L, -2(KK)
|
|
+ fclr c04
|
|
+
|
|
+ fillcs 7 * SIZE(C2)
|
|
+ fclr c08
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+ fclr c13
|
|
+
|
|
+ fillcs 4 * SIZE(C3)
|
|
+ fclr c09
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c10
|
|
+
|
|
+ fillcs 7 * SIZE(C4)
|
|
+ fclr c14
|
|
+ fclr c07
|
|
+ ble KK, $L18
|
|
+#else
|
|
+
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 2, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 2, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c11
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c12
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c16
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c15
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c02
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c06
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c05
|
|
+
|
|
+ fillcs 4 * SIZE(C1)
|
|
+ fclr c03
|
|
+ ldi L, -2(TMP1)
|
|
+ fclr c04
|
|
+
|
|
+ fillcs 7 * SIZE(C2)
|
|
+ fclr c08
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ fclr c13
|
|
+
|
|
+ fillcs 4 * SIZE(C3)
|
|
+ fclr c09
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c10
|
|
+
|
|
+ fillcs 7 * SIZE(C4)
|
|
+ fclr c14
|
|
+ fclr c07
|
|
+ ble TMP1, $L18
|
|
+#endif
|
|
+
|
|
+ ble L, $L15
|
|
+	.align 5
+
+$L12:
+/* 1 */
+ ADD c11, t1, c11
|
|
+#ifndef EV4
|
|
+ fillcs PREFETCHSIZE * SIZE(AO)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+ MUL b1, a1, t1
|
|
+#ifndef EV4
|
|
+ fillcs PREFETCHSIZE * SIZE(BO)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD c12, t2, c12
|
|
+ unop
|
|
+ MUL b1, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD c16, t3, c16
|
|
+ unop
|
|
+ MUL b2, a2, t3
|
|
+ LD a5, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c15, t4, c15
|
|
+ unop
|
|
+ MUL b2, a1, t4
|
|
+ LD b5, 0 * SIZE(BO)
|
|
+
|
|
+/* 2 */
|
|
+ ADD c01, t1, c01
|
|
+ UNOP
|
|
+ MUL b1, a3, t1
|
|
+ UNOP
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ UNOP
|
|
+ MUL b1, a4, t2
|
|
+ UNOP
|
|
+
|
|
+ ADD c06, t3, c06
|
|
+ unop
|
|
+ MUL b2, a4, t3
|
|
+ unop
|
|
+
|
|
+ ADD c05, t4, c05
|
|
+ unop
|
|
+ MUL b4, a1, t4
|
|
+ unop
|
|
+
|
|
+/* 3 */
|
|
+ ADD c03, t1, c03
|
|
+ unop
|
|
+ MUL b3, a1, t1
|
|
+ unop
|
|
+
|
|
+ ADD c04, t2, c04
|
|
+ unop
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD c08, t3, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c13, t4, c13
|
|
+ unop
|
|
+ MUL b2, a3, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+/* 4 */
|
|
+ ADD c09, t1, c09
|
|
+ unop
|
|
+ MUL b3, a3, t1
|
|
+ LD a6, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c10, t2, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c14, t3, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ ADD c07, t4, c07
|
|
+ unop
|
|
+ MUL b4, a3, t4
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+
|
|
+/* 5 */
|
|
+ ADD c11, t1, c11
|
|
+ unop
|
|
+ MUL b5, a5, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c12, t2, c12
|
|
+ ldi L, -2(L)
|
|
+ MUL b5, a2, t2
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c16, t3, c16
|
|
+ unop
|
|
+ MUL b2, a2, t3
|
|
+ unop
|
|
+
|
|
+ ADD c15, t4, c15
|
|
+ unop
|
|
+ MUL b2, a5, t4
|
|
+ unop
|
|
+
|
|
+/* 6 */
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL b5, a6, t1
|
|
+ unop
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL b5, a4, t2
|
|
+ unop
|
|
+
|
|
+ ADD c06, t3, c06
|
|
+ unop
|
|
+ MUL b2, a4, t3
|
|
+ unop
|
|
+
|
|
+ ADD c05, t4, c05
|
|
+ unop
|
|
+ MUL b4, a5, t4
|
|
+ unop
|
|
+
|
|
+/* 7 */
|
|
+ ADD c03, t1, c03
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+ MUL b3, a5, t1
|
|
+ unop
|
|
+
|
|
+ ADD c04, t2, c04
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD c08, t3, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, -3 * SIZE(AO)
|
|
+
|
|
+ ADD c13, t4, c13
|
|
+ unop
|
|
+ MUL b2, a6, t4
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+/* 8 */
|
|
+ ADD c09, t1, c09
|
|
+ unop
|
|
+ MUL b3, a6, t1
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c10, t2, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD c14, t3, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c07, t4, c07
|
|
+ MUL b4, a6, t4
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+ bgt L, $L12
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ ADD c11, t1, c11
|
|
+ MUL b1, a1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L17
|
|
+#else
|
|
+ blbs TMP1, $L17
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c12, t2, c12
|
|
+ MUL b1, a2, t2
|
|
+ ADD c16, t3, c16
|
|
+ MUL b2, a2, t3
|
|
+
|
|
+ ADD c15, t4, c15
|
|
+ MUL b2, a1, t4
|
|
+ ADD c01, t1, c01
|
|
+ MUL b1, a3, t1
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL b1, a4, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c06, t3, c06
|
|
+ MUL b2, a4, t3
|
|
+ ADD c05, t4, c05
|
|
+ MUL b4, a1, t4
|
|
+
|
|
+ ADD c03, t1, c03
|
|
+ unop
|
|
+ MUL b3, a1, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c04, t2, c04
|
|
+ unop
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD c08, t3, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c13, t4, c13
|
|
+ unop
|
|
+ MUL b2, a3, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t1, c09
|
|
+ unop
|
|
+ MUL b3, a3, t1
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c10, t2, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c14, t3, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c07, t4, c07
|
|
+ unop
|
|
+ MUL b4, a3, t4
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c11, t1, c11
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL b1, a1, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L17:
|
|
+ ADD c12, t2, c12
|
|
+ MUL b1, a2, t2
|
|
+ ADD c16, t3, c16
|
|
+ MUL b2, a2, t3
|
|
+
|
|
+ ADD c15, t4, c15
|
|
+ MUL b2, a1, t4
|
|
+ ADD c01, t1, c01
|
|
+ MUL b1, a3, t1
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ MUL b1, a4, t2
|
|
+ ADD c06, t3, c06
|
|
+ MUL b2, a4, t3
|
|
+
|
|
+ ADD c05, t4, c05
|
|
+ MUL b4, a1, t4
|
|
+ ADD c03, t1, c03
|
|
+ MUL b3, a1, t1
|
|
+
|
|
+ ADD c04, t2, c04
|
|
+ MUL b3, a2, t2
|
|
+ ADD c08, t3, c08
|
|
+ MUL b4, a2, t3
|
|
+
|
|
+ ADD c13, t4, c13
|
|
+ MUL b2, a3, t4
|
|
+ ADD c09, t1, c09
|
|
+ MUL b3, a3, t1
|
|
+
|
|
+ ADD c10, t2, c10
|
|
+ MUL b3, a4, t2
|
|
+ ADD c14, t3, c14
|
|
+ MUL b4, a4, t3
|
|
+
|
|
+ ADD c07, t4, c07
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL b4, a3, t4
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c11, t1, c11
|
|
+ ADD c12, t2, c12
|
|
+ ADD c16, t3, c16
|
|
+ ADD c15, t4, c15
|
|
+	.align 4
+
+$L18:
+#if defined(LN) || defined(RT)
+#ifdef LN
+	subl	KK, 4, TMP1
+#else
+	subl	KK, 4, TMP1
+#endif
+	sll	TMP1, BASE_SHIFT + 2, TMP2
+	addl	AORIG, TMP2, AO
+	sll	TMP1, BASE_SHIFT + 2, TMP2
+	addl	B, TMP2, BO
+#else
+	ldi	AO, -4 * SIZE(AO)
+	ldi	BO, -4 * SIZE(BO)
+#endif
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+ LD b4, 7 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c05, c05
|
|
+ SUB a3, c09, c09
|
|
+ SUB a4, c13, c13
|
|
+
|
|
+ SUB b1, c02, c02
|
|
+ SUB b2, c06, c06
|
|
+ SUB b3, c10, c10
|
|
+ SUB b4, c14, c14
|
|
+
|
|
+ LD a1, 8 * SIZE(BO)
|
|
+ LD a2, 9 * SIZE(BO)
|
|
+ LD a3, 10 * SIZE(BO)
|
|
+ LD a4, 11 * SIZE(BO)
|
|
+
|
|
+ LD b1, 12 * SIZE(BO)
|
|
+ LD b2, 13 * SIZE(BO)
|
|
+ LD b3, 14 * SIZE(BO)
|
|
+ LD b4, 15 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c03, c03
|
|
+ SUB a2, c07, c07
|
|
+ SUB a3, c11, c11
|
|
+ SUB a4, c15, c15
|
|
+
|
|
+ SUB b1, c04, c04
|
|
+ SUB b2, c08, c08
|
|
+ SUB b3, c12, c12
|
|
+ SUB b4, c16, c16
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ LD b1, 4 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(AO)
|
|
+ LD b4, 7 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+ SUB a3, c03, c03
|
|
+ SUB a4, c04, c04
|
|
+
|
|
+ SUB b1, c05, c05
|
|
+ SUB b2, c06, c06
|
|
+ SUB b3, c07, c07
|
|
+ SUB b4, c08, c08
|
|
+
|
|
+ LD a1, 8 * SIZE(AO)
|
|
+ LD a2, 9 * SIZE(AO)
|
|
+ LD a3, 10 * SIZE(AO)
|
|
+ LD a4, 11 * SIZE(AO)
|
|
+
|
|
+ LD b1, 12 * SIZE(AO)
|
|
+ LD b2, 13 * SIZE(AO)
|
|
+ LD b3, 14 * SIZE(AO)
|
|
+ LD b4, 15 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c09, c09
|
|
+ SUB a2, c10, c10
|
|
+ SUB a3, c11, c11
|
|
+ SUB a4, c12, c12
|
|
+
|
|
+ SUB b1, c13, c13
|
|
+ SUB b2, c14, c14
|
|
+ SUB b3, c15, c15
|
|
+ SUB b4, c16, c16
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 15 * SIZE(AO)
|
|
+ LD a2, 14 * SIZE(AO)
|
|
+ LD a3, 13 * SIZE(AO)
|
|
+ LD a4, 12 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c04, c04
|
|
+ MUL a1, c08, c08
|
|
+ MUL a1, c12, c12
|
|
+ MUL a1, c16, c16
|
|
+
|
|
+ MUL a2, c04, t1
|
|
+ MUL a2, c08, t2
|
|
+ MUL a2, c12, t3
|
|
+ MUL a2, c16, t4
|
|
+
|
|
+ SUB c03, t1, c03
|
|
+ SUB c07, t2, c07
|
|
+ SUB c11, t3, c11
|
|
+ SUB c15, t4, c15
|
|
+
|
|
+ MUL a3, c04, t1
|
|
+ MUL a3, c08, t2
|
|
+ MUL a3, c12, t3
|
|
+ MUL a3, c16, t4
|
|
+
|
|
+ SUB c02, t1, c02
|
|
+ SUB c06, t2, c06
|
|
+ SUB c10, t3, c10
|
|
+ SUB c14, t4, c14
|
|
+
|
|
+ MUL a4, c04, t1
|
|
+ MUL a4, c08, t2
|
|
+ MUL a4, c12, t3
|
|
+ MUL a4, c16, t4
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c05, t2, c05
|
|
+ SUB c09, t3, c09
|
|
+ SUB c13, t4, c13
|
|
+
|
|
+ LD b1, 10 * SIZE(AO)
|
|
+ LD b2, 9 * SIZE(AO)
|
|
+ LD b3, 8 * SIZE(AO)
|
|
+
|
|
+ MUL b1, c03, c03
|
|
+ MUL b1, c07, c07
|
|
+ MUL b1, c11, c11
|
|
+ MUL b1, c15, c15
|
|
+
|
|
+ MUL b2, c03, t1
|
|
+ MUL b2, c07, t2
|
|
+ MUL b2, c11, t3
|
|
+ MUL b2, c15, t4
|
|
+
|
|
+ SUB c02, t1, c02
|
|
+ SUB c06, t2, c06
|
|
+ SUB c10, t3, c10
|
|
+ SUB c14, t4, c14
|
|
+
|
|
+ MUL b3, c03, t1
|
|
+ MUL b3, c07, t2
|
|
+ MUL b3, c11, t3
|
|
+ MUL b3, c15, t4
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c05, t2, c05
|
|
+ SUB c09, t3, c09
|
|
+ SUB c13, t4, c13
|
|
+
|
|
+ LD a1, 5 * SIZE(AO)
|
|
+ LD a2, 4 * SIZE(AO)
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c06, c06
|
|
+ MUL a1, c10, c10
|
|
+ MUL a1, c14, c14
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c06, t2
|
|
+ MUL a2, c10, t3
|
|
+ MUL a2, c14, t4
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c05, t2, c05
|
|
+ SUB c09, t3, c09
|
|
+ SUB c13, t4, c13
|
|
+
|
|
+ MUL a3, c01, c01
|
|
+ MUL a3, c05, c05
|
|
+ MUL a3, c09, c09
|
|
+ MUL a3, c13, c13
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c05, c05
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c13, c13
|
|
+
|
|
+ MUL a2, c01, t1
|
|
+ MUL a2, c05, t2
|
|
+ MUL a2, c09, t3
|
|
+ MUL a2, c13, t4
|
|
+
|
|
+ SUB c02, t1, c02
|
|
+ SUB c06, t2, c06
|
|
+ SUB c10, t3, c10
|
|
+ SUB c14, t4, c14
|
|
+
|
|
+ MUL a3, c01, t1
|
|
+ MUL a3, c05, t2
|
|
+ MUL a3, c09, t3
|
|
+ MUL a3, c13, t4
|
|
+
|
|
+ SUB c03, t1, c03
|
|
+ SUB c07, t2, c07
|
|
+ SUB c11, t3, c11
|
|
+ SUB c15, t4, c15
|
|
+
|
|
+ MUL a4, c01, t1
|
|
+ MUL a4, c05, t2
|
|
+ MUL a4, c09, t3
|
|
+ MUL a4, c13, t4
|
|
+
|
|
+ SUB c04, t1, c04
|
|
+ SUB c08, t2, c08
|
|
+ SUB c12, t3, c12
|
|
+ SUB c16, t4, c16
|
|
+
|
|
+ LD b1, 5 * SIZE(AO)
|
|
+ LD b2, 6 * SIZE(AO)
|
|
+ LD b3, 7 * SIZE(AO)
|
|
+
|
|
+ MUL b1, c02, c02
|
|
+ MUL b1, c06, c06
|
|
+ MUL b1, c10, c10
|
|
+ MUL b1, c14, c14
|
|
+
|
|
+ MUL b2, c02, t1
|
|
+ MUL b2, c06, t2
|
|
+ MUL b2, c10, t3
|
|
+ MUL b2, c14, t4
|
|
+
|
|
+ SUB c03, t1, c03
|
|
+ SUB c07, t2, c07
|
|
+ SUB c11, t3, c11
|
|
+ SUB c15, t4, c15
|
|
+
|
|
+ MUL b3, c02, t1
|
|
+ MUL b3, c06, t2
|
|
+ MUL b3, c10, t3
|
|
+ MUL b3, c14, t4
|
|
+
|
|
+ SUB c04, t1, c04
|
|
+ SUB c08, t2, c08
|
|
+ SUB c12, t3, c12
|
|
+ SUB c16, t4, c16
|
|
+
|
|
+ LD a1, 10 * SIZE(AO)
|
|
+ LD a2, 11 * SIZE(AO)
|
|
+ LD a3, 15 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c07, c07
|
|
+ MUL a1, c11, c11
|
|
+ MUL a1, c15, c15
|
|
+
|
|
+ MUL a2, c03, t1
|
|
+ MUL a2, c07, t2
|
|
+ MUL a2, c11, t3
|
|
+ MUL a2, c15, t4
|
|
+
|
|
+ SUB c04, t1, c04
|
|
+ SUB c08, t2, c08
|
|
+ SUB c12, t3, c12
|
|
+ SUB c16, t4, c16
|
|
+
|
|
+ MUL a3, c04, c04
|
|
+ MUL a3, c08, c08
|
|
+ MUL a3, c12, c12
|
|
+ MUL a3, c16, c16
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c04, c04
|
|
+
|
|
+ MUL a2, c01, t1
|
|
+ MUL a2, c02, t2
|
|
+ MUL a2, c03, t3
|
|
+ MUL a2, c04, t4
|
|
+
|
|
+ SUB c05, t1, c05
|
|
+ SUB c06, t2, c06
|
|
+ SUB c07, t3, c07
|
|
+ SUB c08, t4, c08
|
|
+
|
|
+ MUL a3, c01, t1
|
|
+ MUL a3, c02, t2
|
|
+ MUL a3, c03, t3
|
|
+ MUL a3, c04, t4
|
|
+
|
|
+ SUB c09, t1, c09
|
|
+ SUB c10, t2, c10
|
|
+ SUB c11, t3, c11
|
|
+ SUB c12, t4, c12
|
|
+
|
|
+ MUL a4, c01, t1
|
|
+ MUL a4, c02, t2
|
|
+ MUL a4, c03, t3
|
|
+ MUL a4, c04, t4
|
|
+
|
|
+ SUB c13, t1, c13
|
|
+ SUB c14, t2, c14
|
|
+ SUB c15, t3, c15
|
|
+ SUB c16, t4, c16
|
|
+
|
|
+ LD b1, 5 * SIZE(BO)
|
|
+ LD b2, 6 * SIZE(BO)
|
|
+ LD b3, 7 * SIZE(BO)
|
|
+
|
|
+ MUL b1, c05, c05
|
|
+ MUL b1, c06, c06
|
|
+ MUL b1, c07, c07
|
|
+ MUL b1, c08, c08
|
|
+
|
|
+ MUL b2, c05, t1
|
|
+ MUL b2, c06, t2
|
|
+ MUL b2, c07, t3
|
|
+ MUL b2, c08, t4
|
|
+
|
|
+ SUB c09, t1, c09
|
|
+ SUB c10, t2, c10
|
|
+ SUB c11, t3, c11
|
|
+ SUB c12, t4, c12
|
|
+
|
|
+ MUL b3, c05, t1
|
|
+ MUL b3, c06, t2
|
|
+ MUL b3, c07, t3
|
|
+ MUL b3, c08, t4
|
|
+
|
|
+ SUB c13, t1, c13
|
|
+ SUB c14, t2, c14
|
|
+ SUB c15, t3, c15
|
|
+ SUB c16, t4, c16
|
|
+
|
|
+ LD a1, 10 * SIZE(BO)
|
|
+ LD a2, 11 * SIZE(BO)
|
|
+ LD a3, 15 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c10, c10
|
|
+ MUL a1, c11, c11
|
|
+ MUL a1, c12, c12
|
|
+
|
|
+ MUL a2, c09, t1
|
|
+ MUL a2, c10, t2
|
|
+ MUL a2, c11, t3
|
|
+ MUL a2, c12, t4
|
|
+
|
|
+ SUB c13, t1, c13
|
|
+ SUB c14, t2, c14
|
|
+ SUB c15, t3, c15
|
|
+ SUB c16, t4, c16
|
|
+
|
|
+ MUL a3, c13, c13
|
|
+ MUL a3, c14, c14
|
|
+ MUL a3, c15, c15
|
|
+ MUL a3, c16, c16
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 15 * SIZE(BO)
|
|
+ LD a2, 14 * SIZE(BO)
|
|
+ LD a3, 13 * SIZE(BO)
|
|
+ LD a4, 12 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c13, c13
|
|
+ MUL a1, c14, c14
|
|
+ MUL a1, c15, c15
|
|
+ MUL a1, c16, c16
|
|
+
|
|
+ MUL a2, c13, t1
|
|
+ MUL a2, c14, t2
|
|
+ MUL a2, c15, t3
|
|
+ MUL a2, c16, t4
|
|
+
|
|
+ SUB c09, t1, c09
|
|
+ SUB c10, t2, c10
|
|
+ SUB c11, t3, c11
|
|
+ SUB c12, t4, c12
|
|
+
|
|
+ MUL a3, c13, t1
|
|
+ MUL a3, c14, t2
|
|
+ MUL a3, c15, t3
|
|
+ MUL a3, c16, t4
|
|
+
|
|
+ SUB c05, t1, c05
|
|
+ SUB c06, t2, c06
|
|
+ SUB c07, t3, c07
|
|
+ SUB c08, t4, c08
|
|
+
|
|
+ MUL a4, c13, t1
|
|
+ MUL a4, c14, t2
|
|
+ MUL a4, c15, t3
|
|
+ MUL a4, c16, t4
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c02, t2, c02
|
|
+ SUB c03, t3, c03
|
|
+ SUB c04, t4, c04
|
|
+
|
|
+ LD b1, 10 * SIZE(BO)
|
|
+ LD b2, 9 * SIZE(BO)
|
|
+ LD b3, 8 * SIZE(BO)
|
|
+
|
|
+ MUL b1, c09, c09
|
|
+ MUL b1, c10, c10
|
|
+ MUL b1, c11, c11
|
|
+ MUL b1, c12, c12
|
|
+
|
|
+ MUL b2, c09, t1
|
|
+ MUL b2, c10, t2
|
|
+ MUL b2, c11, t3
|
|
+ MUL b2, c12, t4
|
|
+
|
|
+ SUB c05, t1, c05
|
|
+ SUB c06, t2, c06
|
|
+ SUB c07, t3, c07
|
|
+ SUB c08, t4, c08
|
|
+
|
|
+ MUL b3, c09, t1
|
|
+ MUL b3, c10, t2
|
|
+ MUL b3, c11, t3
|
|
+ MUL b3, c12, t4
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c02, t2, c02
|
|
+ SUB c03, t3, c03
|
|
+ SUB c04, t4, c04
|
|
+
|
|
+ LD a1, 5 * SIZE(BO)
|
|
+ LD a2, 4 * SIZE(BO)
|
|
+ LD a3, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c05, c05
|
|
+ MUL a1, c06, c06
|
|
+ MUL a1, c07, c07
|
|
+ MUL a1, c08, c08
|
|
+
|
|
+ MUL a2, c05, t1
|
|
+ MUL a2, c06, t2
|
|
+ MUL a2, c07, t3
|
|
+ MUL a2, c08, t4
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c02, t2, c02
|
|
+ SUB c03, t3, c03
|
|
+ SUB c04, t4, c04
|
|
+
|
|
+ MUL a3, c01, c01
|
|
+ MUL a3, c02, c02
|
|
+ MUL a3, c03, c03
|
|
+ MUL a3, c04, c04
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c05, 1 * SIZE(BO)
|
|
+ ST c09, 2 * SIZE(BO)
|
|
+ ST c13, 3 * SIZE(BO)
|
|
+
|
|
+ ST c02, 4 * SIZE(BO)
|
|
+ ST c06, 5 * SIZE(BO)
|
|
+ ST c10, 6 * SIZE(BO)
|
|
+ ST c14, 7 * SIZE(BO)
|
|
+
|
|
+ ST c03, 8 * SIZE(BO)
|
|
+ ST c07, 9 * SIZE(BO)
|
|
+ ST c11, 10 * SIZE(BO)
|
|
+ ST c15, 11 * SIZE(BO)
|
|
+
|
|
+ ST c04, 12 * SIZE(BO)
|
|
+ ST c08, 13 * SIZE(BO)
|
|
+ ST c12, 14 * SIZE(BO)
|
|
+ ST c16, 15 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c03, 2 * SIZE(AO)
|
|
+ ST c04, 3 * SIZE(AO)
|
|
+
|
|
+ ST c05, 4 * SIZE(AO)
|
|
+ ST c06, 5 * SIZE(AO)
|
|
+ ST c07, 6 * SIZE(AO)
|
|
+ ST c08, 7 * SIZE(AO)
|
|
+
|
|
+ ST c09, 8 * SIZE(AO)
|
|
+ ST c10, 9 * SIZE(AO)
|
|
+ ST c11, 10 * SIZE(AO)
|
|
+ ST c12, 11 * SIZE(AO)
|
|
+
|
|
+ ST c13, 12 * SIZE(AO)
|
|
+ ST c14, 13 * SIZE(AO)
|
|
+ ST c15, 14 * SIZE(AO)
|
|
+ ST c16, 15 * SIZE(AO)
|
|
+#endif
|
|
+
+#ifdef LN
+	ldi	C1, -4 * SIZE(C1)
+	ldi	C2, -4 * SIZE(C2)
+	ldi	C3, -4 * SIZE(C3)
+	ldi	C4, -4 * SIZE(C4)
+#endif
+
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c03, 2 * SIZE(C1)
|
|
+ ST c04, 3 * SIZE(C1)
|
|
+
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+ ST c06, 1 * SIZE(C2)
|
|
+ ST c07, 2 * SIZE(C2)
|
|
+ ST c08, 3 * SIZE(C2)
|
|
+
|
|
+ ST c09, 0 * SIZE(C3)
|
|
+ ST c10, 1 * SIZE(C3)
|
|
+ ST c11, 2 * SIZE(C3)
|
|
+ ST c12, 3 * SIZE(C3)
|
|
+
|
|
+ ST c13, 0 * SIZE(C4)
|
|
+ ST c14, 1 * SIZE(C4)
|
|
+ ST c15, 2 * SIZE(C4)
|
|
+ ST c16, 3 * SIZE(C4)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+ ldi C2, 4 * SIZE(C2)
|
|
+ ldi C3, 4 * SIZE(C3)
|
|
+ ldi C4, 4 * SIZE(C4)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 2 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ addl BO, TMP1, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 4, KK
|
|
+#endif
|
|
+
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ bgt I, $L11
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ and M, 2, I
|
|
+ ble I, $L30
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c10
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c14
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ ldi L, -2(KK)
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c05
|
|
+
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+ fclr c02
|
|
+ fclr c06
|
|
+ ble KK, $L28
|
|
+
|
|
+ ble L, $L25
|
|
+
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 1, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 2, TMP2
|
|
+ addl B, TMP2, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c10
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c14
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ ldi L, -2(TMP1)
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c05
|
|
+
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ fclr c02
|
|
+ fclr c06
|
|
+ ble TMP1, $L28
|
|
+
|
|
+ ble L, $L25
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ ADD c09, t1, c09
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD c10, t2, c10
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t3, c13
|
|
+ unop
|
|
+ MUL a1, b2, t3
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+
|
|
+ ADD c14, t4, c14
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD b2, -7 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ unop
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ LD b3, -6 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, c05
|
|
+ unop
|
|
+ MUL a1, b4, t3
|
|
+ LD a1, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, c06
|
|
+ MUL a2, b4, t4
|
|
+ LD b5, -5 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t1, c09
|
|
+ unop
|
|
+ MUL a3, b1, t1
|
|
+ LD a2, 3 * SIZE(AO)
|
|
+
|
|
+ ADD c10, t2, c10
|
|
+ unop
|
|
+ MUL a4, b1, t2
|
|
+ LD b1, -4 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t3, c13
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD c14, t4, c14
|
|
+ MUL a4, b2, t4
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ ldi L, -2(L)
|
|
+ MUL a3, b3, t1
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a4, b3, t2
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, c05
|
|
+ unop
|
|
+ MUL a3, b5, t3
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, c06
|
|
+ MUL a4, b5, t4
|
|
+ LD a4, 1 * SIZE(AO)
|
|
+ bgt L, $L22
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ ADD c09, t1, c09
|
|
+ MUL a1, b1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L27
|
|
+#else
|
|
+ blbs TMP1, $L27
|
|
+#endif
|
|
+
|
|
+ ADD c10, t2, c10
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t3, c13
|
|
+ unop
|
|
+ MUL a1, b2, t3
|
|
+ unop
|
|
+
|
|
+ ADD c14, t4, c14
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t3, c05
|
|
+ unop
|
|
+ MUL a1, b4, t3
|
|
+ LD a1, -2 * SIZE(AO)
|
|
+
|
|
+ ADD c06, t4, c06
|
|
+ unop
|
|
+ MUL a2, b4, t4
|
|
+ LD a2, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c09, t1, c09
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L27:
|
|
+ ADD c10, t2, c10
|
|
+ MUL a2, b1, t2
|
|
+ ADD c13, t3, c13
|
|
+ MUL a1, b2, t3
|
|
+
|
|
+ ADD c14, t4, c14
|
|
+ MUL a2, b2, t4
|
|
+ ADD c01, t1, c01
|
|
+ MUL a1, b3, t1
|
|
+
|
|
+ ADD c02, t2, c02
|
|
+ MUL a2, b3, t2
|
|
+ ADD c05, t3, c05
|
|
+ MUL a1, b4, t3
|
|
+
|
|
+ ADD c06, t4, c06
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a2, b4, t4
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t1, c09
|
|
+ ADD c10, t2, c10
|
|
+ ADD c13, t3, c13
|
|
+ ADD c14, t4, c14
|
|
+	.align 4
+
+$L28:
+#if defined(LN) || defined(RT)
+#ifdef LN
+	subl	KK, 2, TMP1
+#else
+	subl	KK, 4, TMP1
+#endif
+	sll	TMP1, BASE_SHIFT + 1, TMP2
+	addl	AORIG, TMP2, AO
+	sll	TMP1, BASE_SHIFT + 2, TMP2
+	addl	B, TMP2, BO
+#else
+	ldi	AO, -2 * SIZE(AO)
+	ldi	BO, -4 * SIZE(BO)
+#endif
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+ LD b4, 7 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c05, c05
|
|
+ SUB a3, c09, c09
|
|
+ SUB a4, c13, c13
|
|
+
|
|
+ SUB b1, c02, c02
|
|
+ SUB b2, c06, c06
|
|
+ SUB b3, c10, c10
|
|
+ SUB b4, c14, c14
|
|
+
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ LD b1, 4 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(AO)
|
|
+ LD b4, 7 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+ SUB a3, c05, c05
|
|
+ SUB a4, c06, c06
|
|
+
|
|
+ SUB b1, c09, c09
|
|
+ SUB b2, c10, c10
|
|
+ SUB b3, c13, c13
|
|
+ SUB b4, c14, c14
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 3 * SIZE(AO)
|
|
+ LD a2, 2 * SIZE(AO)
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c06, c06
|
|
+ MUL a1, c10, c10
|
|
+ MUL a1, c14, c14
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c06, t2
|
|
+ MUL a2, c10, t3
|
|
+ MUL a2, c14, t4
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c05, t2, c05
|
|
+ SUB c09, t3, c09
|
|
+ SUB c13, t4, c13
|
|
+
|
|
+ MUL a3, c01, c01
|
|
+ MUL a3, c05, c05
|
|
+ MUL a3, c09, c09
|
|
+ MUL a3, c13, c13
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c05, c05
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c13, c13
|
|
+
|
|
+ MUL a2, c01, t1
|
|
+ MUL a2, c05, t2
|
|
+ MUL a2, c09, t3
|
|
+ MUL a2, c13, t4
|
|
+
|
|
+ SUB c02, t1, c02
|
|
+ SUB c06, t2, c06
|
|
+ SUB c10, t3, c10
|
|
+ SUB c14, t4, c14
|
|
+
|
|
+ MUL a3, c02, c02
|
|
+ MUL a3, c06, c06
|
|
+ MUL a3, c10, c10
|
|
+ MUL a3, c14, c14
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+
|
|
+ MUL a2, c01, t1
|
|
+ MUL a2, c02, t2
|
|
+
|
|
+ SUB c05, t1, c05
|
|
+ SUB c06, t2, c06
|
|
+
|
|
+ MUL a3, c01, t1
|
|
+ MUL a3, c02, t2
|
|
+
|
|
+ SUB c09, t1, c09
|
|
+ SUB c10, t2, c10
|
|
+
|
|
+ MUL a4, c01, t1
|
|
+ MUL a4, c02, t2
|
|
+
|
|
+ SUB c13, t1, c13
|
|
+ SUB c14, t2, c14
|
|
+
|
|
+ LD b1, 5 * SIZE(BO)
|
|
+ LD b2, 6 * SIZE(BO)
|
|
+ LD b3, 7 * SIZE(BO)
|
|
+
|
|
+ MUL b1, c05, c05
|
|
+ MUL b1, c06, c06
|
|
+
|
|
+ MUL b2, c05, t1
|
|
+ MUL b2, c06, t2
|
|
+
|
|
+ SUB c09, t1, c09
|
|
+ SUB c10, t2, c10
|
|
+
|
|
+ MUL b3, c05, t1
|
|
+ MUL b3, c06, t2
|
|
+
|
|
+ SUB c13, t1, c13
|
|
+ SUB c14, t2, c14
|
|
+
|
|
+ LD a1, 10 * SIZE(BO)
|
|
+ LD a2, 11 * SIZE(BO)
|
|
+ LD a3, 15 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c10, c10
|
|
+
|
|
+ MUL a2, c09, t1
|
|
+ MUL a2, c10, t2
|
|
+
|
|
+ SUB c13, t1, c13
|
|
+ SUB c14, t2, c14
|
|
+
|
|
+ MUL a3, c13, c13
|
|
+ MUL a3, c14, c14
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 15 * SIZE(BO)
|
|
+ LD a2, 14 * SIZE(BO)
|
|
+ LD a3, 13 * SIZE(BO)
|
|
+ LD a4, 12 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c13, c13
|
|
+ MUL a1, c14, c14
|
|
+
|
|
+ MUL a2, c13, t1
|
|
+ MUL a2, c14, t2
|
|
+
|
|
+ SUB c09, t1, c09
|
|
+ SUB c10, t2, c10
|
|
+
|
|
+ MUL a3, c13, t1
|
|
+ MUL a3, c14, t2
|
|
+
|
|
+ SUB c05, t1, c05
|
|
+ SUB c06, t2, c06
|
|
+
|
|
+ MUL a4, c13, t1
|
|
+ MUL a4, c14, t2
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c02, t2, c02
|
|
+
|
|
+ LD b1, 10 * SIZE(BO)
|
|
+ LD b2, 9 * SIZE(BO)
|
|
+ LD b3, 8 * SIZE(BO)
|
|
+
|
|
+ MUL b1, c09, c09
|
|
+ MUL b1, c10, c10
|
|
+
|
|
+ MUL b2, c09, t1
|
|
+ MUL b2, c10, t2
|
|
+
|
|
+ SUB c05, t1, c05
|
|
+ SUB c06, t2, c06
|
|
+
|
|
+ MUL b3, c09, t1
|
|
+ MUL b3, c10, t2
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c02, t2, c02
|
|
+
|
|
+ LD a1, 5 * SIZE(BO)
|
|
+ LD a2, 4 * SIZE(BO)
|
|
+ LD a3, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c05, c05
|
|
+ MUL a1, c06, c06
|
|
+
|
|
+ MUL a2, c05, t1
|
|
+ MUL a2, c06, t2
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c02, t2, c02
|
|
+
|
|
+ MUL a3, c01, c01
|
|
+ MUL a3, c02, c02
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c05, 1 * SIZE(BO)
|
|
+ ST c09, 2 * SIZE(BO)
|
|
+ ST c13, 3 * SIZE(BO)
|
|
+
|
|
+ ST c02, 4 * SIZE(BO)
|
|
+ ST c06, 5 * SIZE(BO)
|
|
+ ST c10, 6 * SIZE(BO)
|
|
+ ST c14, 7 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c05, 2 * SIZE(AO)
|
|
+ ST c06, 3 * SIZE(AO)
|
|
+
|
|
+ ST c09, 4 * SIZE(AO)
|
|
+ ST c10, 5 * SIZE(AO)
|
|
+ ST c13, 6 * SIZE(AO)
|
|
+ ST c14, 7 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -2 * SIZE(C1)
|
|
+ ldi C2, -2 * SIZE(C2)
|
|
+ ldi C3, -2 * SIZE(C3)
|
|
+ ldi C4, -2 * SIZE(C4)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+ ST c06, 1 * SIZE(C2)
|
|
+
|
|
+ ST c09, 0 * SIZE(C3)
|
|
+ ST c10, 1 * SIZE(C3)
|
|
+ ST c13, 0 * SIZE(C4)
|
|
+ ST c14, 1 * SIZE(C4)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 2 * SIZE(C1)
|
|
+ ldi C2, 2 * SIZE(C2)
|
|
+ ldi C3, 2 * SIZE(C3)
|
|
+ ldi C4, 2 * SIZE(C4)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 1 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 1, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+	.align 4
+
+$L30:
+	and	M, 1, I
+	ble	I, $L39
+
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ ldi L, -2(KK)
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c09
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c13
|
|
+
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+ ble KK, $L38
|
|
+
|
|
+ ble L, $L35
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, BASE_SHIFT + 0, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, BASE_SHIFT + 0, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, BASE_SHIFT + 2, TMP2
|
|
+ addl B, TMP2, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c01
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ ldi L, -2(TMP1)
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c09
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c13
|
|
+
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ ble TMP1, $L38
|
|
+
|
|
+ ble L, $L35
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L32:
|
|
+ ADD c01, t1, c01
|
|
+ ldi L, -2(L)
|
|
+ MUL a1, b1, t1
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t2, c05
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a1, b2, t2
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t3, c09
|
|
+ LD b5, 3 * SIZE(BO)
|
|
+ MUL a1, b3, t3
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t4, c13
|
|
+ MUL a1, b4, t4
|
|
+ LD a1, -1 * SIZE(AO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ MUL a2, b1, t1
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+
|
|
+ ADD c05, t2, c05
|
|
+ MUL a2, b2, t2
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t3, c09
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+ MUL a2, b3, t3
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t4, c13
|
|
+ MUL a2, b5, t4
|
|
+ LD a2, 0 * SIZE(AO)
|
|
+ bgt L, $L32
|
|
+ .align 4
|
|
+
|
|
+$L35:
|
|
+ ADD c01, t1, c01
|
|
+ MUL a1, b1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L37
|
|
+#else
|
|
+ blbs TMP1, $L37
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD c05, t2, c05
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ MUL a1, b2, t2
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD c09, t3, c09
|
|
+ MUL a1, b3, t3
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD c13, t4, c13
|
|
+ MUL a1, b4, t4
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L37:
|
|
+ ADD c05, t2, c05
|
|
+ MUL a1, b2, t2
|
|
+ ADD c09, t3, c09
|
|
+ MUL a1, b3, t3
|
|
+
|
|
+ ADD c13, t4, c13
|
|
+ ldi AO, 1 * SIZE(AO)
|
|
+ MUL a1, b4, t4
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD c01, t1, c01
|
|
+ ADD c05, t2, c05
|
|
+ ADD c09, t3, c09
|
|
+ ADD c13, t4, c13
|
|
+
|
|
+$L38:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 1, TMP1
|
|
+#else
|
|
+ subl KK, 4, TMP1
|
|
+#endif
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -1 * SIZE(AO)
|
|
+ ldi BO, -4 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c05, c05
|
|
+ SUB a3, c09, c09
|
|
+ SUB a4, c13, c13
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c05, c05
|
|
+ SUB a3, c09, c09
|
|
+ SUB a4, c13, c13
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c05, c05
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c13, c13
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a2, c01, t1
|
|
+ SUB c05, t1, c05
|
|
+ MUL a3, c01, t1
|
|
+ SUB c09, t1, c09
|
|
+ MUL a4, c01, t1
|
|
+ SUB c13, t1, c13
|
|
+
|
|
+ LD b1, 5 * SIZE(BO)
|
|
+ LD b2, 6 * SIZE(BO)
|
|
+ LD b3, 7 * SIZE(BO)
|
|
+
|
|
+ MUL b1, c05, c05
|
|
+ MUL b2, c05, t1
|
|
+ SUB c09, t1, c09
|
|
+ MUL b3, c05, t1
|
|
+ SUB c13, t1, c13
|
|
+
|
|
+ LD a1, 10 * SIZE(BO)
|
|
+ LD a2, 11 * SIZE(BO)
|
|
+ LD a3, 15 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c09, c09
|
|
+ MUL a2, c09, t1
|
|
+ SUB c13, t1, c13
|
|
+ MUL a3, c13, c13
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 15 * SIZE(BO)
|
|
+ LD a2, 14 * SIZE(BO)
|
|
+ LD a3, 13 * SIZE(BO)
|
|
+ LD a4, 12 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c13, c13
|
|
+ MUL a2, c13, t1
|
|
+ SUB c09, t1, c09
|
|
+ MUL a3, c13, t1
|
|
+ SUB c05, t1, c05
|
|
+ MUL a4, c13, t1
|
|
+ SUB c01, t1, c01
|
|
+
|
|
+ LD b1, 10 * SIZE(BO)
|
|
+ LD b2, 9 * SIZE(BO)
|
|
+ LD b3, 8 * SIZE(BO)
|
|
+
|
|
+ MUL b1, c09, c09
|
|
+ MUL b2, c09, t1
|
|
+ SUB c05, t1, c05
|
|
+ MUL b3, c09, t1
|
|
+ SUB c01, t1, c01
|
|
+
|
|
+ LD a1, 5 * SIZE(BO)
|
|
+ LD a2, 4 * SIZE(BO)
|
|
+ LD a3, 0 * SIZE(BO)
|
|
+
|
|
+ MUL a1, c05, c05
|
|
+ MUL a2, c05, t1
|
|
+ SUB c01, t1, c01
|
|
+ MUL a3, c01, c01
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c05, 1 * SIZE(BO)
|
|
+ ST c09, 2 * SIZE(BO)
|
|
+ ST c13, 3 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c05, 1 * SIZE(AO)
|
|
+ ST c09, 2 * SIZE(AO)
|
|
+ ST c13, 3 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -1 * SIZE(C1)
|
|
+ ldi C2, -1 * SIZE(C2)
|
|
+ ldi C3, -1 * SIZE(C3)
|
|
+ ldi C4, -1 * SIZE(C4)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c05, 0 * SIZE(C2)
|
|
+ ST c09, 0 * SIZE(C3)
|
|
+ ST c13, 0 * SIZE(C4)
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, 0 + BASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, BASE_SHIFT + 0, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, BASE_SHIFT + 2, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L39:
+#ifdef LN
+	sll	K, 2 + BASE_SHIFT, TMP1
+	addl	B, TMP1, B
+#endif
+
+#if defined(LT) || defined(RN)
+	mov	BO, B
+#endif
+
+#ifdef RN
+	addl	KK, 4, KK
+#endif
+
+#ifdef RT
+	subl	KK, 4, KK
+#endif
+	ldi	J, -1(J)
+	bgt	J, $L01
+	.align 4
+
+$L999:
+	fldd	$f2, 0($sp)
+	fldd	$f3, 8($sp)
+	fldd	$f4, 16($sp)
+	fldd	$f5, 24($sp)
+	fldd	$f6, 32($sp)
+	fldd	$f7, 40($sp)
+	fldd	$f8, 48($sp)
+	fldd	$f9, 56($sp)
+	clr	$0
+	ldi	$sp, STACKSIZE($sp)
+	ret
+	EPILOGUE
diff --git a/kernel/sw_64/zamax.S b/kernel/sw_64/zamax.S
new file mode 100644
index 0000000..c453e9d
--- /dev/null
+++ b/kernel/sw_64/zamax.S
@@ -0,0 +1,302 @@
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define N	$16
+#define X	$17
+#define INCX	$18
+
+#ifndef USE_MIN
+#define CMPLT(a, b)	fcmplt	a, b
+#else
+#define CMPLT(a, b)	fcmplt	b, a
+#endif
+
+#define STACKSIZE	8 * 8
+
+	PROLOGUE
+	PROFCODE
+	.frame	$sp, STACKSIZE, $26, 0
+
+	ldi	$sp, -STACKSIZE($sp)
+
+ fstd $f2, 0($sp)
|
|
+ fclr $f16
|
|
+ cmplt $31, N, $2
|
|
+
|
|
+ fstd $f3, 8($sp)
|
|
+ fclr $f17
|
|
+ cmplt $31, INCX, $3
|
|
+ unop
|
|
+
|
|
+ fstd $f4, 16($sp)
|
|
+ fclr $f18
|
|
+ SXADDQ INCX, $31, INCX
|
|
+ unop
|
|
+
|
|
+ fstd $f5, 24($sp)
|
|
+ fclr $f19
|
|
+ and $2, $3, $0
|
|
+ unop
|
|
+
|
|
+ fstd $f6, 32($sp)
|
|
+ unop
|
|
+
|
|
+ fstd $f7, 40($sp)
|
|
+ fstd $f8, 48($sp)
|
|
+ fstd $f9, 56($sp)
|
|
+
|
|
+ fclr $f0
|
|
+ beq $0, $End # if (n <= 0) or (incx <= 0) return
|
|
+ .align 4
|
|
+
|
|
+ LD $f20, 0 * SIZE(X)
|
|
+ LD $f21, 1 * SIZE(X)
|
|
+ sra N, 2, $1
|
|
+ addl INCX, INCX, INCX
|
|
+
|
|
+ fabs $f20, $f20
|
|
+ fabs $f21, $f21
|
|
+ faddd $f20, $f21, $f0
|
|
+ ble $1, $L15
|
|
+ .align 4
|
|
+
|
|
+ ldi $1, -1($1)
|
|
+ unop
|
|
+ addl X, INCX, X
|
|
+ unop
|
|
+
|
|
+ LD $f22, 0 * SIZE(X)
|
|
+ fmov $f0, $f1
|
|
+ LD $f23, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD $f24, 0 * SIZE(X)
|
|
+ fmov $f0, $f2
|
|
+ LD $f25, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD $f26, 0 * SIZE(X)
|
|
+ fmov $f0, $f3
|
|
+ LD $f27, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fabs $f20, $f8
|
|
+ fabs $f21, $f9
|
|
+ fabs $f22, $f10
|
|
+ fabs $f23, $f11
|
|
+
|
|
+ fabs $f24, $f12
|
|
+ fabs $f25, $f13
|
|
+ fabs $f26, $f14
|
|
+ fabs $f27, $f15
|
|
+
|
|
+ ble $1, $L14
|
|
+ .align 4
|
|
+
|
|
+ LD $f20, 0 * SIZE(X)
|
|
+ LD $f21, 1 * SIZE(X)
|
|
+ ldi $1, -1($1)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD $f22, 0 * SIZE(X)
|
|
+ LD $f23, 1 * SIZE(X)
|
|
+ unop
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD $f24, 0 * SIZE(X)
|
|
+ LD $f25, 1 * SIZE(X)
|
|
+ unop
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD $f26, 0 * SIZE(X)
|
|
+ LD $f27, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ ble $1, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ faddd $f8, $f9, $f16
|
|
+ unop
|
|
+ fabs $f20, $f8
|
|
+ fillcs 64 * SIZE(X)
|
|
+
|
|
+ faddd $f10, $f11, $f17
|
|
+ unop
|
|
+ fabs $f21, $f9
|
|
+ LD $f20, 0 * SIZE(X)
|
|
+
|
|
+ faddd $f12, $f13, $f18
|
|
+ LD $f21, 1 * SIZE(X)
|
|
+ fabs $f22, $f10
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd $f14, $f15, $f19
|
|
+ LD $f22, 0 * SIZE(X)
|
|
+ fabs $f23, $f11
|
|
+ unop
|
|
+
|
|
+ CMPLT($f0, $f16), $f4
|
|
+ LD $f23, 1 * SIZE(X)
|
|
+ fabs $f24, $f12
|
|
+ addl X, INCX, X
|
|
+
|
|
+ CMPLT($f1, $f17), $f5
|
|
+ LD $f24, 0 * SIZE(X)
|
|
+ fabs $f25, $f13
|
|
+ unop
|
|
+
|
|
+ CMPLT($f2, $f18), $f6
|
|
+ LD $f25, 1 * SIZE(X)
|
|
+ fabs $f26, $f14
|
|
+ addl X, INCX, X
|
|
+
|
|
+ CMPLT($f3, $f19), $f7
|
|
+ LD $f26, 0 * SIZE(X)
|
|
+ fabs $f27, $f15
|
|
+ unop
|
|
+
|
|
+ fselne $f4, $f16, $f0, $f0
|
|
+ LD $f27, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ ldi $1, -1($1) # i --
|
|
+
|
|
+ fselne $f5, $f17, $f1, $f1
|
|
+ fselne $f6, $f18, $f2, $f2
|
|
+ fselne $f7, $f19, $f3, $f3
|
|
+ bgt $1,$L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ faddd $f8, $f9, $f16
|
|
+ fabs $f20, $f8
|
|
+
|
|
+ faddd $f10, $f11, $f17
|
|
+ fabs $f21, $f9
|
|
+
|
|
+ faddd $f12, $f13, $f18
|
|
+ fabs $f22, $f10
|
|
+
|
|
+ faddd $f14, $f15, $f19
|
|
+ fabs $f23, $f11
|
|
+
|
|
+ CMPLT($f0, $f16), $f4
|
|
+ fabs $f24, $f12
|
|
+
|
|
+ CMPLT($f1, $f17), $f5
|
|
+ fabs $f25, $f13
|
|
+
|
|
+ CMPLT($f2, $f18), $f6
|
|
+ fabs $f26, $f14
|
|
+ CMPLT($f3, $f19), $f7
|
|
+ fabs $f27, $f15
|
|
+
|
|
+ fselne $f4, $f16, $f0, $f0
|
|
+ fselne $f5, $f17, $f1, $f1
|
|
+ fselne $f6, $f18, $f2, $f2
|
|
+ fselne $f7, $f19, $f3, $f3
|
|
+ .align 4
|
|
+
|
|
+$L14:
|
|
+ faddd $f8, $f9, $f16
|
|
+ faddd $f10, $f11, $f17
|
|
+ faddd $f12, $f13, $f18
|
|
+ faddd $f14, $f15, $f19
|
|
+
|
|
+ CMPLT($f0, $f16), $f4
|
|
+ CMPLT($f1, $f17), $f5
|
|
+ CMPLT($f2, $f18), $f6
|
|
+ CMPLT($f3, $f19), $f7
|
|
+
|
|
+ fselne $f4, $f16, $f0, $f0
|
|
+ fselne $f5, $f17, $f1, $f1
|
|
+ fselne $f6, $f18, $f2, $f2
|
|
+ fselne $f7, $f19, $f3, $f3
|
|
+
|
|
+ CMPLT($f0, $f1), $f16
|
|
+ CMPLT($f2, $f3), $f17
|
|
+
|
|
+ fselne $f16, $f1, $f0, $f0
|
|
+ fselne $f17, $f3, $f2, $f2
|
|
+
|
|
+ CMPLT($f0, $f2), $f16
|
|
+ fselne $f16, $f2, $f0, $f0
|
|
+ .align 4
|
|
+
|
|
+$L15:
+	and	N, 3, $1
+	unop
+	unop
+	ble	$1, $End
+	.align 4
+
+$L16:
+	LD	$f20, 0 * SIZE(X)
+	LD	$f21, 1 * SIZE(X)
+	unop
+	addl	X, INCX, X
+
+	fabs	$f20, $f29
+	fabs	$f21, $f30
+	faddd	$f29, $f30, $f20
+	fmov	$f20,$f29
+
+	CMPLT($f0, $f29), $f16
+	fselne	$f16, $f29, $f0, $f0
+
+	ldi	$1, -1($1) # i --
+	bgt	$1, $L16
+	.align 4
+
+$End:
+	fldd	$f2, 0($sp)
+	fldd	$f3, 8($sp)
+	fldd	$f4, 16($sp)
+	fldd	$f5, 24($sp)
+
+	fldd	$f6, 32($sp)
+	fldd	$f7, 40($sp)
+	fldd	$f8, 48($sp)
+	fldd	$f9, 56($sp)
+	ldi	$sp, STACKSIZE($sp)
+	ret
+
+	EPILOGUE
diff --git a/kernel/sw_64/zamax.S.bak b/kernel/sw_64/zamax.S.bak
new file mode 100644
index 0000000..74b9331
--- /dev/null
+++ b/kernel/sw_64/zamax.S.bak
@@ -0,0 +1,301 @@
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+
|
|
+#ifndef USE_MIN
|
|
+#define CMPLT(a, b) fcmplt a, b
|
|
+#else
|
|
+#define CMPLT(a, b) fcmplt b, a
|
|
+#endif
|
|
+
|
|
+#define STACKSIZE 8 * 8
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+ .frame $sp, STACKSIZE, $26, 0
|
|
+
|
|
+ ldi $sp, -STACKSIZE($sp)
|
|
+
|
|
+ fstd $f2, 0($sp)
|
|
+ fclr $f16
|
|
+ cmplt $31, N, $2
|
|
+
|
|
+ fstd $f3, 8($sp)
|
|
+ fclr $f17
|
|
+ cmplt $31, INCX, $3
|
|
+ unop
|
|
+
|
|
+ fstd $f4, 16($sp)
|
|
+ fclr $f18
|
|
+ SXADDQ INCX, $31, INCX
|
|
+ unop
|
|
+
|
|
+ fstd $f5, 24($sp)
|
|
+ fclr $f19
|
|
+ and $2, $3, $0
|
|
+ unop
|
|
+
|
|
+ fstd $f6, 32($sp)
|
|
+ unop
|
|
+
|
|
+ fstd $f7, 40($sp)
|
|
+ fstd $f8, 48($sp)
|
|
+ fstd $f9, 56($sp)
|
|
+
|
|
+ fclr $f0
|
|
+ beq $0, $End # if (n <= 0) or (incx <= 0) return
|
|
+ .align 4
|
|
+
|
|
+ LD $f20, 0 * SIZE(X)
|
|
+ LD $f21, 1 * SIZE(X)
|
|
+ sra N, 2, $1
|
|
+ addl INCX, INCX, INCX
|
|
+
|
|
+ fabs $f20, $f20
|
|
+ fabs $f21, $f21
|
|
+ faddd $f20, $f21, $f0
|
|
+ ble $1, $L15
|
|
+ .align 4
|
|
+
|
|
+ ldi $1, -1($1)
|
|
+ unop
|
|
+ addl X, INCX, X
|
|
+ unop
|
|
+
|
|
+ LD $f22, 0 * SIZE(X)
|
|
+ fmov $f0, $f1
|
|
+ LD $f23, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD $f24, 0 * SIZE(X)
|
|
+ fmov $f0, $f2
|
|
+ LD $f25, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD $f26, 0 * SIZE(X)
|
|
+ fmov $f0, $f3
|
|
+ LD $f27, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fabs $f20, $f8
|
|
+ fabs $f21, $f9
|
|
+ fabs $f22, $f10
|
|
+ fabs $f23, $f11
|
|
+
|
|
+ fabs $f24, $f12
|
|
+ fabs $f25, $f13
|
|
+ fabs $f26, $f14
|
|
+ fabs $f27, $f15
|
|
+
|
|
+ ble $1, $L14
|
|
+ .align 4
|
|
+
|
|
+ LD $f20, 0 * SIZE(X)
|
|
+ LD $f21, 1 * SIZE(X)
|
|
+ ldi $1, -1($1)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD $f22, 0 * SIZE(X)
|
|
+ LD $f23, 1 * SIZE(X)
|
|
+ unop
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD $f24, 0 * SIZE(X)
|
|
+ LD $f25, 1 * SIZE(X)
|
|
+ unop
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD $f26, 0 * SIZE(X)
|
|
+ LD $f27, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ ble $1, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ faddd $f8, $f9, $f16
|
|
+ unop
|
|
+ fabs $f20, $f8
|
|
+ fillcs 64 * SIZE(X)
|
|
+
|
|
+ faddd $f10, $f11, $f17
|
|
+ unop
|
|
+ fabs $f21, $f9
|
|
+ LD $f20, 0 * SIZE(X)
|
|
+
|
|
+ faddd $f12, $f13, $f18
|
|
+ LD $f21, 1 * SIZE(X)
|
|
+ fabs $f22, $f10
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd $f14, $f15, $f19
|
|
+ LD $f22, 0 * SIZE(X)
|
|
+ fabs $f23, $f11
|
|
+ unop
|
|
+
|
|
+ CMPLT($f0, $f16), $f4
|
|
+ LD $f23, 1 * SIZE(X)
|
|
+ fabs $f24, $f12
|
|
+ addl X, INCX, X
|
|
+
|
|
+ CMPLT($f1, $f17), $f5
|
|
+ LD $f24, 0 * SIZE(X)
|
|
+ fabs $f25, $f13
|
|
+ unop
|
|
+
|
|
+ CMPLT($f2, $f18), $f6
|
|
+ LD $f25, 1 * SIZE(X)
|
|
+ fabs $f26, $f14
|
|
+ addl X, INCX, X
|
|
+
|
|
+ CMPLT($f3, $f19), $f7
|
|
+ LD $f26, 0 * SIZE(X)
|
|
+ fabs $f27, $f15
|
|
+ unop
|
|
+
|
|
+ fselne $f4, $f16, $f0, $f0
|
|
+ LD $f27, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ ldi $1, -1($1) # i --
|
|
+
|
|
+ fselne $f5, $f17, $f1, $f1
|
|
+ fselne $f6, $f18, $f2, $f2
|
|
+ fselne $f7, $f19, $f3, $f3
|
|
+ bgt $1,$L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ faddd $f8, $f9, $f16
|
|
+ fabs $f20, $f8
|
|
+
|
|
+ faddd $f10, $f11, $f17
|
|
+ fabs $f21, $f9
|
|
+
|
|
+ faddd $f12, $f13, $f18
|
|
+ fabs $f22, $f10
|
|
+
|
|
+ faddd $f14, $f15, $f19
|
|
+ fabs $f23, $f11
|
|
+
|
|
+ CMPLT($f0, $f16), $f4
|
|
+ fabs $f24, $f12
|
|
+
|
|
+ CMPLT($f1, $f17), $f5
|
|
+ fabs $f25, $f13
|
|
+
|
|
+ CMPLT($f2, $f18), $f6
|
|
+ fabs $f26, $f14
|
|
+ CMPLT($f3, $f19), $f7
|
|
+ fabs $f27, $f15
|
|
+
|
|
+ fselne $f4, $f16, $f0, $f0
|
|
+ fselne $f5, $f17, $f1, $f1
|
|
+ fselne $f6, $f18, $f2, $f2
|
|
+ fselne $f7, $f19, $f3, $f3
|
|
+ .align 4
|
|
+
|
|
+$L14:
|
|
+ faddd $f8, $f9, $f16
|
|
+ faddd $f10, $f11, $f17
|
|
+ faddd $f12, $f13, $f18
|
|
+ faddd $f14, $f15, $f19
|
|
+
|
|
+ CMPLT($f0, $f16), $f4
|
|
+ CMPLT($f1, $f17), $f5
|
|
+ CMPLT($f2, $f18), $f6
|
|
+ CMPLT($f3, $f19), $f7
|
|
+
|
|
+ fselne $f4, $f16, $f0, $f0
|
|
+ fselne $f5, $f17, $f1, $f1
|
|
+ fselne $f6, $f18, $f2, $f2
|
|
+ fselne $f7, $f19, $f3, $f3
|
|
+
|
|
+ CMPLT($f0, $f1), $f16
|
|
+ CMPLT($f2, $f3), $f17
|
|
+
|
|
+ fselne $f16, $f1, $f0, $f0
|
|
+ fselne $f17, $f3, $f2, $f2
|
|
+
|
|
+ CMPLT($f0, $f2), $f16
|
|
+ fselne $f16, $f2, $f0, $f0
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and N, 3, $1
|
|
+ unop
|
|
+ unop
|
|
+ ble $1, $End
|
|
+ .align 4
|
|
+
|
|
+$L16:
|
|
+ LD $f20, 0 * SIZE(X)
|
|
+ LD $f21, 1 * SIZE(X)
|
|
+ unop
|
|
+ addl X, INCX, X
|
|
+
|
|
+ fabs $f20, $f29
|
|
+ fabs $f21, $f30
|
|
+ faddd $f29, $f30, $f29
|
|
+
|
|
+ CMPLT($f0, $f29), $f16
|
|
+ fselne $f16, $f29, $f0, $f0
|
|
+
|
|
+ ldi $1, -1($1) # i --
|
|
+ bgt $1, $L16
|
|
+ .align 4
|
|
+
|
|
+$End:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 8($sp)
|
|
+ fldd $f4, 16($sp)
|
|
+ fldd $f5, 24($sp)
|
|
+
|
|
+ fldd $f6, 32($sp)
|
|
+ fldd $f7, 40($sp)
|
|
+ fldd $f8, 48($sp)
|
|
+ fldd $f9, 56($sp)
|
|
+ ldi $sp, STACKSIZE($sp)
|
|
+ ret
|
|
+
|
|
+ EPILOGUE
|
|
diff --git a/kernel/sw_64/zasum.S b/kernel/sw_64/zasum.S
|
|
new file mode 100644
|
|
index 0000000..72e120c
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/zasum.S
|
|
@@ -0,0 +1,231 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCHSIZE 88
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+#define I $19
|
|
+
|
|
+#define s0 $f0
|
|
+#define s1 $f1
|
|
+#define s2 $f10
|
|
+#define s3 $f11
|
|
+
|
|
+#define a0 $f12
|
|
+#define a1 $f13
|
|
+#define a2 $f14
|
|
+#define a3 $f15
|
|
+#define a4 $f16
|
|
+#define a5 $f17
|
|
+#define a6 $f18
|
|
+#define a7 $f19
|
|
+
|
|
+#define t0 $f20
|
|
+#define t1 $f21
|
|
+#define t2 $f22
|
|
+#define t3 $f23
|
|
+#define t4 $f24
|
|
+#define s4 $f27
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+
|
|
+ fclr s0
|
|
+ unop
|
|
+ fclr t0
|
|
+ addl INCX, INCX, INCX
|
|
+
|
|
+ fclr s1
|
|
+ unop
|
|
+ fclr t1
|
|
+ ble N, $L999
|
|
+
|
|
+ fclr s2
|
|
+ sra N, 2, I
|
|
+ fclr s3
|
|
+ ble I, $L15
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ fclr t2
|
|
+ LD a1, 1 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ fclr t3
|
|
+ LD a3, 1 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ LD a5, 1 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ble I, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ ADD s0, t0, s4
|
|
+ fmov s4,s0
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ fabs a0, t0
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ADD s1, t1, s4
|
|
+ fmov s4,s1
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ fabs a1, t1
|
|
+ unop
|
|
+
|
|
+ ADD s2, t2, s4
|
|
+ fmov s4,s2
|
|
+ LD a7, 1 * SIZE(X)
|
|
+ fabs a2, t2
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s3, t3, s4
|
|
+ fmov s4,s3
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ fabs a3, t3
|
|
+ unop
|
|
+
|
|
+ ADD s0, t0, s4
|
|
+ fmov s4,s0
|
|
+ LD a1, 1 * SIZE(X)
|
|
+ fabs a4, t0
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s1, t1, s4
|
|
+ fmov s4,s1
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ fabs a5, t1
|
|
+ unop
|
|
+
|
|
+ ADD s2, t2, s4
|
|
+ fmov s4,s2
|
|
+ LD a3, 1 * SIZE(X)
|
|
+ fabs a6, t2
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s3, t3, s4
|
|
+ fmov s4,s3
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ fabs a7, t3
|
|
+ unop
|
|
+
|
|
+ LD a5, 1 * SIZE(X)
|
|
+ unop
|
|
+ SXADDQ INCX, X, X
|
|
+ bne I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ ADD s0, t0, s4
|
|
+ fmov s4,s0
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ fabs a0, t0
|
|
+
|
|
+ ADD s1, t1, s4
|
|
+ fmov s4,s1
|
|
+ LD a7, 1 * SIZE(X)
|
|
+ fabs a1, t1
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s2, t2, s4
|
|
+ fmov s4,s2
|
|
+ fabs a2, t2
|
|
+ ADD s3, t3, s4
|
|
+ fmov s4,s3
|
|
+ fabs a3, t3
|
|
+
|
|
+ ADD s0, t0, s4
|
|
+ fmov s4,s0
|
|
+ fabs a4, t0
|
|
+ ADD s1, t1, s4
|
|
+ fmov s4,s1
|
|
+ fabs a5, t1
|
|
+ ADD s2, t2, s4
|
|
+ fmov s4,s2
|
|
+ fabs a6, t2
|
|
+ ADD s3, t3, s4
|
|
+ fmov s4,s3
|
|
+ fabs a7, t3
|
|
+
|
|
+ ADD s2, t2, s4
|
|
+ fmov s4,s2
|
|
+ ADD s3, t3, s4
|
|
+ fmov s4,s3
|
|
+
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ ADD s0, s2, $f25
|
|
+ fmov $f25, s0
|
|
+ and N, 3, I
|
|
+ ADD s1, s3, $f25
|
|
+ fmov $f25, s1
|
|
+ ble I, $L999
|
|
+ .align 4
|
|
+
|
|
+$L17:
|
|
+ ADD s0, t0, $f25
|
|
+ fmov $f25, s0
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ fabs a0, t0
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ADD s1, t1, $f25
|
|
+ fmov $f25, s1
|
|
+ LD a1, 1 * SIZE(X)
|
|
+ fabs a1, t1
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ bne I, $L17
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ ADD s0, t0, $f25
|
|
+ ADD s1, t1, $f26
|
|
+
|
|
+ ADD $f25, $f26, s0
|
|
+ ret
|
|
+ EPILOGUE
|
|
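The zasum kernel above computes the sum of |Re(x_i)| + |Im(x_i)| over n strided complex elements, keeping four partial sums (s0..s3) in flight to hide the floating-point add latency and folding them together at the end. A C sketch of the same reduction, with illustrative names only:

    #include <math.h>

    /* Illustrative reference for the zasum kernel: sum of |re| + |im|
       over n complex elements with stride incx (in complex elements). */
    static double zasum_ref(long n, const double *x, long incx)
    {
        double s = 0.0;
        for (long i = 0; i < n; i++) {
            const double *p = x + 2 * i * incx;
            s += fabs(p[0]) + fabs(p[1]);
        }
        return s;
    }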
diff --git a/kernel/sw_64/zasum.S.bak b/kernel/sw_64/zasum.S.bak
|
|
new file mode 100644
|
|
index 0000000..db79771
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/zasum.S.bak
|
|
@@ -0,0 +1,208 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCHSIZE 88
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+#define I $19
|
|
+
|
|
+#define s0 $f0
|
|
+#define s1 $f1
|
|
+#define s2 $f10
|
|
+#define s3 $f11
|
|
+
|
|
+#define a0 $f12
|
|
+#define a1 $f13
|
|
+#define a2 $f14
|
|
+#define a3 $f15
|
|
+#define a4 $f16
|
|
+#define a5 $f17
|
|
+#define a6 $f18
|
|
+#define a7 $f19
|
|
+
|
|
+#define t0 $f20
|
|
+#define t1 $f21
|
|
+#define t2 $f22
|
|
+#define t3 $f23
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+
|
|
+ fclr s0
|
|
+ unop
|
|
+ fclr t0
|
|
+ addl INCX, INCX, INCX
|
|
+
|
|
+ fclr s1
|
|
+ unop
|
|
+ fclr t1
|
|
+ ble N, $L999
|
|
+
|
|
+ fclr s2
|
|
+ sra N, 2, I
|
|
+ fclr s3
|
|
+ ble I, $L15
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ fclr t2
|
|
+ LD a1, 1 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ fclr t3
|
|
+ LD a3, 1 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ LD a5, 1 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ble I, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ ADD s0, t0, s0
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ fabs a0, t0
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ fabs a1, t1
|
|
+ unop
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ LD a7, 1 * SIZE(X)
|
|
+ fabs a2, t2
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ fabs a3, t3
|
|
+ unop
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD a1, 1 * SIZE(X)
|
|
+ fabs a4, t0
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ fabs a5, t1
|
|
+ unop
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ LD a3, 1 * SIZE(X)
|
|
+ fabs a6, t2
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ fabs a7, t3
|
|
+ unop
|
|
+
|
|
+ LD a5, 1 * SIZE(X)
|
|
+ unop
|
|
+ SXADDQ INCX, X, X
|
|
+ bne I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ ADD s0, t0, s0
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ fabs a0, t0
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ LD a7, 1 * SIZE(X)
|
|
+ fabs a1, t1
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ fabs a2, t2
|
|
+ ADD s3, t3, s3
|
|
+ fabs a3, t3
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ fabs a4, t0
|
|
+ ADD s1, t1, s1
|
|
+ fabs a5, t1
|
|
+ ADD s2, t2, s2
|
|
+ fabs a6, t2
|
|
+ ADD s3, t3, s3
|
|
+ fabs a7, t3
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ ADD s3, t3, s3
|
|
+
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ ADD s0, s2, s0
|
|
+ and N, 3, I
|
|
+ ADD s1, s3, s1
|
|
+ ble I, $L999
|
|
+ .align 4
|
|
+
|
|
+$L17:
|
|
+ ADD s0, t0, s0
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ fabs a0, t0
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ LD a1, 1 * SIZE(X)
|
|
+ fabs a1, t1
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ bne I, $L17
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ ADD s0, t0, s0
|
|
+ ADD s1, t1, s1
|
|
+
|
|
+ ADD s0, s1, s0
|
|
+ ret
|
|
+ EPILOGUE
|
|
diff --git a/kernel/sw_64/zasum_simd.S b/kernel/sw_64/zasum_simd.S
|
|
new file mode 100644
|
|
index 0000000..5606fdf
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/zasum_simd.S
|
|
@@ -0,0 +1,385 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCHSIZE 96
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+#define I $19
|
|
+
|
|
+#define s0 $f0
|
|
+#define s1 $f1
|
|
+#define s2 $f10
|
|
+#define s3 $f11
|
|
+
|
|
+#define a0 $f12
|
|
+#define a1 $f13
|
|
+#define a2 $f14
|
|
+#define a3 $f15
|
|
+#define a4 $f16
|
|
+#define a5 $f17
|
|
+#define a6 $f18
|
|
+#define a7 $f19
|
|
+
|
|
+#define t0 $f20
|
|
+#define t1 $f21
|
|
+#define t2 $f22
|
|
+#define t3 $f23
|
|
+
|
|
+#define t4 $f24
|
|
+#define t5 $f25
|
|
+#define t6 $f26
|
|
+#define t7 $f27
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+ .frame $sp, 16, $26, 0
|
|
+
|
|
+ fclr s0
|
|
+ unop
|
|
+ fclr t0
|
|
+ addl INCX, INCX, INCX
|
|
+
|
|
+ fclr s1
|
|
+ unop
|
|
+ fclr t1
|
|
+ ble N, $L999
|
|
+
|
|
+ cmpeq INCX, 2, $3
|
|
+ beq $3, $Sub
|
|
+ .align 4
|
|
+
|
|
+ and X, (VEC_LEN*SIZE-1), $6
|
|
+ bgt $6, $UnAlign_X_ACCESS
|
|
+ .align 4
|
|
+$Align_Access:
|
|
+
|
|
+/*
|
|
+ Unroll by 8 complex elements = 16 reals
|
|
+*/
|
|
+ sra N, 3, I
|
|
+ fclr s2
|
|
+ fclr s3
|
|
+ ble I, $Remain
|
|
+
|
|
+ VLD a0, 0*VEC_LEN*SIZE(X)
|
|
+ vcpys $f31, $f31, t0
|
|
+ VLD a1, 1*VEC_LEN*SIZE(X)
|
|
+ vcpys $f31, $f31, t1
|
|
+
|
|
+ VLD a2, 2*VEC_LEN*SIZE(X)
|
|
+ vcpys $f31, $f31, t2
|
|
+ VLD a3, 3*VEC_LEN*SIZE(X)
|
|
+ vcpys $f31, $f31, t3
|
|
+
|
|
+ subl I, 1, I
|
|
+ addl X, 16*SIZE, X
|
|
+ unop
|
|
+ ble I, $MainLoopEnd
|
|
+
|
|
+$MainLoop:
|
|
+ vcpys $f31, a0, a4
|
|
+ VLD a0, 0*VEC_LEN*SIZE(X)
|
|
+ vcpys $f31, a1, a5
|
|
+ VLD a1, 1*VEC_LEN*SIZE(X)
|
|
+
|
|
+ vcpys $f31, a2, a6
|
|
+ VLD a2, 2*VEC_LEN*SIZE(X)
|
|
+ vcpys $f31, a3, a7
|
|
+ VLD a3, 3*VEC_LEN*SIZE(X)
|
|
+
|
|
+ VADD t0, a4, t0
|
|
+ subl I, 1, I
|
|
+ VADD t1, a5, t1
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+
|
|
+ VADD t2, a6, t2
|
|
+ addl X, 16*SIZE, X
|
|
+ VADD t3, a7, t3
|
|
+ bgt I, $MainLoop
|
|
+
|
|
+$MainLoopEnd:
|
|
+ /*fabs*/
|
|
+
|
|
+ vcpys $f31, a0, a4
|
|
+ vcpys $f31, a1, a5
|
|
+ vcpys $f31, a2, a6
|
|
+ vcpys $f31, a3, a7
|
|
+
|
|
+ VADD t0, a4, t0
|
|
+ VADD t1, a5, t1
|
|
+ VADD t2, a6, t2
|
|
+ VADD t3, a7, t3
|
|
+
|
|
+ VADD t0, t1, t0
|
|
+ VADD t2, t3, t2
|
|
+ VADD t0, t2, t0
|
|
+ nop
|
|
+
|
|
+ vextf t0, 0, s0
|
|
+ vextf t0, 1, s1
|
|
+ vextf t0, 2, s2
|
|
+ vextf t0, 3, s3
|
|
+
|
|
+$Remain:
|
|
+ and N, 7, I
|
|
+ ADD s0, s2, s0
|
|
+ ADD s1, s3, s1
|
|
+ ble I, $End
|
|
+ .align 4
|
|
+
|
|
+$RemainLoop:
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ LD a1, 1 * SIZE(X)
|
|
+ fabs a0, t0
|
|
+ addl X, 2*SIZE, X
|
|
+
|
|
+ fabs a1, t1
|
|
+ ldi I, -1(I)
|
|
+ ADD s0, t0, s0
|
|
+ ADD s1, t1, s1
|
|
+
|
|
+ bne I, $RemainLoop
|
|
+ .align 4
|
|
+$End:
|
|
+ ADD s0, s1, s0
|
|
+ ret
|
|
+ .align 4
|
|
+
|
|
+$UnAlign_X_ACCESS:
|
|
+ sra N, 3, I
|
|
+ fclr s2
|
|
+ fclr s3
|
|
+ ble I, $Remain
|
|
+
|
|
+ VLD_UL a0, 0*VEC_LEN*SIZE(X)
|
|
+ vcpys $f31, $f31, t0
|
|
+ VLD_UH t4, 1*VEC_LEN*SIZE(X)
|
|
+ vcpys $f31, $f31, t1
|
|
+
|
|
+ VLD_UL a1, 1*VEC_LEN*SIZE(X)
|
|
+ vcpys $f31, $f31, t2
|
|
+ VLD_UH t5, 2*VEC_LEN*SIZE(X)
|
|
+ vcpys $f31, $f31, t3
|
|
+
|
|
+ VLD_UL a2, 2*VEC_LEN*SIZE(X)
|
|
+ VLD_UH t6, 3*VEC_LEN*SIZE(X)
|
|
+ VLD_UL a3, 3*VEC_LEN*SIZE(X)
|
|
+ VLD_UH t7, 4*VEC_LEN*SIZE(X)
|
|
+
|
|
+ vbisw a0, t4, a0
|
|
+ subl I, 1, I
|
|
+ vbisw a1, t5, a1
|
|
+ addl X, 16*SIZE, X
|
|
+
|
|
+ vbisw a2, t6, a2
|
|
+ unop
|
|
+ vbisw a3, t7, a3
|
|
+ ble I, $MainLoopEnd
|
|
+
|
|
+$UnAlign_X_ACCESS_MainLoop:
|
|
+/*fabs*/
|
|
+ vcpys $f31, a0, a4
|
|
+ VLD_UL a0, 0*VEC_LEN*SIZE(X)
|
|
+ vcpys $f31, a1, a5
|
|
+ VLD_UH t4, 1*VEC_LEN*SIZE(X)
|
|
+
|
|
+ vcpys $f31, a2, a6
|
|
+ VLD_UL a1, 1*VEC_LEN*SIZE(X)
|
|
+ vcpys $f31, a3, a7
|
|
+ VLD_UH t5, 2*VEC_LEN*SIZE(X)
|
|
+
|
|
+ VADD t0, a4, t0
|
|
+ VLD_UL a2, 2*VEC_LEN*SIZE(X)
|
|
+ VADD t1, a5, t1
|
|
+ VLD_UH t6, 3*VEC_LEN*SIZE(X)
|
|
+
|
|
+ VADD t2, a6, t2
|
|
+ VLD_UL a3, 3*VEC_LEN*SIZE(X)
|
|
+ VADD t3, a7, t3
|
|
+ VLD_UH t7, 4*VEC_LEN*SIZE(X)
|
|
+
|
|
+
|
|
+ vbisw a0, t4, a0
|
|
+ subl I, 1, I
|
|
+ vbisw a1, t5, a1
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+
|
|
+ vbisw a2, t6, a2
|
|
+ addl X, 16*SIZE, X
|
|
+ vbisw a3, t7, a3
|
|
+ bgt I, $UnAlign_X_ACCESS_MainLoop
|
|
+
|
|
+ jmp $MainLoopEnd
|
|
+ .align 4
|
|
+
|
|
+
|
|
+$Sub:
|
|
+ fclr s2
|
|
+ sra N, 2, I
|
|
+ fclr s3
|
|
+ ble I, $L15
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ fclr t2
|
|
+ LD a1, 1 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ fclr t3
|
|
+ LD a3, 1 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ LD a5, 1 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ble I, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ ADD s0, t0, s0
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ fabs a0, t0
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ fabs a1, t1
|
|
+ unop
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ LD a7, 1 * SIZE(X)
|
|
+ fabs a2, t2
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ fabs a3, t3
|
|
+ unop
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD a1, 1 * SIZE(X)
|
|
+ fabs a4, t0
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ fabs a5, t1
|
|
+ unop
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ LD a3, 1 * SIZE(X)
|
|
+ fabs a6, t2
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ fabs a7, t3
|
|
+ unop
|
|
+
|
|
+ LD a5, 1 * SIZE(X)
|
|
+ unop
|
|
+ SXADDQ INCX, X, X
|
|
+ bne I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ ADD s0, t0, s0
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ fabs a0, t0
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ LD a7, 1 * SIZE(X)
|
|
+ fabs a1, t1
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ fabs a2, t2
|
|
+ ADD s3, t3, s3
|
|
+ fabs a3, t3
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ fabs a4, t0
|
|
+ ADD s1, t1, s1
|
|
+ fabs a5, t1
|
|
+ ADD s2, t2, s2
|
|
+ fabs a6, t2
|
|
+ ADD s3, t3, s3
|
|
+ fabs a7, t3
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ ADD s3, t3, s3
|
|
+
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ ADD s0, s2, s0
|
|
+ and N, 3, I
|
|
+ ADD s1, s3, s1
|
|
+ ble I, $L999
|
|
+ .align 4
|
|
+
|
|
+$L17:
|
|
+ ADD s0, t0, s0
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ fabs a0, t0
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ LD a1, 1 * SIZE(X)
|
|
+ fabs a1, t1
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ bne I, $L17
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ ADD s0, t0, s0
|
|
+ ADD s1, t1, s1
|
|
+
|
|
+ ADD s0, s1, s0
|
|
+ ret
|
|
+ EPILOGUE
|
|
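The SIMD variant above adds a dispatch step that the scalar zasum.S does not have: the vectorized path is taken only when the stride is one complex element and X is aligned to the vector width; unaligned addresses fall back to VLD_UL/VLD_UH pairs merged with vbisw, and general strides fall back to the scalar loop. A small C sketch of that dispatch test, with an assumed vec_bytes parameter standing in for VEC_LEN*SIZE:

    #include <stdint.h>

    /* Illustrative only: fast-path condition used by zasum_simd.S.
       vec_bytes is assumed to be the SIMD width in bytes (VEC_LEN*SIZE);
       incx is the stride in complex elements. */
    static int use_aligned_path(const void *x, long incx, unsigned long vec_bytes)
    {
        return incx == 1 && (((uintptr_t)x) & (vec_bytes - 1)) == 0;
    }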
diff --git a/kernel/sw_64/zaxpy.S b/kernel/sw_64/zaxpy.S
|
|
new file mode 100644
|
|
index 0000000..19b6398
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/zaxpy.S
|
|
@@ -0,0 +1,654 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCHSIZE 40
|
|
+
|
|
+#ifndef CONJ
|
|
+#define ADD1 SUB
|
|
+#define ADD2 ADD
|
|
+#else
|
|
+#define ADD1 ADD
|
|
+#define ADD2 SUB
|
|
+#endif
|
|
+
|
|
+#define tmp $f9
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+ .frame $sp, 16, $26, 0
|
|
+
|
|
+ ldw $19, 0($sp)
|
|
+ fmov $f19, $f29
|
|
+ ldl $20, 8($sp)
|
|
+ fmov $f20, $f30
|
|
+
|
|
+ mov $21, $18
|
|
+ ldw $21, 16($sp)
|
|
+ ldi $sp, -64($sp)
|
|
+ nop
|
|
+
|
|
+ fstd $f2, 0($sp)
|
|
+ cmpeq $19, 1, $1
|
|
+ fstd $f3, 8($sp)
|
|
+ cmpeq $21, 1, $2
|
|
+
|
|
+ fstd $f4, 16($sp)
|
|
+ and $16, 3, $5
|
|
+ fstd $f5, 24($sp)
|
|
+ fstd $f6, 32($sp)
|
|
+
|
|
+ fstd $f7, 40($sp)
|
|
+ fstd $f8, 48($sp)
|
|
+ fstd tmp, 56($sp)
|
|
+#ifndef PROFILE
|
|
+ .prologue 0
|
|
+#else
|
|
+ .prologue 1
|
|
+#endif
|
|
+
|
|
+ and $1, $2, $1
|
|
+ ble $16, $End
|
|
+ sra $16, 2, $4
|
|
+ beq $1, $Sub
|
|
+
|
|
+ ble $4, $Remain
|
|
+ subl $4, 1, $4
|
|
+
|
|
+ LD $f0, 0*SIZE($18)
|
|
+ LD $f1, 1*SIZE($18)
|
|
+ LD $f2, 2*SIZE($18)
|
|
+ LD $f3, 3*SIZE($18)
|
|
+ LD $f4, 4*SIZE($18)
|
|
+ LD $f5, 5*SIZE($18)
|
|
+ LD $f6, 6*SIZE($18)
|
|
+ LD $f7, 7*SIZE($18)
|
|
+
|
|
+ LD $f8, 0*SIZE($20)
|
|
+ LD $f28, 1*SIZE($20)
|
|
+ LD $f10, 2*SIZE($20)
|
|
+ LD $f11, 3*SIZE($20)
|
|
+ LD $f12, 4*SIZE($20)
|
|
+ LD $f13, 5*SIZE($20)
|
|
+ LD $f14, 6*SIZE($20)
|
|
+ LD $f15, 7*SIZE($20)
|
|
+
|
|
+ addl $18, 8*SIZE, $18
|
|
+ ble $4, $MainLoopEnd
|
|
+ .align 4
|
|
+
|
|
+$MainLoop:
|
|
+ fillcs PREFETCHSIZE * SIZE($20)
|
|
+ fillcs PREFETCHSIZE * SIZE($18)
|
|
+
|
|
+ MUL $f29, $f0, $f20
|
|
+ fillcs 9*SIZE($18)
|
|
+ MUL $f30, $f1, $f21
|
|
+ unop
|
|
+
|
|
+ MUL $f30, $f0, $f22
|
|
+ LD $f0, 0*SIZE($18)
|
|
+ MUL $f29, $f1, $f23
|
|
+ LD $f1, 1*SIZE($18)
|
|
+
|
|
+ MUL $f29, $f2, $f24
|
|
+ unop
|
|
+ MUL $f30, $f3, $f25
|
|
+ nop
|
|
+
|
|
+ MUL $f30, $f2, $f26
|
|
+ LD $f2, 2*SIZE($18)
|
|
+ MUL $f29, $f3, $f27
|
|
+ LD $f3, 3*SIZE($18)
|
|
+
|
|
+ ADD1 $f20, $f21, $f16
|
|
+ MUL $f29, $f4, $f20
|
|
+ ADD2 $f22, $f23, $f17
|
|
+ MUL $f30, $f5, $f21
|
|
+
|
|
+ ADD1 $f24, $f25, $f18
|
|
+ unop
|
|
+ MUL $f30, $f4, $f22
|
|
+ LD $f4, 4*SIZE($18)
|
|
+
|
|
+ ADD2 $f26, $f27, $f19
|
|
+ addl $20, 8*SIZE, $20
|
|
+ MUL $f29, $f5, $f23
|
|
+ LD $f5, 5*SIZE($18)
|
|
+
|
|
+ ADD $f16, $f8, tmp
|
|
+ fmov tmp, $f16
|
|
+ LD $f8, 0*SIZE($20)
|
|
+ MUL $f29, $f6, $f24
|
|
+ unop
|
|
+
|
|
+ ADD $f17, $f28, tmp
|
|
+ fmov tmp, $f17
|
|
+ LD $f28, 1*SIZE($20)
|
|
+ MUL $f30, $f7, $f25
|
|
+ unop
|
|
+
|
|
+ ADD $f18, $f10, tmp
|
|
+ fmov tmp, $f18
|
|
+ LD $f10, 2*SIZE($20)
|
|
+ MUL $f30, $f6, $f26
|
|
+ LD $f6, 6*SIZE($18)
|
|
+
|
|
+ ADD $f19, $f11, tmp
|
|
+ fmov tmp, $f19
|
|
+ LD $f11, 3*SIZE($20)
|
|
+ MUL $f29, $f7, $f27
|
|
+ LD $f7, 7*SIZE($18)
|
|
+
|
|
+ ST $f16,-8*SIZE($20)
|
|
+ ADD1 $f20, $f21, $f16
|
|
+ ST $f17,-7*SIZE($20)
|
|
+ ADD2 $f22, $f23, $f17
|
|
+
|
|
+ ST $f18,-6*SIZE($20)
|
|
+ ADD1 $f24, $f25, $f18
|
|
+ ST $f19,-5*SIZE($20)
|
|
+ ADD2 $f26, $f27, $f19
|
|
+
|
|
+ ADD $f16, $f12, tmp
|
|
+ fmov tmp, $f16
|
|
+ LD $f12, 4*SIZE($20)
|
|
+ ADD $f17, $f13, tmp
|
|
+ fmov tmp, $f17
|
|
+ LD $f13, 5*SIZE($20)
|
|
+ ADD $f18, $f14, tmp
|
|
+ fmov tmp, $f18
|
|
+ LD $f14, 6*SIZE($20)
|
|
+ ADD $f19, $f15, tmp
|
|
+ fmov tmp, $f19
|
|
+ LD $f15, 7*SIZE($20)
|
|
+
|
|
+ ST $f16,-4*SIZE($20)
|
|
+ addl $18, 8*SIZE, $18
|
|
+ ST $f17,-3*SIZE($20)
|
|
+ subl $4, 1, $4
|
|
+
|
|
+ ST $f18,-2*SIZE($20)
|
|
+ nop
|
|
+ ST $f19,-1*SIZE($20)
|
|
+ bgt $4, $MainLoop
|
|
+ .align 4
|
|
+
|
|
+$MainLoopEnd:
|
|
+ MUL $f29, $f0, $f20
|
|
+ MUL $f30, $f1, $f21
|
|
+ MUL $f30, $f0, $f22
|
|
+ MUL $f29, $f1, $f23
|
|
+
|
|
+ MUL $f29, $f2, $f24
|
|
+ MUL $f30, $f3, $f25
|
|
+ MUL $f30, $f2, $f26
|
|
+ MUL $f29, $f3, $f27
|
|
+
|
|
+ ADD1 $f20, $f21, $f16
|
|
+ MUL $f29, $f4, $f20
|
|
+ ADD2 $f22, $f23, $f17
|
|
+ MUL $f30, $f5, $f21
|
|
+
|
|
+ ADD1 $f24, $f25, $f18
|
|
+ MUL $f30, $f4, $f22
|
|
+ ADD2 $f26, $f27, $f19
|
|
+ MUL $f29, $f5, $f23
|
|
+
|
|
+ ADD $f16, $f8, tmp
|
|
+ fmov tmp, $f16
|
|
+ MUL $f29, $f6, $f24
|
|
+ ADD $f17, $f28, tmp
|
|
+ fmov tmp, $f17
|
|
+ MUL $f30, $f7, $f25
|
|
+
|
|
+ ADD $f18, $f10, tmp
|
|
+ fmov tmp, $f18
|
|
+ MUL $f30, $f6, $f26
|
|
+ ADD $f19, $f11, tmp
|
|
+ fmov tmp, $f19
|
|
+ MUL $f29, $f7, $f27
|
|
+
|
|
+ ST $f16, 0*SIZE($20)
|
|
+ ADD1 $f20, $f21, $f16
|
|
+ ST $f17, 1*SIZE($20)
|
|
+ ADD2 $f22, $f23, $f17
|
|
+
|
|
+ ST $f18, 2*SIZE($20)
|
|
+ ADD1 $f24, $f25, $f18
|
|
+ ST $f19, 3*SIZE($20)
|
|
+ ADD2 $f26, $f27, $f19
|
|
+
|
|
+ ADD $f16, $f12, tmp
|
|
+ fmov tmp, $f16
|
|
+ ADD $f17, $f13, tmp
|
|
+ fmov tmp, $f17
|
|
+ ADD $f18, $f14, tmp
|
|
+ fmov tmp, $f18
|
|
+ ADD $f19, $f15, tmp
|
|
+ fmov tmp, $f19
|
|
+
|
|
+ ST $f16, 4*SIZE($20)
|
|
+ ST $f17, 5*SIZE($20)
|
|
+ ST $f18, 6*SIZE($20)
|
|
+ ST $f19, 7*SIZE($20)
|
|
+
|
|
+ unop
|
|
+ addl $20, 8*SIZE, $20
|
|
+ unop
|
|
+ ble $5, $End
|
|
+ .align 4
|
|
+
|
|
+$Remain:
|
|
+ subl $5, 1, $6
|
|
+ ble $5, $End
|
|
+ LD $f0, 0*SIZE($18)
|
|
+ LD $f1, 1*SIZE($18)
|
|
+
|
|
+ LD $f8, 0*SIZE($20)
|
|
+ LD $f28, 1*SIZE($20)
|
|
+ addl $18, 2*SIZE, $18
|
|
+ ble $6, $RemainLoopEnd
|
|
+ .align 4
|
|
+
|
|
+$RemainLoop:
|
|
+ MUL $f29, $f0, $f20
|
|
+ subl $6, 1, $6
|
|
+ MUL $f30, $f1, $f21
|
|
+ addl $20, 2*SIZE, $20
|
|
+
|
|
+ MUL $f30, $f0, $f22
|
|
+ LD $f0, 0*SIZE($18)
|
|
+ MUL $f29, $f1, $f23
|
|
+ LD $f1, 1*SIZE($18)
|
|
+
|
|
+ ADD1 $f20, $f21, $f16
|
|
+ ADD2 $f22, $f23, $f17
|
|
+ ADD $f16, $f8, tmp
|
|
+ fmov tmp, $f16
|
|
+ LD $f8, 0*SIZE($20)
|
|
+ ADD $f17, $f28, tmp
|
|
+ fmov tmp, $f17
|
|
+ LD $f28, 1*SIZE($20)
|
|
+
|
|
+ ST $f16,-2*SIZE($20)
|
|
+ addl $18, 2*SIZE, $18
|
|
+ ST $f17,-1*SIZE($20)
|
|
+ bgt $6, $RemainLoop
|
|
+ .align 4
|
|
+
|
|
+$RemainLoopEnd:
|
|
+ MUL $f29, $f0, $f20
|
|
+ MUL $f30, $f1, $f21
|
|
+ MUL $f30, $f0, $f22
|
|
+ MUL $f29, $f1, $f23
|
|
+
|
|
+ ADD1 $f20, $f21, $f16
|
|
+ ADD2 $f22, $f23, $f17
|
|
+ ADD $f16, $f8, tmp
|
|
+ fmov tmp, $f16
|
|
+ ADD $f17, $f28, tmp
|
|
+ fmov tmp, $f17
|
|
+
|
|
+ ST $f16, 0*SIZE($20)
|
|
+ nop
|
|
+ ST $f17, 1*SIZE($20)
|
|
+ nop
|
|
+ .align 4
|
|
+
|
|
+$End:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 8($sp)
|
|
+ fldd $f4, 16($sp)
|
|
+ fldd $f5, 24($sp)
|
|
+ fldd $f6, 32($sp)
|
|
+ fldd $f7, 40($sp)
|
|
+ fldd $f8, 48($sp)
|
|
+ fldd tmp, 56($sp)
|
|
+ ldi $sp, 64($sp)
|
|
+ ret
|
|
+ .align 4
|
|
+
|
|
+$Sub:
|
|
+ SXSUBL $16, SIZE, $22
|
|
+ addl $22, $22, $22 # Complex
|
|
+ .align 4
|
|
+
|
|
+ addl $19, $19, $19 # Complex
|
|
+ addl $21, $21, $21 # Complex
|
|
+
|
|
+ ble $4, $SubRemain
|
|
+ LD $f0, 0*SIZE($18)
|
|
+ LD $f1, 1*SIZE($18)
|
|
+ SXADDQ $19, $18, $18
|
|
+
|
|
+ LD $f2, 0*SIZE($18)
|
|
+ LD $f3, 1*SIZE($18)
|
|
+ SXADDQ $19, $18, $18
|
|
+
|
|
+ LD $f4, 0*SIZE($18)
|
|
+ LD $f5, 1*SIZE($18)
|
|
+ SXADDQ $19, $18, $18
|
|
+
|
|
+ LD $f6, 0*SIZE($18)
|
|
+ LD $f7, 1*SIZE($18)
|
|
+ SXADDQ $19, $18, $18
|
|
+
|
|
+ LD $f8, 0*SIZE($20)
|
|
+ LD $f28, 1*SIZE($20)
|
|
+ SXADDQ $21, $20, $24
|
|
+
|
|
+ LD $f10, 0*SIZE($24)
|
|
+ LD $f11, 1*SIZE($24)
|
|
+ SXADDQ $21, $24, $24
|
|
+
|
|
+ LD $f12, 0*SIZE($24)
|
|
+ LD $f13, 1*SIZE($24)
|
|
+ SXADDQ $21, $24, $24
|
|
+
|
|
+ LD $f14, 0*SIZE($24)
|
|
+ LD $f15, 1*SIZE($24)
|
|
+ SXADDQ $21, $24, $24
|
|
+
|
|
+ subl $4, 1, $4
|
|
+ ble $4, $SubMainLoopEnd
|
|
+ .align 4
|
|
+
|
|
+$SubMainLoop:
|
|
+ MUL $f29, $f0, $f20
|
|
+ unop
|
|
+ MUL $f30, $f1, $f21
|
|
+ unop
|
|
+
|
|
+ MUL $f30, $f0, $f22
|
|
+ LD $f0, 0*SIZE($18)
|
|
+ MUL $f29, $f1, $f23
|
|
+ LD $f1, 1*SIZE($18)
|
|
+
|
|
+ MUL $f29, $f2, $f24
|
|
+ SXADDQ $19, $18, $18
|
|
+ MUL $f30, $f3, $f25
|
|
+ unop
|
|
+
|
|
+ MUL $f30, $f2, $f26
|
|
+ LD $f2, 0*SIZE($18)
|
|
+ MUL $f29, $f3, $f27
|
|
+ LD $f3, 1*SIZE($18)
|
|
+
|
|
+ ADD1 $f20, $f21, $f16
|
|
+ SXADDQ $19, $18, $18
|
|
+ MUL $f29, $f4, $f20
|
|
+ unop
|
|
+
|
|
+ ADD2 $f22, $f23, $f17
|
|
+ unop
|
|
+ MUL $f30, $f5, $f21
|
|
+ unop
|
|
+
|
|
+ ADD1 $f24, $f25, $f18
|
|
+ unop
|
|
+ MUL $f30, $f4, $f22
|
|
+ LD $f4, 0*SIZE($18)
|
|
+
|
|
+ ADD2 $f26, $f27, $f19
|
|
+ unop
|
|
+ MUL $f29, $f5, $f23
|
|
+ LD $f5, 1*SIZE($18)
|
|
+
|
|
+ ADD $f16, $f8, tmp
|
|
+ fmov tmp, $f16
|
|
+ LD $f8, 0*SIZE($24)
|
|
+ MUL $f29, $f6, $f24
|
|
+ SXADDQ $19, $18, $18
|
|
+
|
|
+ ADD $f17, $f28, tmp
|
|
+ fmov tmp, $f17
|
|
+ LD $f28, 1*SIZE($24)
|
|
+ MUL $f30, $f7, $f25
|
|
+ SXADDQ $21, $24, $24
|
|
+
|
|
+ ADD $f18, $f10, tmp
|
|
+ fmov tmp, $f18
|
|
+ LD $f10, 0*SIZE($24)
|
|
+ MUL $f30, $f6, $f26
|
|
+ LD $f6, 0*SIZE($18)
|
|
+
|
|
+ ADD $f19, $f11, tmp
|
|
+ fmov tmp, $f19
|
|
+ LD $f11, 1*SIZE($24)
|
|
+ MUL $f29, $f7, $f27
|
|
+ LD $f7, 1*SIZE($18)
|
|
+
|
|
+ ST $f16, 0*SIZE($20)
|
|
+ SXADDQ $19, $18, $18
|
|
+ ADD1 $f20, $f21, $f16
|
|
+ unop
|
|
+
|
|
+ ST $f17, 1*SIZE($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+ ADD2 $f22, $f23, $f17
|
|
+ unop
|
|
+
|
|
+ ST $f18, 0*SIZE($20)
|
|
+ SXADDQ $21, $24, $24
|
|
+ ADD1 $f24, $f25, $f18
|
|
+ unop
|
|
+
|
|
+ ST $f19, 1*SIZE($20)
|
|
+ unop
|
|
+ ADD2 $f26, $f27, $f19
|
|
+ SXADDQ $21, $20, $20
|
|
+
|
|
+ ADD $f16, $f12, tmp
|
|
+ fmov tmp, $f16
|
|
+ unop
|
|
+ LD $f12, 0*SIZE($24)
|
|
+ unop
|
|
+
|
|
+ ADD $f17, $f13, tmp
|
|
+ fmov tmp, $f17
|
|
+ unop
|
|
+ LD $f13, 1*SIZE($24)
|
|
+ SXADDQ $21, $24, $24
|
|
+
|
|
+ ADD $f18, $f14, tmp
|
|
+ fmov tmp, $f18
|
|
+ subl $4, 1, $4
|
|
+ LD $f14, 0*SIZE($24)
|
|
+ unop
|
|
+
|
|
+ ADD $f19, $f15, tmp
|
|
+ fmov tmp, $f19
|
|
+ unop
|
|
+ LD $f15, 1*SIZE($24)
|
|
+ SXADDQ $21, $24, $24
|
|
+
|
|
+ ST $f16, 0*SIZE($20)
|
|
+ ST $f17, 1*SIZE($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+ unop
|
|
+
|
|
+ ST $f18, 0*SIZE($20)
|
|
+ ST $f19, 1*SIZE($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+ bgt $4, $SubMainLoop
|
|
+ .align 4
|
|
+
|
|
+$SubMainLoopEnd:
|
|
+ MUL $f29, $f0, $f20
|
|
+ MUL $f30, $f1, $f21
|
|
+ MUL $f30, $f0, $f22
|
|
+ MUL $f29, $f1, $f23
|
|
+
|
|
+ MUL $f29, $f2, $f24
|
|
+ MUL $f30, $f3, $f25
|
|
+ MUL $f30, $f2, $f26
|
|
+ MUL $f29, $f3, $f27
|
|
+
|
|
+ ADD1 $f20, $f21, $f16
|
|
+ MUL $f29, $f4, $f20
|
|
+ ADD2 $f22, $f23, $f17
|
|
+ MUL $f30, $f5, $f21
|
|
+
|
|
+ ADD1 $f24, $f25, $f18
|
|
+ MUL $f30, $f4, $f22
|
|
+ ADD2 $f26, $f27, $f19
|
|
+ MUL $f29, $f5, $f23
|
|
+
|
|
+ ADD $f16, $f8, tmp
|
|
+ fmov tmp, $f16
|
|
+ MUL $f29, $f6, $f24
|
|
+ ADD $f17, $f28, tmp
|
|
+ fmov tmp, $f17
|
|
+ MUL $f30, $f7, $f25
|
|
+
|
|
+ ADD $f18, $f10, tmp
|
|
+ fmov tmp, $f18
|
|
+ MUL $f30, $f6, $f26
|
|
+ ADD $f19, $f11, tmp
|
|
+ fmov tmp, $f19
|
|
+ MUL $f29, $f7, $f27
|
|
+
|
|
+ ST $f16, 0*SIZE($20)
|
|
+ ADD1 $f20, $f21, $f16
|
|
+ ST $f17, 1*SIZE($20)
|
|
+ ADD2 $f22, $f23, $f17
|
|
+
|
|
+ SXADDQ $21, $20, $20
|
|
+ nop
|
|
+ ST $f18, 0*SIZE($20)
|
|
+ ADD1 $f24, $f25, $f18
|
|
+
|
|
+ ST $f19, 1*SIZE($20)
|
|
+ ADD2 $f26, $f27, $f19
|
|
+ SXADDQ $21, $20, $20
|
|
+ ADD $f16, $f12, tmp
|
|
+ fmov tmp, $f16
|
|
+
|
|
+ ADD $f17, $f13, tmp
|
|
+ fmov tmp, $f17
|
|
+ ADD $f18, $f14, tmp
|
|
+ fmov tmp, $f18
|
|
+ ADD $f19, $f15, tmp
|
|
+ fmov tmp, $f19
|
|
+
|
|
+ ST $f16, 0*SIZE($20)
|
|
+ ST $f17, 1*SIZE($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+
|
|
+ ST $f18, 0*SIZE($20)
|
|
+ ST $f19, 1*SIZE($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+ ble $5, $SubEnd
|
|
+ .align 4
|
|
+
|
|
+$SubRemain:
|
|
+ subl $5, 1, $6
|
|
+ ble $5, $SubEnd
|
|
+ LD $f0, 0*SIZE($18)
|
|
+ LD $f1, 1*SIZE($18)
|
|
+
|
|
+ LD $f8, 0*SIZE($20)
|
|
+ LD $f28, 1*SIZE($20)
|
|
+ SXADDQ $19, $18, $18
|
|
+ SXADDQ $21, $20, $24
|
|
+ ble $6, $SubRemainLoopEnd
|
|
+ .align 4
|
|
+
|
|
+$SubRemainLoop:
|
|
+ MUL $f29, $f0, $f20
|
|
+ MUL $f30, $f1, $f21
|
|
+ MUL $f30, $f0, $f22
|
|
+ LD $f0, 0*SIZE($18)
|
|
+
|
|
+ MUL $f29, $f1, $f23
|
|
+ LD $f1, 1*SIZE($18)
|
|
+ ADD1 $f20, $f21, $f16
|
|
+ SXADDQ $19, $18, $18
|
|
+
|
|
+ ADD2 $f22, $f23, $f17
|
|
+ nop
|
|
+ ADD $f16, $f8, tmp
|
|
+ fmov tmp, $f16
|
|
+ LD $f8, 0*SIZE($24)
|
|
+
|
|
+ ADD $f17, $f28, tmp
|
|
+ fmov tmp, $f17
|
|
+ LD $f28, 1*SIZE($24)
|
|
+ SXADDQ $21, $24, $24
|
|
+ subl $6, 1, $6
|
|
+
|
|
+ ST $f16, 0*SIZE($20)
|
|
+ ST $f17, 1*SIZE($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+ bgt $6, $SubRemainLoop
|
|
+ .align 4
|
|
+
|
|
+$SubRemainLoopEnd:
|
|
+ MUL $f29, $f0, $f20
|
|
+ MUL $f30, $f1, $f21
|
|
+ MUL $f30, $f0, $f22
|
|
+ MUL $f29, $f1, $f23
|
|
+
|
|
+ ADD1 $f20, $f21, $f16
|
|
+ ADD2 $f22, $f23, $f17
|
|
+ ADD $f16, $f8, tmp
|
|
+ fmov tmp, $f16
|
|
+ ADD $f17, $f28, tmp
|
|
+ fmov tmp, $f17
|
|
+
|
|
+ ST $f16, 0*SIZE($20)
|
|
+ nop
|
|
+ ST $f17, 1*SIZE($20)
|
|
+ nop
|
|
+ .align 4
|
|
+
|
|
+$SubEnd:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 8($sp)
|
|
+ fldd $f4, 16($sp)
|
|
+ fldd $f5, 24($sp)
|
|
+ fldd $f6, 32($sp)
|
|
+ fldd $f7, 40($sp)
|
|
+ fldd $f8, 48($sp)
|
|
+ fldd tmp, 56($sp)
|
|
+ ldi $sp, 64($sp)
|
|
+ ret
|
|
+ EPILOGUE
|
|
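The zaxpy kernel above implements the complex update y := y + alpha*x (unit-stride in the main path, strided in the $Sub path); the ADD1/ADD2 macros flip the sign of the two cross terms when CONJ is defined, which corresponds to y := y + alpha*conj(x). A reference C sketch under those assumptions, with illustrative names:

    /* Illustrative reference for the zaxpy kernel above.  alpha = ar + i*ai,
       strides are in complex elements.  CONJ mirrors the macro in the patch. */
    static void zaxpy_ref(long n, double ar, double ai,
                          const double *x, long incx, double *y, long incy)
    {
        for (long i = 0; i < n; i++) {
            const double *xp = x + 2 * i * incx;
            double *yp = y + 2 * i * incy;
            double xr = xp[0], xi = xp[1];
    #ifndef CONJ
            yp[0] += ar * xr - ai * xi;
            yp[1] += ai * xr + ar * xi;
    #else
            yp[0] += ar * xr + ai * xi;
            yp[1] += ai * xr - ar * xi;
    #endif
        }
    }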
diff --git a/kernel/sw_64/zaxpy.S.bak b/kernel/sw_64/zaxpy.S.bak
|
|
new file mode 100644
|
|
index 0000000..c6cd44b
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/zaxpy.S.bak
|
|
@@ -0,0 +1,611 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCHSIZE 40
|
|
+
|
|
+#ifndef CONJ
|
|
+#define ADD1 SUB
|
|
+#define ADD2 ADD
|
|
+#else
|
|
+#define ADD1 ADD
|
|
+#define ADD2 SUB
|
|
+#endif
|
|
+
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+ .frame $sp, 16, $26, 0
|
|
+
|
|
+ ldl $19, 0($sp)
|
|
+ fmov $f19, $f29
|
|
+ ldl $20, 8($sp)
|
|
+ fmov $f20, $f30
|
|
+
|
|
+ mov $21, $18
|
|
+ ldl $21, 16($sp)
|
|
+ ldi $sp, -64($sp)
|
|
+ nop
|
|
+
|
|
+ fstd $f2, 0($sp)
|
|
+ cmpeq $19, 1, $1
|
|
+ fstd $f3, 8($sp)
|
|
+ cmpeq $21, 1, $2
|
|
+
|
|
+ fstd $f4, 16($sp)
|
|
+ and $16, 3, $5
|
|
+ fstd $f5, 24($sp)
|
|
+ fstd $f6, 32($sp)
|
|
+
|
|
+ fstd $f7, 40($sp)
|
|
+ fstd $f8, 48($sp)
|
|
+#ifndef PROFILE
|
|
+ .prologue 0
|
|
+#else
|
|
+ .prologue 1
|
|
+#endif
|
|
+
|
|
+ and $1, $2, $1
|
|
+ ble $16, $End
|
|
+ sra $16, 2, $4
|
|
+ beq $1, $Sub
|
|
+
|
|
+ ble $4, $Remain
|
|
+ subl $4, 1, $4
|
|
+
|
|
+ LD $f0, 0*SIZE($18)
|
|
+ LD $f1, 1*SIZE($18)
|
|
+ LD $f2, 2*SIZE($18)
|
|
+ LD $f3, 3*SIZE($18)
|
|
+ LD $f4, 4*SIZE($18)
|
|
+ LD $f5, 5*SIZE($18)
|
|
+ LD $f6, 6*SIZE($18)
|
|
+ LD $f7, 7*SIZE($18)
|
|
+
|
|
+ LD $f8, 0*SIZE($20)
|
|
+ LD $f28, 1*SIZE($20)
|
|
+ LD $f10, 2*SIZE($20)
|
|
+ LD $f11, 3*SIZE($20)
|
|
+ LD $f12, 4*SIZE($20)
|
|
+ LD $f13, 5*SIZE($20)
|
|
+ LD $f14, 6*SIZE($20)
|
|
+ LD $f15, 7*SIZE($20)
|
|
+
|
|
+ addl $18, 8*SIZE, $18
|
|
+ ble $4, $MainLoopEnd
|
|
+ .align 4
|
|
+
|
|
+$MainLoop:
|
|
+ fillcs PREFETCHSIZE * SIZE($20)
|
|
+ fillcs PREFETCHSIZE * SIZE($18)
|
|
+
|
|
+ MUL $f29, $f0, $f20
|
|
+ fillcs 9*SIZE($18)
|
|
+ MUL $f30, $f1, $f21
|
|
+ unop
|
|
+
|
|
+ MUL $f30, $f0, $f22
|
|
+ LD $f0, 0*SIZE($18)
|
|
+ MUL $f29, $f1, $f23
|
|
+ LD $f1, 1*SIZE($18)
|
|
+
|
|
+ MUL $f29, $f2, $f24
|
|
+ unop
|
|
+ MUL $f30, $f3, $f25
|
|
+ nop
|
|
+
|
|
+ MUL $f30, $f2, $f26
|
|
+ LD $f2, 2*SIZE($18)
|
|
+ MUL $f29, $f3, $f27
|
|
+ LD $f3, 3*SIZE($18)
|
|
+
|
|
+ ADD1 $f20, $f21, $f16
|
|
+ MUL $f29, $f4, $f20
|
|
+ ADD2 $f22, $f23, $f17
|
|
+ MUL $f30, $f5, $f21
|
|
+
|
|
+ ADD1 $f24, $f25, $f18
|
|
+ unop
|
|
+ MUL $f30, $f4, $f22
|
|
+ LD $f4, 4*SIZE($18)
|
|
+
|
|
+ ADD2 $f26, $f27, $f19
|
|
+ addl $20, 8*SIZE, $20
|
|
+ MUL $f29, $f5, $f23
|
|
+ LD $f5, 5*SIZE($18)
|
|
+
|
|
+ ADD $f16, $f8, $f16
|
|
+ LD $f8, 0*SIZE($20)
|
|
+ MUL $f29, $f6, $f24
|
|
+ unop
|
|
+
|
|
+ ADD $f17, $f28, $f17
|
|
+ LD $f28, 1*SIZE($20)
|
|
+ MUL $f30, $f7, $f25
|
|
+ unop
|
|
+
|
|
+ ADD $f18, $f10, $f18
|
|
+ LD $f10, 2*SIZE($20)
|
|
+ MUL $f30, $f6, $f26
|
|
+ LD $f6, 6*SIZE($18)
|
|
+
|
|
+ ADD $f19, $f11, $f19
|
|
+ LD $f11, 3*SIZE($20)
|
|
+ MUL $f29, $f7, $f27
|
|
+ LD $f7, 7*SIZE($18)
|
|
+
|
|
+ ST $f16,-8*SIZE($20)
|
|
+ ADD1 $f20, $f21, $f16
|
|
+ ST $f17,-7*SIZE($20)
|
|
+ ADD2 $f22, $f23, $f17
|
|
+
|
|
+ ST $f18,-6*SIZE($20)
|
|
+ ADD1 $f24, $f25, $f18
|
|
+ ST $f19,-5*SIZE($20)
|
|
+ ADD2 $f26, $f27, $f19
|
|
+
|
|
+ ADD $f16, $f12, $f16
|
|
+ LD $f12, 4*SIZE($20)
|
|
+ ADD $f17, $f13, $f17
|
|
+ LD $f13, 5*SIZE($20)
|
|
+ ADD $f18, $f14, $f18
|
|
+ LD $f14, 6*SIZE($20)
|
|
+ ADD $f19, $f15, $f19
|
|
+ LD $f15, 7*SIZE($20)
|
|
+
|
|
+ ST $f16,-4*SIZE($20)
|
|
+ addl $18, 8*SIZE, $18
|
|
+ ST $f17,-3*SIZE($20)
|
|
+ subl $4, 1, $4
|
|
+
|
|
+ ST $f18,-2*SIZE($20)
|
|
+ nop
|
|
+ ST $f19,-1*SIZE($20)
|
|
+ bgt $4, $MainLoop
|
|
+ .align 4
|
|
+
|
|
+$MainLoopEnd:
|
|
+ MUL $f29, $f0, $f20
|
|
+ MUL $f30, $f1, $f21
|
|
+ MUL $f30, $f0, $f22
|
|
+ MUL $f29, $f1, $f23
|
|
+
|
|
+ MUL $f29, $f2, $f24
|
|
+ MUL $f30, $f3, $f25
|
|
+ MUL $f30, $f2, $f26
|
|
+ MUL $f29, $f3, $f27
|
|
+
|
|
+ ADD1 $f20, $f21, $f16
|
|
+ MUL $f29, $f4, $f20
|
|
+ ADD2 $f22, $f23, $f17
|
|
+ MUL $f30, $f5, $f21
|
|
+
|
|
+ ADD1 $f24, $f25, $f18
|
|
+ MUL $f30, $f4, $f22
|
|
+ ADD2 $f26, $f27, $f19
|
|
+ MUL $f29, $f5, $f23
|
|
+
|
|
+ ADD $f16, $f8, $f16
|
|
+ MUL $f29, $f6, $f24
|
|
+ ADD $f17, $f28, $f17
|
|
+ MUL $f30, $f7, $f25
|
|
+
|
|
+ ADD $f18, $f10, $f18
|
|
+ MUL $f30, $f6, $f26
|
|
+ ADD $f19, $f11, $f19
|
|
+ MUL $f29, $f7, $f27
|
|
+
|
|
+ ST $f16, 0*SIZE($20)
|
|
+ ADD1 $f20, $f21, $f16
|
|
+ ST $f17, 1*SIZE($20)
|
|
+ ADD2 $f22, $f23, $f17
|
|
+
|
|
+ ST $f18, 2*SIZE($20)
|
|
+ ADD1 $f24, $f25, $f18
|
|
+ ST $f19, 3*SIZE($20)
|
|
+ ADD2 $f26, $f27, $f19
|
|
+
|
|
+ ADD $f16, $f12, $f16
|
|
+ ADD $f17, $f13, $f17
|
|
+ ADD $f18, $f14, $f18
|
|
+ ADD $f19, $f15, $f19
|
|
+
|
|
+ ST $f16, 4*SIZE($20)
|
|
+ ST $f17, 5*SIZE($20)
|
|
+ ST $f18, 6*SIZE($20)
|
|
+ ST $f19, 7*SIZE($20)
|
|
+
|
|
+ unop
|
|
+ addl $20, 8*SIZE, $20
|
|
+ unop
|
|
+ ble $5, $End
|
|
+ .align 4
|
|
+
|
|
+$Remain:
|
|
+ subl $5, 1, $6
|
|
+ ble $5, $End
|
|
+ LD $f0, 0*SIZE($18)
|
|
+ LD $f1, 1*SIZE($18)
|
|
+
|
|
+ LD $f8, 0*SIZE($20)
|
|
+ LD $f28, 1*SIZE($20)
|
|
+ addl $18, 2*SIZE, $18
|
|
+ ble $6, $RemainLoopEnd
|
|
+ .align 4
|
|
+
|
|
+$RemainLoop:
|
|
+ MUL $f29, $f0, $f20
|
|
+ subl $6, 1, $6
|
|
+ MUL $f30, $f1, $f21
|
|
+ addl $20, 2*SIZE, $20
|
|
+
|
|
+ MUL $f30, $f0, $f22
|
|
+ LD $f0, 0*SIZE($18)
|
|
+ MUL $f29, $f1, $f23
|
|
+ LD $f1, 1*SIZE($18)
|
|
+
|
|
+ ADD1 $f20, $f21, $f16
|
|
+ ADD2 $f22, $f23, $f17
|
|
+ ADD $f16, $f8, $f16
|
|
+ LD $f8, 0*SIZE($20)
|
|
+ ADD $f17, $f28, $f17
|
|
+ LD $f28, 1*SIZE($20)
|
|
+
|
|
+ ST $f16,-2*SIZE($20)
|
|
+ addl $18, 2*SIZE, $18
|
|
+ ST $f17,-1*SIZE($20)
|
|
+ bgt $6, $RemainLoop
|
|
+ .align 4
|
|
+
|
|
+$RemainLoopEnd:
|
|
+ MUL $f29, $f0, $f20
|
|
+ MUL $f30, $f1, $f21
|
|
+ MUL $f30, $f0, $f22
|
|
+ MUL $f29, $f1, $f23
|
|
+
|
|
+ ADD1 $f20, $f21, $f16
|
|
+ ADD2 $f22, $f23, $f17
|
|
+ ADD $f16, $f8, $f16
|
|
+ ADD $f17, $f28, $f17
|
|
+
|
|
+ ST $f16, 0*SIZE($20)
|
|
+ nop
|
|
+ ST $f17, 1*SIZE($20)
|
|
+ nop
|
|
+ .align 4
|
|
+
|
|
+$End:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 8($sp)
|
|
+ fldd $f4, 16($sp)
|
|
+ fldd $f5, 24($sp)
|
|
+ fldd $f6, 32($sp)
|
|
+ fldd $f7, 40($sp)
|
|
+ fldd $f8, 48($sp)
|
|
+ ldi $sp, 64($sp)
|
|
+ ret
|
|
+ .align 4
|
|
+
|
|
+$Sub:
|
|
+ SXSUBL $16, SIZE, $22
|
|
+ addl $22, $22, $22 # Complex
|
|
+ .align 4
|
|
+
|
|
+ addl $19, $19, $19 # Complex
|
|
+ addl $21, $21, $21 # Complex
|
|
+
|
|
+ ble $4, $SubRemain
|
|
+ LD $f0, 0*SIZE($18)
|
|
+ LD $f1, 1*SIZE($18)
|
|
+ SXADDQ $19, $18, $18
|
|
+
|
|
+ LD $f2, 0*SIZE($18)
|
|
+ LD $f3, 1*SIZE($18)
|
|
+ SXADDQ $19, $18, $18
|
|
+
|
|
+ LD $f4, 0*SIZE($18)
|
|
+ LD $f5, 1*SIZE($18)
|
|
+ SXADDQ $19, $18, $18
|
|
+
|
|
+ LD $f6, 0*SIZE($18)
|
|
+ LD $f7, 1*SIZE($18)
|
|
+ SXADDQ $19, $18, $18
|
|
+
|
|
+ LD $f8, 0*SIZE($20)
|
|
+ LD $f28, 1*SIZE($20)
|
|
+ SXADDQ $21, $20, $24
|
|
+
|
|
+ LD $f10, 0*SIZE($24)
|
|
+ LD $f11, 1*SIZE($24)
|
|
+ SXADDQ $21, $24, $24
|
|
+
|
|
+ LD $f12, 0*SIZE($24)
|
|
+ LD $f13, 1*SIZE($24)
|
|
+ SXADDQ $21, $24, $24
|
|
+
|
|
+ LD $f14, 0*SIZE($24)
|
|
+ LD $f15, 1*SIZE($24)
|
|
+ SXADDQ $21, $24, $24
|
|
+
|
|
+ subl $4, 1, $4
|
|
+ ble $4, $SubMainLoopEnd
|
|
+ .align 4
|
|
+
|
|
+$SubMainLoop:
|
|
+ MUL $f29, $f0, $f20
|
|
+ unop
|
|
+ MUL $f30, $f1, $f21
|
|
+ unop
|
|
+
|
|
+ MUL $f30, $f0, $f22
|
|
+ LD $f0, 0*SIZE($18)
|
|
+ MUL $f29, $f1, $f23
|
|
+ LD $f1, 1*SIZE($18)
|
|
+
|
|
+ MUL $f29, $f2, $f24
|
|
+ SXADDQ $19, $18, $18
|
|
+ MUL $f30, $f3, $f25
|
|
+ unop
|
|
+
|
|
+ MUL $f30, $f2, $f26
|
|
+ LD $f2, 0*SIZE($18)
|
|
+ MUL $f29, $f3, $f27
|
|
+ LD $f3, 1*SIZE($18)
|
|
+
|
|
+ ADD1 $f20, $f21, $f16
|
|
+ SXADDQ $19, $18, $18
|
|
+ MUL $f29, $f4, $f20
|
|
+ unop
|
|
+
|
|
+ ADD2 $f22, $f23, $f17
|
|
+ unop
|
|
+ MUL $f30, $f5, $f21
|
|
+ unop
|
|
+
|
|
+ ADD1 $f24, $f25, $f18
|
|
+ unop
|
|
+ MUL $f30, $f4, $f22
|
|
+ LD $f4, 0*SIZE($18)
|
|
+
|
|
+ ADD2 $f26, $f27, $f19
|
|
+ unop
|
|
+ MUL $f29, $f5, $f23
|
|
+ LD $f5, 1*SIZE($18)
|
|
+
|
|
+ ADD $f16, $f8, $f16
|
|
+ LD $f8, 0*SIZE($24)
|
|
+ MUL $f29, $f6, $f24
|
|
+ SXADDQ $19, $18, $18
|
|
+
|
|
+ ADD $f17, $f28, $f17
|
|
+ LD $f28, 1*SIZE($24)
|
|
+ MUL $f30, $f7, $f25
|
|
+ SXADDQ $21, $24, $24
|
|
+
|
|
+ ADD $f18, $f10, $f18
|
|
+ LD $f10, 0*SIZE($24)
|
|
+ MUL $f30, $f6, $f26
|
|
+ LD $f6, 0*SIZE($18)
|
|
+
|
|
+ ADD $f19, $f11, $f19
|
|
+ LD $f11, 1*SIZE($24)
|
|
+ MUL $f29, $f7, $f27
|
|
+ LD $f7, 1*SIZE($18)
|
|
+
|
|
+ ST $f16, 0*SIZE($20)
|
|
+ SXADDQ $19, $18, $18
|
|
+ ADD1 $f20, $f21, $f16
|
|
+ unop
|
|
+
|
|
+ ST $f17, 1*SIZE($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+ ADD2 $f22, $f23, $f17
|
|
+ unop
|
|
+
|
|
+ ST $f18, 0*SIZE($20)
|
|
+ SXADDQ $21, $24, $24
|
|
+ ADD1 $f24, $f25, $f18
|
|
+ unop
|
|
+
|
|
+ ST $f19, 1*SIZE($20)
|
|
+ unop
|
|
+ ADD2 $f26, $f27, $f19
|
|
+ SXADDQ $21, $20, $20
|
|
+
|
|
+ ADD $f16, $f12, $f16
|
|
+ unop
|
|
+ LD $f12, 0*SIZE($24)
|
|
+ unop
|
|
+
|
|
+ ADD $f17, $f13, $f17
|
|
+ unop
|
|
+ LD $f13, 1*SIZE($24)
|
|
+ SXADDQ $21, $24, $24
|
|
+
|
|
+ ADD $f18, $f14, $f18
|
|
+ subl $4, 1, $4
|
|
+ LD $f14, 0*SIZE($24)
|
|
+ unop
|
|
+
|
|
+ ADD $f19, $f15, $f19
|
|
+ unop
|
|
+ LD $f15, 1*SIZE($24)
|
|
+ SXADDQ $21, $24, $24
|
|
+
|
|
+ ST $f16, 0*SIZE($20)
|
|
+ ST $f17, 1*SIZE($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+ unop
|
|
+
|
|
+ ST $f18, 0*SIZE($20)
|
|
+ ST $f19, 1*SIZE($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+ bgt $4, $SubMainLoop
|
|
+ .align 4
|
|
+
|
|
+$SubMainLoopEnd:
|
|
+ MUL $f29, $f0, $f20
|
|
+ MUL $f30, $f1, $f21
|
|
+ MUL $f30, $f0, $f22
|
|
+ MUL $f29, $f1, $f23
|
|
+
|
|
+ MUL $f29, $f2, $f24
|
|
+ MUL $f30, $f3, $f25
|
|
+ MUL $f30, $f2, $f26
|
|
+ MUL $f29, $f3, $f27
|
|
+
|
|
+ ADD1 $f20, $f21, $f16
|
|
+ MUL $f29, $f4, $f20
|
|
+ ADD2 $f22, $f23, $f17
|
|
+ MUL $f30, $f5, $f21
|
|
+
|
|
+ ADD1 $f24, $f25, $f18
|
|
+ MUL $f30, $f4, $f22
|
|
+ ADD2 $f26, $f27, $f19
|
|
+ MUL $f29, $f5, $f23
|
|
+
|
|
+ ADD $f16, $f8, $f16
|
|
+ MUL $f29, $f6, $f24
|
|
+ ADD $f17, $f28, $f17
|
|
+ MUL $f30, $f7, $f25
|
|
+
|
|
+ ADD $f18, $f10, $f18
|
|
+ MUL $f30, $f6, $f26
|
|
+ ADD $f19, $f11, $f19
|
|
+ MUL $f29, $f7, $f27
|
|
+
|
|
+ ST $f16, 0*SIZE($20)
|
|
+ ADD1 $f20, $f21, $f16
|
|
+ ST $f17, 1*SIZE($20)
|
|
+ ADD2 $f22, $f23, $f17
|
|
+
|
|
+ SXADDQ $21, $20, $20
|
|
+ nop
|
|
+ ST $f18, 0*SIZE($20)
|
|
+ ADD1 $f24, $f25, $f18
|
|
+
|
|
+ ST $f19, 1*SIZE($20)
|
|
+ ADD2 $f26, $f27, $f19
|
|
+ SXADDQ $21, $20, $20
|
|
+ ADD $f16, $f12, $f16
|
|
+
|
|
+ ADD $f17, $f13, $f17
|
|
+ ADD $f18, $f14, $f18
|
|
+ ADD $f19, $f15, $f19
|
|
+
|
|
+ ST $f16, 0*SIZE($20)
|
|
+ ST $f17, 1*SIZE($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+
|
|
+ ST $f18, 0*SIZE($20)
|
|
+ ST $f19, 1*SIZE($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+ ble $5, $SubEnd
|
|
+ .align 4
|
|
+
|
|
+$SubRemain:
|
|
+ subl $5, 1, $6
|
|
+ ble $5, $SubEnd
|
|
+ LD $f0, 0*SIZE($18)
|
|
+ LD $f1, 1*SIZE($18)
|
|
+
|
|
+ LD $f8, 0*SIZE($20)
|
|
+ LD $f28, 1*SIZE($20)
|
|
+ SXADDQ $19, $18, $18
|
|
+ SXADDQ $21, $20, $24
|
|
+ ble $6, $SubRemainLoopEnd
|
|
+ .align 4
|
|
+
|
|
+$SubRemainLoop:
|
|
+ MUL $f29, $f0, $f20
|
|
+ MUL $f30, $f1, $f21
|
|
+ MUL $f30, $f0, $f22
|
|
+ LD $f0, 0*SIZE($18)
|
|
+
|
|
+ MUL $f29, $f1, $f23
|
|
+ LD $f1, 1*SIZE($18)
|
|
+ ADD1 $f20, $f21, $f16
|
|
+ SXADDQ $19, $18, $18
|
|
+
|
|
+ ADD2 $f22, $f23, $f17
|
|
+ nop
|
|
+ ADD $f16, $f8, $f16
|
|
+ LD $f8, 0*SIZE($24)
|
|
+
|
|
+ ADD $f17, $f28, $f17
|
|
+ LD $f28, 1*SIZE($24)
|
|
+ SXADDQ $21, $24, $24
|
|
+ subl $6, 1, $6
|
|
+
|
|
+ ST $f16, 0*SIZE($20)
|
|
+ ST $f17, 1*SIZE($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+ bgt $6, $SubRemainLoop
|
|
+ .align 4
|
|
+
|
|
+$SubRemainLoopEnd:
|
|
+ MUL $f29, $f0, $f20
|
|
+ MUL $f30, $f1, $f21
|
|
+ MUL $f30, $f0, $f22
|
|
+ MUL $f29, $f1, $f23
|
|
+
|
|
+ ADD1 $f20, $f21, $f16
|
|
+ ADD2 $f22, $f23, $f17
|
|
+ ADD $f16, $f8, $f16
|
|
+ ADD $f17, $f28, $f17
|
|
+
|
|
+ ST $f16, 0*SIZE($20)
|
|
+ nop
|
|
+ ST $f17, 1*SIZE($20)
|
|
+ nop
|
|
+ .align 4
|
|
+
|
|
+$SubEnd:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 8($sp)
|
|
+ fldd $f4, 16($sp)
|
|
+ fldd $f5, 24($sp)
|
|
+ fldd $f6, 32($sp)
|
|
+ fldd $f7, 40($sp)
|
|
+ fldd $f8, 48($sp)
|
|
+ ldi $sp, 64($sp)
|
|
+ ret
|
|
+ EPILOGUE
|
|
diff --git a/kernel/sw_64/zaxpy_simd.S b/kernel/sw_64/zaxpy_simd.S
|
|
new file mode 100644
|
|
index 0000000..a823ebf
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/zaxpy_simd.S
|
|
@@ -0,0 +1,1479 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCHSIZE 128
|
|
+
|
|
+#ifndef CONJ
|
|
+#define ADD1 SUB
|
|
+#define ADD2 ADD
|
|
+
|
|
+#define VADD1 VSUB
|
|
+#define VADD2 VADD
|
|
+#define VMAD1 VNMAD
|
|
+#define VMAD2 VMAD
|
|
+
|
|
+#else
|
|
+#define ADD1 ADD
|
|
+#define ADD2 SUB
|
|
+
|
|
+#define VADD1 VADD
|
|
+#define VADD2 VSUB
|
|
+#define VMAD1 VMAD
|
|
+#define VMAD2 VNMAD
|
|
+
|
|
+#endif
|
|
+
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+ .frame $sp, 64, $26, 0
|
|
+
|
|
+ ldl $19, 0($sp)
|
|
+ fmov $f19, $f29
|
|
+ ldl $20, 8($sp)
|
|
+ fmov $f20, $f30
|
|
+
|
|
+ mov $21, $18
|
|
+ ldl $21, 16($sp)
|
|
+ ldi $sp, -64($sp)
|
|
+ nop
|
|
+
|
|
+ fstd $f2, 0($sp)
|
|
+ cmpeq $19, 1, $1
|
|
+ fstd $f3, 8($sp)
|
|
+ cmpeq $21, 1, $2
|
|
+
|
|
+ fstd $f4, 16($sp)
|
|
+ fstd $f5, 24($sp)
|
|
+ fstd $f6, 32($sp)
|
|
+ nop
|
|
+
|
|
+ fstd $f7, 40($sp)
|
|
+ fstd $f8, 48($sp)
|
|
+#ifndef PROFILE
|
|
+ .prologue 0
|
|
+#else
|
|
+ .prologue 1
|
|
+#endif
|
|
+/*
|
|
+	unroll by 8: process 8 complex = 16 float/double elements
|
|
+*/
|
|
+ and $1, $2, $1
|
|
+ ble $16, $End
|
|
+ sra $16, 3, $4
|
|
+ and $16, 7, $5
|
|
+
|
|
+ beq $1, $Sub
|
|
+ ble $4, $Remain
|
|
+ subl $4, 1, $4
|
|
+ nop
|
|
+/* extend alpha_r and alpha_i to vectors (broadcast to every lane) */
|
|
+
|
|
+ vcpyf $f29, $f29
|
|
+ vcpyf $f30, $f30
|
|
+
|
|
+/**
|
|
+	check alignment:
|
|
+	test the addresses of Y & X
|
|
+**/
|
|
+ and $20, (VEC_LEN*SIZE-1), $6
|
|
+ bgt $6, $UnAlign_Y_ACCESS
|
|
+
|
|
+ and $18, (VEC_LEN*SIZE-1), $7
|
|
+ nop
|
|
+ nop
|
|
+ bgt $7, $UnAlign_X_ACCESS
|
|
+
|
|
+ .align 4
|
|
+
|
|
+ VLD $f0, 0*VEC_LEN*SIZE($18)
|
|
+ VLD $f1, 1*VEC_LEN*SIZE($18)
|
|
+ VLD $f2, 2*VEC_LEN*SIZE($18)
|
|
+ VLD $f3, 3*VEC_LEN*SIZE($18)
|
|
+
|
|
+/*
|
|
+ LD $f0, 0*SIZE($18)
|
|
+ LD $f1, 1*SIZE($18)
|
|
+ LD $f2, 2*SIZE($18)
|
|
+ LD $f3, 3*SIZE($18)
|
|
+
|
|
+ LD $f4, 4*SIZE($18)
|
|
+ LD $f5, 5*SIZE($18)
|
|
+ LD $f6, 6*SIZE($18)
|
|
+ LD $f7, 7*SIZE($18)
|
|
+*/
|
|
+
|
|
+ VLD $f8, 0*VEC_LEN*SIZE($20)
|
|
+ VLD $f28, 1*VEC_LEN*SIZE($20)
|
|
+ VLD $f10, 2*VEC_LEN*SIZE($20)
|
|
+ VLD $f11, 3*VEC_LEN*SIZE($20)
|
|
+
|
|
+/*
|
|
+ LD $f8, 0*SIZE($20)
|
|
+ LD $f28, 1*SIZE($20)
|
|
+ LD $f10, 2*SIZE($20)
|
|
+ LD $f11, 3*SIZE($20)
|
|
+ LD $f12, 4*SIZE($20)
|
|
+ LD $f13, 5*SIZE($20)
|
|
+ LD $f14, 6*SIZE($20)
|
|
+ LD $f15, 7*SIZE($20)
|
|
+*/
|
|
+ addl $18, 16*SIZE, $18
|
|
+ ble $4, $MainLoopEnd
|
|
+ .align 4
|
|
+
|
|
+$MainLoop:
|
|
+/*
|
|
+ fillcs PREFETCHSIZE * SIZE($20)
|
|
+ fillcs PREFETCHSIZE * SIZE($18)
|
|
+*/
|
|
+ fillcs PREFETCHSIZE * SIZE($20)
|
|
+ fillcs PREFETCHSIZE * SIZE($18)
|
|
+
|
|
+/*split the complex vectors into real vectors ($f0) and imaginary vectors ($f1)*/
|
|
+ vextf $f0, 1, $f4
|
|
+ vextf $f0, 3, $f5
|
|
+ vextf $f1, 0, $f6
|
|
+ vextf $f1, 2, $f7
|
|
+
|
|
+ vextf $f2, 1, $f12
|
|
+ vextf $f2, 3, $f13
|
|
+ vextf $f3, 0, $f14
|
|
+ vextf $f3, 2, $f15
|
|
+
|
|
+ vinsf $f4, $f1, 0, $f1
|
|
+ vinsf $f5, $f1, 2, $f1
|
|
+ vinsf $f6, $f0, 1, $f0
|
|
+ vinsf $f7, $f0, 3, $f0
|
|
+
|
|
+ vinsf $f12, $f3, 0, $f3
|
|
+ vinsf $f13, $f3, 2, $f3
|
|
+ vinsf $f14, $f2, 1, $f2
|
|
+ vinsf $f15, $f2, 3, $f2
|
|
+
|
|
+/*Compute*/
|
|
+ VMUL $f29, $f0, $f20
|
|
+ VMUL $f30, $f0, $f21
|
|
+ VMUL $f29, $f2, $f22
|
|
+ VMUL $f30, $f2, $f23
|
|
+
|
|
+
|
|
+ VMAD1 $f30, $f1, $f20, $f16
|
|
+ VMAD2 $f29, $f1, $f21, $f17
|
|
+ VMAD1 $f30, $f3, $f22, $f18
|
|
+ VMAD2 $f29, $f3, $f23, $f19
|
|
+
|
|
+ VLD $f0, 0*VEC_LEN*SIZE($18)
|
|
+ VLD $f1, 1*VEC_LEN*SIZE($18)
|
|
+ VLD $f2, 2*VEC_LEN*SIZE($18)
|
|
+ VLD $f3, 3*VEC_LEN*SIZE($18)
|
|
+
|
|
+/*combine the real & imaginary vectors back into complex vectors*/
|
|
+ vextf $f16, 1, $f24
|
|
+ vextf $f16, 3, $f25
|
|
+ vextf $f17, 0, $f26
|
|
+ vextf $f17, 2, $f27
|
|
+
|
|
+ vextf $f18, 1, $f12
|
|
+ vextf $f18, 3, $f13
|
|
+ vextf $f19, 0, $f14
|
|
+ vextf $f19, 2, $f15
|
|
+
|
|
+ vinsf $f24, $f17, 0, $f17
|
|
+ addl $20, 16*SIZE, $20
|
|
+ vinsf $f25, $f17, 2, $f17
|
|
+ addl $18, 16*SIZE, $18
|
|
+
|
|
+ vinsf $f26, $f16, 1, $f16
|
|
+ subl $4, 1, $4
|
|
+ vinsf $f27, $f16, 3, $f16
|
|
+ nop
|
|
+
|
|
+ vinsf $f12, $f19, 0, $f19
|
|
+ vinsf $f13, $f19, 2, $f19
|
|
+ vinsf $f14, $f18, 1, $f18
|
|
+ vinsf $f15, $f18, 3, $f18
|
|
+
|
|
+ VADD $f16, $f8, $f16
|
|
+ VLD $f8, 0*VEC_LEN*SIZE($20)
|
|
+ VADD $f17, $f28, $f17
|
|
+ VLD $f28, 1*VEC_LEN*SIZE($20)
|
|
+
|
|
+ VADD $f18, $f10, $f18
|
|
+ VLD $f10, 2*VEC_LEN*SIZE($20)
|
|
+ VADD $f19, $f11, $f19
|
|
+ VLD $f11, 3*VEC_LEN*SIZE($20)
|
|
+
|
|
+ VST $f16, -4*VEC_LEN*SIZE($20)
|
|
+ VST $f17, -3*VEC_LEN*SIZE($20)
|
|
+ VST $f18, -2*VEC_LEN*SIZE($20)
|
|
+ VST $f19, -1*VEC_LEN*SIZE($20)
|
|
+
|
|
+/*
|
|
+ MUL $f29, $f0, $f20
|
|
+ fillcs 9*SIZE($18)
|
|
+ MUL $f30, $f1, $f21
|
|
+ unop
|
|
+
|
|
+ MUL $f30, $f0, $f22
|
|
+ LD $f0, 0*SIZE($18)
|
|
+ MUL $f29, $f1, $f23
|
|
+ LD $f1, 1*SIZE($18)
|
|
+
|
|
+ MUL $f29, $f2, $f24
|
|
+ unop
|
|
+ MUL $f30, $f3, $f25
|
|
+ nop
|
|
+
|
|
+ MUL $f30, $f2, $f26
|
|
+ LD $f2, 2*SIZE($18)
|
|
+ MUL $f29, $f3, $f27
|
|
+ LD $f3, 3*SIZE($18)
|
|
+
|
|
+ ADD1 $f20, $f21, $f16
|
|
+ MUL $f29, $f4, $f20
|
|
+ ADD2 $f22, $f23, $f17
|
|
+ MUL $f30, $f5, $f21
|
|
+
|
|
+ ADD1 $f24, $f25, $f18
|
|
+ unop
|
|
+ MUL $f30, $f4, $f22
|
|
+ LD $f4, 4*SIZE($18)
|
|
+
|
|
+ ADD2 $f26, $f27, $f19
|
|
+ addl $20, 8*SIZE, $20
|
|
+ MUL $f29, $f5, $f23
|
|
+ LD $f5, 5*SIZE($18)
|
|
+
|
|
+ ADD $f16, $f8, $f16
|
|
+ LD $f8, 0*SIZE($20)
|
|
+ MUL $f29, $f6, $f24
|
|
+ unop
|
|
+
|
|
+ ADD $f17, $f28, $f17
|
|
+ LD $f28, 1*SIZE($20)
|
|
+ MUL $f30, $f7, $f25
|
|
+ unop
|
|
+
|
|
+ ADD $f18, $f10, $f18
|
|
+ LD $f10, 2*SIZE($20)
|
|
+ MUL $f30, $f6, $f26
|
|
+ LD $f6, 6*SIZE($18)
|
|
+
|
|
+ ADD $f19, $f11, $f19
|
|
+ LD $f11, 3*SIZE($20)
|
|
+ MUL $f29, $f7, $f27
|
|
+ LD $f7, 7*SIZE($18)
|
|
+
|
|
+ ST $f16,-8*SIZE($20)
|
|
+ ADD1 $f20, $f21, $f16
|
|
+ ST $f17,-7*SIZE($20)
|
|
+ ADD2 $f22, $f23, $f17
|
|
+
|
|
+ ST $f18,-6*SIZE($20)
|
|
+ ADD1 $f24, $f25, $f18
|
|
+ ST $f19,-5*SIZE($20)
|
|
+ ADD2 $f26, $f27, $f19
|
|
+
|
|
+ ADD $f16, $f12, $f16
|
|
+ LD $f12, 4*SIZE($20)
|
|
+ ADD $f17, $f13, $f17
|
|
+ LD $f13, 5*SIZE($20)
|
|
+ ADD $f18, $f14, $f18
|
|
+ LD $f14, 6*SIZE($20)
|
|
+ ADD $f19, $f15, $f19
|
|
+ LD $f15, 7*SIZE($20)
|
|
+
|
|
+ ST $f16,-4*SIZE($20)
|
|
+
|
|
+ ST $f17,-3*SIZE($20)
|
|
+
|
|
+
|
|
+ ST $f18,-2*SIZE($20)
|
|
+ nop
|
|
+ ST $f19,-1*SIZE($20)
|
|
+*/
|
|
+ bgt $4, $MainLoop
|
|
+ .align 4
|
|
+
|
|
+$MainLoopEnd:
|
|
+
|
|
+/*split the complex vectors into real vectors ($f0) and imaginary vectors ($f1)*/
|
|
+ vextf $f0, 1, $f4
|
|
+ vextf $f0, 3, $f5
|
|
+ vextf $f1, 0, $f6
|
|
+ vextf $f1, 2, $f7
|
|
+
|
|
+ vextf $f2, 1, $f12
|
|
+ vextf $f2, 3, $f13
|
|
+ vextf $f3, 0, $f14
|
|
+ vextf $f3, 2, $f15
|
|
+
|
|
+ vinsf $f4, $f1, 0, $f1
|
|
+ vinsf $f5, $f1, 2, $f1
|
|
+ vinsf $f6, $f0, 1, $f0
|
|
+ vinsf $f7, $f0, 3, $f0
|
|
+
|
|
+ vinsf $f12, $f3, 0, $f3
|
|
+ vinsf $f13, $f3, 2, $f3
|
|
+ vinsf $f14, $f2, 1, $f2
|
|
+ vinsf $f15, $f2, 3, $f2
|
|
+
|
|
+ VMUL $f29, $f0, $f20
|
|
+ VMUL $f30, $f0, $f21
|
|
+ VMUL $f29, $f2, $f22
|
|
+ VMUL $f30, $f2, $f23
|
|
+
|
|
+ VMAD1 $f30, $f1, $f20, $f16
|
|
+ VMAD2 $f29, $f1, $f21, $f17
|
|
+ VMAD1 $f30, $f3, $f22, $f18
|
|
+ VMAD2 $f29, $f3, $f23, $f19
|
|
+
|
|
+/*combine the real (f16,f18) & imaginary (f17,f19) vectors back into complex vectors*/
|
|
+ vextf $f16, 1, $f24
|
|
+ vextf $f16, 3, $f25
|
|
+ vextf $f17, 0, $f26
|
|
+ vextf $f17, 2, $f27
|
|
+
|
|
+ vextf $f18, 1, $f12
|
|
+ vextf $f18, 3, $f13
|
|
+ vextf $f19, 0, $f14
|
|
+ vextf $f19, 2, $f15
|
|
+
|
|
+ vinsf $f24, $f17, 0, $f17
|
|
+ vinsf $f25, $f17, 2, $f17
|
|
+ vinsf $f26, $f16, 1, $f16
|
|
+ vinsf $f27, $f16, 3, $f16
|
|
+
|
|
+ vinsf $f12, $f19, 0, $f19
|
|
+ vinsf $f13, $f19, 2, $f19
|
|
+ vinsf $f14, $f18, 1, $f18
|
|
+ vinsf $f15, $f18, 3, $f18
|
|
+
|
|
+ VADD $f16, $f8, $f16
|
|
+ VADD $f17, $f28, $f17
|
|
+ VADD $f18, $f10, $f18
|
|
+ VADD $f19, $f11, $f19
|
|
+
|
|
+ VST $f16, 0*VEC_LEN*SIZE($20)
|
|
+ VST $f17, 1*VEC_LEN*SIZE($20)
|
|
+ VST $f18, 2*VEC_LEN*SIZE($20)
|
|
+ VST $f19, 3*VEC_LEN*SIZE($20)
|
|
+
|
|
+ addl $20, 16*SIZE, $20
|
|
+ ble $5, $End
|
|
+
|
|
+/* MUL $f29, $f0, $f20
|
|
+ MUL $f30, $f1, $f21
|
|
+ MUL $f30, $f0, $f22
|
|
+ MUL $f29, $f1, $f23
|
|
+
|
|
+ MUL $f29, $f2, $f24
|
|
+ MUL $f30, $f3, $f25
|
|
+ MUL $f30, $f2, $f26
|
|
+ MUL $f29, $f3, $f27
|
|
+
|
|
+ ADD1 $f20, $f21, $f16
|
|
+ MUL $f29, $f4, $f20
|
|
+ ADD2 $f22, $f23, $f17
|
|
+ MUL $f30, $f5, $f21
|
|
+
|
|
+ ADD1 $f24, $f25, $f18
|
|
+ MUL $f30, $f4, $f22
|
|
+ ADD2 $f26, $f27, $f19
|
|
+ MUL $f29, $f5, $f23
|
|
+
|
|
+ ADD $f16, $f8, $f16
|
|
+ MUL $f29, $f6, $f24
|
|
+ ADD $f17, $f28, $f17
|
|
+ MUL $f30, $f7, $f25
|
|
+
|
|
+ ADD $f18, $f10, $f18
|
|
+ MUL $f30, $f6, $f26
|
|
+ ADD $f19, $f11, $f19
|
|
+ MUL $f29, $f7, $f27
|
|
+
|
|
+ ST $f16, 0*SIZE($20)
|
|
+ ADD1 $f20, $f21, $f16
|
|
+ ST $f17, 1*SIZE($20)
|
|
+ ADD2 $f22, $f23, $f17
|
|
+
|
|
+ ST $f18, 2*SIZE($20)
|
|
+ ADD1 $f24, $f25, $f18
|
|
+ ST $f19, 3*SIZE($20)
|
|
+ ADD2 $f26, $f27, $f19
|
|
+
|
|
+ ADD $f16, $f12, $f16
|
|
+ ADD $f17, $f13, $f17
|
|
+ ADD $f18, $f14, $f18
|
|
+ ADD $f19, $f15, $f19
|
|
+
|
|
+ ST $f16, 4*SIZE($20)
|
|
+ ST $f17, 5*SIZE($20)
|
|
+ ST $f18, 6*SIZE($20)
|
|
+ ST $f19, 7*SIZE($20)
|
|
+
|
|
+ unop
|
|
+ unop
|
|
+*/
|
|
+ .align 4
|
|
+
|
|
+$Remain:
|
|
+ subl $5, 1, $6
|
|
+ ble $5, $End
|
|
+ LD $f0, 0*SIZE($18)
|
|
+ LD $f1, 1*SIZE($18)
|
|
+
|
|
+ LD $f8, 0*SIZE($20)
|
|
+ LD $f28, 1*SIZE($20)
|
|
+ addl $18, 2*SIZE, $18
|
|
+ ble $6, $RemainLoopEnd
|
|
+ .align 4
|
|
+
|
|
+$RemainLoop:
|
|
+ MUL $f29, $f0, $f20
|
|
+ subl $6, 1, $6
|
|
+ MUL $f30, $f1, $f21
|
|
+ addl $20, 2*SIZE, $20
|
|
+
|
|
+ MUL $f30, $f0, $f22
|
|
+ LD $f0, 0*SIZE($18)
|
|
+ MUL $f29, $f1, $f23
|
|
+ LD $f1, 1*SIZE($18)
|
|
+
|
|
+ ADD1 $f20, $f21, $f16
|
|
+ ADD2 $f22, $f23, $f17
|
|
+ ADD $f16, $f8, $f16
|
|
+ LD $f8, 0*SIZE($20)
|
|
+ ADD $f17, $f28, $f17
|
|
+ LD $f28, 1*SIZE($20)
|
|
+
|
|
+ ST $f16,-2*SIZE($20)
|
|
+ addl $18, 2*SIZE, $18
|
|
+ ST $f17,-1*SIZE($20)
|
|
+ bgt $6, $RemainLoop
|
|
+ .align 4
|
|
+
|
|
+$RemainLoopEnd:
|
|
+ MUL $f29, $f0, $f20
|
|
+ MUL $f30, $f1, $f21
|
|
+ MUL $f30, $f0, $f22
|
|
+ MUL $f29, $f1, $f23
|
|
+
|
|
+ ADD1 $f20, $f21, $f16
|
|
+ ADD2 $f22, $f23, $f17
|
|
+ ADD $f16, $f8, $f16
|
|
+ ADD $f17, $f28, $f17
|
|
+
|
|
+ ST $f16, 0*SIZE($20)
|
|
+ nop
|
|
+ ST $f17, 1*SIZE($20)
|
|
+ nop
|
|
+ .align 4
|
|
+
|
|
+$End:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 8($sp)
|
|
+ fldd $f4, 16($sp)
|
|
+ fldd $f5, 24($sp)
|
|
+ fldd $f6, 32($sp)
|
|
+ fldd $f7, 40($sp)
|
|
+ fldd $f8, 48($sp)
|
|
+ ldi $sp, 64($sp)
|
|
+ ret
|
|
+ .align 4
|
|
+
|
|
+$UnAlign_Y_ACCESS:
|
|
+ and $18, (VEC_LEN*SIZE-1), $7
|
|
+ nop
|
|
+ nop
|
|
+ bgt $7, $UnAlign_XY_ACCESS
|
|
+ .align 4
|
|
+/*
|
|
+	Unaligned access to Y, aligned access to X
|
|
+*/
|
|
+
|
|
+ VLD_UL $f8, 0*VEC_LEN*SIZE($20)
|
|
+ VLD_UH $f12, 1*VEC_LEN*SIZE($20)
|
|
+
|
|
+ VLD_UL $f28, 1*VEC_LEN*SIZE($20)
|
|
+ VLD_UH $f13, 2*VEC_LEN*SIZE($20)
|
|
+
|
|
+ VLD_UL $f10, 2*VEC_LEN*SIZE($20)
|
|
+ VLD_UH $f14, 3*VEC_LEN*SIZE($20)
|
|
+
|
|
+ VLD_UL $f11, 3*VEC_LEN*SIZE($20)
|
|
+ VLD_UH $f15, 4*VEC_LEN*SIZE($20)
|
|
+
|
|
+ VLD $f0, 0*VEC_LEN*SIZE($18)
|
|
+ VLD $f1, 1*VEC_LEN*SIZE($18)
|
|
+ VLD $f2, 2*VEC_LEN*SIZE($18)
|
|
+ VLD $f3, 3*VEC_LEN*SIZE($18)
|
|
+
|
|
+ vbisw $f8, $f12, $f8
|
|
+ vbisw $f28, $f13, $f28
|
|
+ vbisw $f10, $f14, $f10
|
|
+ vbisw $f11, $f15, $f11
|
|
+
|
|
+ addl $18, 16*SIZE, $18
|
|
+ ble $4, $UnAlign_Y_MainLoopEnd
|
|
+ .align 4
|
|
+$UnAlign_Y_MainLoop:
|
|
+ fillcs PREFETCHSIZE * SIZE($20)
|
|
+ fillcs PREFETCHSIZE * SIZE($18)
|
|
+
|
|
+/*split the complex vectors into real vectors ($f0) and imaginary vectors ($f1)*/
|
|
+ vextf $f0, 1, $f4
|
|
+ vextf $f0, 3, $f5
|
|
+ vextf $f1, 0, $f6
|
|
+ vextf $f1, 2, $f7
|
|
+
|
|
+ vextf $f2, 1, $f12
|
|
+ vextf $f2, 3, $f13
|
|
+ vextf $f3, 0, $f14
|
|
+ vextf $f3, 2, $f15
|
|
+
|
|
+ vinsf $f4, $f1, 0, $f1
|
|
+ vinsf $f5, $f1, 2, $f1
|
|
+ vinsf $f6, $f0, 1, $f0
|
|
+ vinsf $f7, $f0, 3, $f0
|
|
+
|
|
+ vinsf $f12, $f3, 0, $f3
|
|
+ vinsf $f13, $f3, 2, $f3
|
|
+ vinsf $f14, $f2, 1, $f2
|
|
+ vinsf $f15, $f2, 3, $f2
|
|
+
|
|
+/*Compute*/
|
|
+ VMUL $f29, $f0, $f20
|
|
+ VMUL $f30, $f0, $f21
|
|
+ VMUL $f29, $f2, $f22
|
|
+ VMUL $f30, $f2, $f23
|
|
+
|
|
+
|
|
+ VMAD1 $f30, $f1, $f20, $f16
|
|
+ VMAD2 $f29, $f1, $f21, $f17
|
|
+ VMAD1 $f30, $f3, $f22, $f18
|
|
+ VMAD2 $f29, $f3, $f23, $f19
|
|
+
|
|
+ VLD $f0, 0*VEC_LEN*SIZE($18)
|
|
+ VLD $f1, 1*VEC_LEN*SIZE($18)
|
|
+ VLD $f2, 2*VEC_LEN*SIZE($18)
|
|
+ VLD $f3, 3*VEC_LEN*SIZE($18)
|
|
+
|
|
+
|
|
+/*combine the real & imaginary vectors back into complex vectors*/
|
|
+ vextf $f16, 1, $f24
|
|
+ vextf $f16, 3, $f25
|
|
+ vextf $f17, 0, $f26
|
|
+ vextf $f17, 2, $f27
|
|
+
|
|
+ vextf $f18, 1, $f12
|
|
+ vextf $f18, 3, $f13
|
|
+ vextf $f19, 0, $f14
|
|
+ vextf $f19, 2, $f15
|
|
+
|
|
+ vinsf $f24, $f17, 0, $f17
|
|
+ addl $20, 16*SIZE, $20
|
|
+ vinsf $f25, $f17, 2, $f17
|
|
+ addl $18, 16*SIZE, $18
|
|
+
|
|
+ vinsf $f26, $f16, 1, $f16
|
|
+ subl $4, 1, $4
|
|
+ vinsf $f27, $f16, 3, $f16
|
|
+ nop
|
|
+
|
|
+ vinsf $f12, $f19, 0, $f19
|
|
+ vinsf $f13, $f19, 2, $f19
|
|
+ vinsf $f14, $f18, 1, $f18
|
|
+ vinsf $f15, $f18, 3, $f18
|
|
+
|
|
+ VADD $f16, $f8, $f16
|
|
+ VLD_UL $f8, 0*VEC_LEN*SIZE($20)
|
|
+ VLD_UH $f12, 1*VEC_LEN*SIZE($20)
|
|
+
|
|
+ VADD $f17, $f28, $f17
|
|
+ VLD_UL $f28, 1*VEC_LEN*SIZE($20)
|
|
+ VLD_UH $f13, 2*VEC_LEN*SIZE($20)
|
|
+
|
|
+
|
|
+ VADD $f18, $f10, $f18
|
|
+ VLD_UL $f10, 2*VEC_LEN*SIZE($20)
|
|
+ VLD_UH $f14, 3*VEC_LEN*SIZE($20)
|
|
+
|
|
+ VADD $f19, $f11, $f19
|
|
+ VLD_UL $f11, 3*VEC_LEN*SIZE($20)
|
|
+ VLD_UH $f15, 4*VEC_LEN*SIZE($20)
|
|
+
|
|
+
|
|
+ vbisw $f8, $f12, $f8
|
|
+ VST_UL $f16, -4*VEC_LEN*SIZE($20)
|
|
+ VST_UH $f16, -3*VEC_LEN*SIZE($20)
|
|
+
|
|
+ vbisw $f28, $f13, $f28
|
|
+ VST_UL $f17, -3*VEC_LEN*SIZE($20)
|
|
+ VST_UH $f17, -2*VEC_LEN*SIZE($20)
|
|
+
|
|
+ vbisw $f10, $f14, $f10
|
|
+ VST_UL $f18, -2*VEC_LEN*SIZE($20)
|
|
+ VST_UH $f18, -1*VEC_LEN*SIZE($20)
|
|
+
|
|
+ vbisw $f11, $f15, $f11
|
|
+ VST_UL $f19, -1*VEC_LEN*SIZE($20)
|
|
+ VST_UH $f19, 0*VEC_LEN*SIZE($20)
|
|
+
|
|
+ bgt $4, $UnAlign_Y_MainLoop
|
|
+
|
|
+$UnAlign_Y_MainLoopEnd:
|
|
+/*split the complex vectors into real vectors ($f0) and imaginary vectors ($f1)*/
|
|
+ vextf $f0, 1, $f4
|
|
+ vextf $f0, 3, $f5
|
|
+ vextf $f1, 0, $f6
|
|
+ vextf $f1, 2, $f7
|
|
+
|
|
+ vextf $f2, 1, $f12
|
|
+ vextf $f2, 3, $f13
|
|
+ vextf $f3, 0, $f14
|
|
+ vextf $f3, 2, $f15
|
|
+
|
|
+ vinsf $f4, $f1, 0, $f1
|
|
+ vinsf $f5, $f1, 2, $f1
|
|
+ vinsf $f6, $f0, 1, $f0
|
|
+ vinsf $f7, $f0, 3, $f0
|
|
+
|
|
+ vinsf $f12, $f3, 0, $f3
|
|
+ vinsf $f13, $f3, 2, $f3
|
|
+ vinsf $f14, $f2, 1, $f2
|
|
+ vinsf $f15, $f2, 3, $f2
|
|
+
|
|
+ VMUL $f29, $f0, $f20
|
|
+ VMUL $f30, $f0, $f21
|
|
+ VMUL $f29, $f2, $f22
|
|
+ VMUL $f30, $f2, $f23
|
|
+
|
|
+ VMAD1 $f30, $f1, $f20, $f16
|
|
+ VMAD2 $f29, $f1, $f21, $f17
|
|
+ VMAD1 $f30, $f3, $f22, $f18
|
|
+ VMAD2 $f29, $f3, $f23, $f19
|
|
+
|
|
+/*combine the real (f16,f18) & imaginary (f17,f19) vectors back into complex vectors*/
|
|
+ vextf $f16, 1, $f24
|
|
+ vextf $f16, 3, $f25
|
|
+ vextf $f17, 0, $f26
|
|
+ vextf $f17, 2, $f27
|
|
+
|
|
+ vextf $f18, 1, $f12
|
|
+ vextf $f18, 3, $f13
|
|
+ vextf $f19, 0, $f14
|
|
+ vextf $f19, 2, $f15
|
|
+
|
|
+ vinsf $f24, $f17, 0, $f17
|
|
+ vinsf $f25, $f17, 2, $f17
|
|
+ vinsf $f26, $f16, 1, $f16
|
|
+ vinsf $f27, $f16, 3, $f16
|
|
+
|
|
+ vinsf $f12, $f19, 0, $f19
|
|
+ vinsf $f13, $f19, 2, $f19
|
|
+ vinsf $f14, $f18, 1, $f18
|
|
+ vinsf $f15, $f18, 3, $f18
|
|
+
|
|
+ VADD $f16, $f8, $f16
|
|
+ VADD $f17, $f28, $f17
|
|
+ VADD $f18, $f10, $f18
|
|
+ VADD $f19, $f11, $f19
|
|
+
|
|
+ VST_UL $f16, 0*VEC_LEN*SIZE($20)
|
|
+ VST_UH $f16, 1*VEC_LEN*SIZE($20)
|
|
+ VST_UL $f17, 1*VEC_LEN*SIZE($20)
|
|
+ VST_UH $f17, 2*VEC_LEN*SIZE($20)
|
|
+
|
|
+ VST_UL $f18, 2*VEC_LEN*SIZE($20)
|
|
+ VST_UH $f18, 3*VEC_LEN*SIZE($20)
|
|
+ VST_UL $f19, 3*VEC_LEN*SIZE($20)
|
|
+ VST_UH $f19, 4*VEC_LEN*SIZE($20)
|
|
+
|
|
+ addl $20, 16*SIZE, $20
|
|
+ ble $5, $End
|
|
+
|
|
+ jmp $Remain
|
|
+
|
|
+ .align 4
|
|
+
|
|
+
|
|
+$UnAlign_X_ACCESS:
|
|
+ and $20, (VEC_LEN*SIZE-1), $6
|
|
+ nop
|
|
+ nop
|
|
+ bgt $6, $UnAlign_XY_ACCESS
|
|
+
|
|
+ .align 4
|
|
+/*
|
|
+	Unaligned access to X, aligned access to Y
|
|
+*/
|
|
+ VLD_UL $f0, 0*VEC_LEN*SIZE($18)
|
|
+ VLD_UH $f4, 1*VEC_LEN*SIZE($18)
|
|
+
|
|
+ VLD_UL $f1, 1*VEC_LEN*SIZE($18)
|
|
+ VLD_UH $f5, 2*VEC_LEN*SIZE($18)
|
|
+
|
|
+ VLD_UL $f2, 2*VEC_LEN*SIZE($18)
|
|
+ VLD_UH $f6, 3*VEC_LEN*SIZE($18)
|
|
+
|
|
+ VLD_UL $f3, 3*VEC_LEN*SIZE($18)
|
|
+ VLD_UH $f7, 4*VEC_LEN*SIZE($18)
|
|
+
|
|
+ VLD $f8, 0*VEC_LEN*SIZE($20)
|
|
+ VLD $f28, 1*VEC_LEN*SIZE($20)
|
|
+ VLD $f10, 2*VEC_LEN*SIZE($20)
|
|
+ VLD $f11, 3*VEC_LEN*SIZE($20)
|
|
+
|
|
+ vbisw $f0, $f4, $f0
|
|
+ vbisw $f1, $f5, $f1
|
|
+ vbisw $f2, $f6, $f2
|
|
+ vbisw $f3, $f7, $f3
|
|
+
|
|
+ addl $18, 16*SIZE, $18
|
|
+ ble $4, $UnAlign_X_MainLoopEnd
|
|
+ .align 4
|
|
+$UnAlign_X_MainLoop:
|
|
+ fillcs PREFETCHSIZE * SIZE($20)
|
|
+ fillcs PREFETCHSIZE * SIZE($18)
|
|
+
|
|
+/*split the complex vectors into real vectors ($f0) and imaginary vectors ($f1)*/
|
|
+ vextf $f0, 1, $f4
|
|
+ vextf $f0, 3, $f5
|
|
+ vextf $f1, 0, $f6
|
|
+ vextf $f1, 2, $f7
|
|
+
|
|
+ vextf $f2, 1, $f12
|
|
+ vextf $f2, 3, $f13
|
|
+ vextf $f3, 0, $f14
|
|
+ vextf $f3, 2, $f15
|
|
+
|
|
+ vinsf $f4, $f1, 0, $f1
|
|
+ vinsf $f5, $f1, 2, $f1
|
|
+ vinsf $f6, $f0, 1, $f0
|
|
+ vinsf $f7, $f0, 3, $f0
|
|
+
|
|
+ vinsf $f12, $f3, 0, $f3
|
|
+ vinsf $f13, $f3, 2, $f3
|
|
+ vinsf $f14, $f2, 1, $f2
|
|
+ vinsf $f15, $f2, 3, $f2
|
|
+
|
|
+/*Compute*/
|
|
+ VMUL $f29, $f0, $f20
|
|
+ VMUL $f30, $f0, $f21
|
|
+ VMUL $f29, $f2, $f22
|
|
+ VMUL $f30, $f2, $f23
|
|
+
|
|
+
|
|
+ VMAD1 $f30, $f1, $f20, $f16
|
|
+ VMAD2 $f29, $f1, $f21, $f17
|
|
+ VMAD1 $f30, $f3, $f22, $f18
|
|
+ VMAD2 $f29, $f3, $f23, $f19
|
|
+/*
|
|
+ VLD $f0, 0*VEC_LEN*SIZE($18)
|
|
+ VLD $f1, 1*VEC_LEN*SIZE($18)
|
|
+ VLD $f2, 2*VEC_LEN*SIZE($18)
|
|
+ VLD $f3, 3*VEC_LEN*SIZE($18)
|
|
+*/
|
|
+ VLD_UL $f0, 0*VEC_LEN*SIZE($18)
|
|
+ VLD_UH $f4, 1*VEC_LEN*SIZE($18)
|
|
+
|
|
+ VLD_UL $f1, 1*VEC_LEN*SIZE($18)
|
|
+ VLD_UH $f5, 2*VEC_LEN*SIZE($18)
|
|
+
|
|
+ VLD_UL $f2, 2*VEC_LEN*SIZE($18)
|
|
+ VLD_UH $f6, 3*VEC_LEN*SIZE($18)
|
|
+
|
|
+ VLD_UL $f3, 3*VEC_LEN*SIZE($18)
|
|
+ VLD_UH $f7, 4*VEC_LEN*SIZE($18)
|
|
+
|
|
+/*combine the real & imaginary vectors back into complex vectors*/
|
|
+ vextf $f16, 1, $f24
|
|
+ vextf $f16, 3, $f25
|
|
+ vextf $f17, 0, $f26
|
|
+ vextf $f17, 2, $f27
|
|
+
|
|
+ vextf $f18, 1, $f12
|
|
+ vextf $f18, 3, $f13
|
|
+ vextf $f19, 0, $f14
|
|
+ vextf $f19, 2, $f15
|
|
+
|
|
+ vbisw $f0, $f4, $f0
|
|
+ vbisw $f1, $f5, $f1
|
|
+ vbisw $f2, $f6, $f2
|
|
+ vbisw $f3, $f7, $f3
|
|
+
|
|
+ vinsf $f24, $f17, 0, $f17
|
|
+ addl $20, 16*SIZE, $20
|
|
+ vinsf $f25, $f17, 2, $f17
|
|
+ addl $18, 16*SIZE, $18
|
|
+
|
|
+ vinsf $f26, $f16, 1, $f16
|
|
+ subl $4, 1, $4
|
|
+ vinsf $f27, $f16, 3, $f16
|
|
+ nop
|
|
+
|
|
+ vinsf $f12, $f19, 0, $f19
|
|
+ vinsf $f13, $f19, 2, $f19
|
|
+ vinsf $f14, $f18, 1, $f18
|
|
+ vinsf $f15, $f18, 3, $f18
|
|
+
|
|
+ VADD $f16, $f8, $f16
|
|
+ VLD $f8, 0*VEC_LEN*SIZE($20)
|
|
+ VADD $f17, $f28, $f17
|
|
+ VLD $f28, 1*VEC_LEN*SIZE($20)
|
|
+
|
|
+ VADD $f18, $f10, $f18
|
|
+ VLD $f10, 2*VEC_LEN*SIZE($20)
|
|
+ VADD $f19, $f11, $f19
|
|
+ VLD $f11, 3*VEC_LEN*SIZE($20)
|
|
+
|
|
+ VST $f16, -4*VEC_LEN*SIZE($20)
|
|
+ VST $f17, -3*VEC_LEN*SIZE($20)
|
|
+ VST $f18, -2*VEC_LEN*SIZE($20)
|
|
+ VST $f19, -1*VEC_LEN*SIZE($20)
|
|
+
|
|
+ bgt $4, $UnAlign_X_MainLoop
|
|
+ .align 4
|
|
+
|
|
+$UnAlign_X_MainLoopEnd:
|
|
+/*split the complex vectors into real vectors ($f0) and imaginary vectors ($f1)*/
|
|
+ vextf $f0, 1, $f4
|
|
+ vextf $f0, 3, $f5
|
|
+ vextf $f1, 0, $f6
|
|
+ vextf $f1, 2, $f7
|
|
+
|
|
+ vextf $f2, 1, $f12
|
|
+ vextf $f2, 3, $f13
|
|
+ vextf $f3, 0, $f14
|
|
+ vextf $f3, 2, $f15
|
|
+
|
|
+ vinsf $f4, $f1, 0, $f1
|
|
+ vinsf $f5, $f1, 2, $f1
|
|
+ vinsf $f6, $f0, 1, $f0
|
|
+ vinsf $f7, $f0, 3, $f0
|
|
+
|
|
+ vinsf $f12, $f3, 0, $f3
|
|
+ vinsf $f13, $f3, 2, $f3
|
|
+ vinsf $f14, $f2, 1, $f2
|
|
+ vinsf $f15, $f2, 3, $f2
|
|
+
|
|
+ VMUL $f29, $f0, $f20
|
|
+ VMUL $f30, $f0, $f21
|
|
+ VMUL $f29, $f2, $f22
|
|
+ VMUL $f30, $f2, $f23
|
|
+
|
|
+ VMAD1 $f30, $f1, $f20, $f16
|
|
+ VMAD2 $f29, $f1, $f21, $f17
|
|
+ VMAD1 $f30, $f3, $f22, $f18
|
|
+ VMAD2 $f29, $f3, $f23, $f19
|
|
+
|
|
+/*combine the real (f16,f18) & imaginary (f17,f19) vectors back into complex vectors*/
|
|
+ vextf $f16, 1, $f24
|
|
+ vextf $f16, 3, $f25
|
|
+ vextf $f17, 0, $f26
|
|
+ vextf $f17, 2, $f27
|
|
+
|
|
+ vextf $f18, 1, $f12
|
|
+ vextf $f18, 3, $f13
|
|
+ vextf $f19, 0, $f14
|
|
+ vextf $f19, 2, $f15
|
|
+
|
|
+ vinsf $f24, $f17, 0, $f17
|
|
+ vinsf $f25, $f17, 2, $f17
|
|
+ vinsf $f26, $f16, 1, $f16
|
|
+ vinsf $f27, $f16, 3, $f16
|
|
+
|
|
+ vinsf $f12, $f19, 0, $f19
|
|
+ vinsf $f13, $f19, 2, $f19
|
|
+ vinsf $f14, $f18, 1, $f18
|
|
+ vinsf $f15, $f18, 3, $f18
|
|
+
|
|
+ VADD $f16, $f8, $f16
|
|
+ VADD $f17, $f28, $f17
|
|
+ VADD $f18, $f10, $f18
|
|
+ VADD $f19, $f11, $f19
|
|
+
|
|
+ VST $f16, 0*VEC_LEN*SIZE($20)
|
|
+ VST $f17, 1*VEC_LEN*SIZE($20)
|
|
+ VST $f18, 2*VEC_LEN*SIZE($20)
|
|
+ VST $f19, 3*VEC_LEN*SIZE($20)
|
|
+
|
|
+ addl $20, 16*SIZE, $20
|
|
+ ble $5, $End
|
|
+
|
|
+ jmp $Remain
|
|
+ .align 4
|
|
+
|
|
+$UnAlign_XY_ACCESS:
|
|
+/*
|
|
+	Unaligned access to both X and Y
|
|
+*/
|
|
+ VLD_UL $f0, 0*VEC_LEN*SIZE($18)
|
|
+ VLD_UH $f4, 1*VEC_LEN*SIZE($18)
|
|
+
|
|
+ VLD_UL $f1, 1*VEC_LEN*SIZE($18)
|
|
+ VLD_UH $f5, 2*VEC_LEN*SIZE($18)
|
|
+
|
|
+ VLD_UL $f2, 2*VEC_LEN*SIZE($18)
|
|
+ VLD_UH $f6, 3*VEC_LEN*SIZE($18)
|
|
+
|
|
+ VLD_UL $f3, 3*VEC_LEN*SIZE($18)
|
|
+ VLD_UH $f7, 4*VEC_LEN*SIZE($18)
|
|
+
|
|
+ VLD_UL $f8, 0*VEC_LEN*SIZE($20)
|
|
+ VLD_UH $f12, 1*VEC_LEN*SIZE($20)
|
|
+
|
|
+ VLD_UL $f28, 1*VEC_LEN*SIZE($20)
|
|
+ VLD_UH $f13, 2*VEC_LEN*SIZE($20)
|
|
+
|
|
+ VLD_UL $f10, 2*VEC_LEN*SIZE($20)
|
|
+ VLD_UH $f14, 3*VEC_LEN*SIZE($20)
|
|
+
|
|
+ VLD_UL $f11, 3*VEC_LEN*SIZE($20)
|
|
+ VLD_UH $f15, 4*VEC_LEN*SIZE($20)
|
|
+
|
|
+ vbisw $f0, $f4, $f0
|
|
+ vbisw $f1, $f5, $f1
|
|
+ vbisw $f2, $f6, $f2
|
|
+ vbisw $f3, $f7, $f3
|
|
+
|
|
+ vbisw $f8, $f12, $f8
|
|
+ vbisw $f28, $f13, $f28
|
|
+ vbisw $f10, $f14, $f10
|
|
+ vbisw $f11, $f15, $f11
|
|
+
|
|
+ addl $18, 16*SIZE, $18
|
|
+ ble $4, $UnAlign_MainLoopEnd
|
|
+ .align 4
|
|
+
|
|
+$UnAlign_MainLoop:
|
|
+ fillcs PREFETCHSIZE * SIZE($20)
|
|
+ fillcs PREFETCHSIZE * SIZE($18)
|
|
+
|
|
+/*split the complex vectors into real vectors ($f0) and imaginary vectors ($f1)*/
|
|
+ vextf $f0, 1, $f4
|
|
+ vextf $f0, 3, $f5
|
|
+ vextf $f1, 0, $f6
|
|
+ vextf $f1, 2, $f7
|
|
+
|
|
+ vextf $f2, 1, $f12
|
|
+ vextf $f2, 3, $f13
|
|
+ vextf $f3, 0, $f14
|
|
+ vextf $f3, 2, $f15
|
|
+
|
|
+ vinsf $f4, $f1, 0, $f1
|
|
+ vinsf $f5, $f1, 2, $f1
|
|
+ vinsf $f6, $f0, 1, $f0
|
|
+ vinsf $f7, $f0, 3, $f0
|
|
+
|
|
+ vinsf $f12, $f3, 0, $f3
|
|
+ vinsf $f13, $f3, 2, $f3
|
|
+ vinsf $f14, $f2, 1, $f2
|
|
+ vinsf $f15, $f2, 3, $f2
|
|
+
|
|
+/*Compute*/
|
|
+ VMUL $f29, $f0, $f20
|
|
+ VMUL $f30, $f0, $f21
|
|
+ VMUL $f29, $f2, $f22
|
|
+ VMUL $f30, $f2, $f23
|
|
+
|
|
+
|
|
+ VMAD1 $f30, $f1, $f20, $f16
|
|
+ VMAD2 $f29, $f1, $f21, $f17
|
|
+ VMAD1 $f30, $f3, $f22, $f18
|
|
+ VMAD2 $f29, $f3, $f23, $f19
|
|
+/*
|
|
+ VLD $f0, 0*VEC_LEN*SIZE($18)
|
|
+ VLD $f1, 1*VEC_LEN*SIZE($18)
|
|
+ VLD $f2, 2*VEC_LEN*SIZE($18)
|
|
+ VLD $f3, 3*VEC_LEN*SIZE($18)
|
|
+*/
|
|
+ VLD_UL $f0, 0*VEC_LEN*SIZE($18)
|
|
+ VLD_UH $f4, 1*VEC_LEN*SIZE($18)
|
|
+
|
|
+ VLD_UL $f1, 1*VEC_LEN*SIZE($18)
|
|
+ VLD_UH $f5, 2*VEC_LEN*SIZE($18)
|
|
+
|
|
+ VLD_UL $f2, 2*VEC_LEN*SIZE($18)
|
|
+ VLD_UH $f6, 3*VEC_LEN*SIZE($18)
|
|
+
|
|
+ VLD_UL $f3, 3*VEC_LEN*SIZE($18)
|
|
+ VLD_UH $f7, 4*VEC_LEN*SIZE($18)
|
|
+
|
|
+/*combine the real & imaginary vectors back into complex vectors*/
|
|
+ vextf $f16, 1, $f24
|
|
+ vextf $f16, 3, $f25
|
|
+ vextf $f17, 0, $f26
|
|
+ vextf $f17, 2, $f27
|
|
+
|
|
+ vextf $f18, 1, $f12
|
|
+ vextf $f18, 3, $f13
|
|
+ vextf $f19, 0, $f14
|
|
+ vextf $f19, 2, $f15
|
|
+
|
|
+ vbisw $f0, $f4, $f0
|
|
+ vbisw $f1, $f5, $f1
|
|
+ vbisw $f2, $f6, $f2
|
|
+ vbisw $f3, $f7, $f3
|
|
+
|
|
+ vinsf $f24, $f17, 0, $f17
|
|
+ addl $20, 16*SIZE, $20
|
|
+ vinsf $f25, $f17, 2, $f17
|
|
+ addl $18, 16*SIZE, $18
|
|
+
|
|
+ vinsf $f26, $f16, 1, $f16
|
|
+ subl $4, 1, $4
|
|
+ vinsf $f27, $f16, 3, $f16
|
|
+ nop
|
|
+
|
|
+ vinsf $f12, $f19, 0, $f19
|
|
+ vinsf $f13, $f19, 2, $f19
|
|
+ vinsf $f14, $f18, 1, $f18
|
|
+ vinsf $f15, $f18, 3, $f18
|
|
+
|
|
+ VADD $f16, $f8, $f16
|
|
+ VLD_UL $f8, 0*VEC_LEN*SIZE($20)
|
|
+ VLD_UH $f12, 1*VEC_LEN*SIZE($20)
|
|
+
|
|
+ VADD $f17, $f28, $f17
|
|
+ VLD_UL $f28, 1*VEC_LEN*SIZE($20)
|
|
+ VLD_UH $f13, 2*VEC_LEN*SIZE($20)
|
|
+
|
|
+
|
|
+ VADD $f18, $f10, $f18
|
|
+ VLD_UL $f10, 2*VEC_LEN*SIZE($20)
|
|
+ VLD_UH $f14, 3*VEC_LEN*SIZE($20)
|
|
+
|
|
+ VADD $f19, $f11, $f19
|
|
+ VLD_UL $f11, 3*VEC_LEN*SIZE($20)
|
|
+ VLD_UH $f15, 4*VEC_LEN*SIZE($20)
|
|
+
|
|
+/*
|
|
+ VST $f16, -4*VEC_LEN*SIZE($20)
|
|
+ VST $f17, -3*VEC_LEN*SIZE($20)
|
|
+ VST $f18, -2*VEC_LEN*SIZE($20)
|
|
+ VST $f19, -1*VEC_LEN*SIZE($20)
|
|
+*/
|
|
+
|
|
+ vbisw $f8, $f12, $f8
|
|
+ VST_UL $f16, -4*VEC_LEN*SIZE($20)
|
|
+ VST_UH $f16, -3*VEC_LEN*SIZE($20)
|
|
+
|
|
+ vbisw $f28, $f13, $f28
|
|
+ VST_UL $f17, -3*VEC_LEN*SIZE($20)
|
|
+ VST_UH $f17, -2*VEC_LEN*SIZE($20)
|
|
+
|
|
+ vbisw $f10, $f14, $f10
|
|
+ VST_UL $f18, -2*VEC_LEN*SIZE($20)
|
|
+ VST_UH $f18, -1*VEC_LEN*SIZE($20)
|
|
+
|
|
+ vbisw $f11, $f15, $f11
|
|
+ VST_UL $f19, -1*VEC_LEN*SIZE($20)
|
|
+ VST_UH $f19, 0*VEC_LEN*SIZE($20)
|
|
+
|
|
+ bgt $4, $UnAlign_MainLoop
|
|
+ .align 4
|
|
+
|
|
+$UnAlign_MainLoopEnd:
|
|
+
|
|
+/*split the complex vectors into real vectors ($f0) and imaginary vectors ($f1)*/
|
|
+ vextf $f0, 1, $f4
|
|
+ vextf $f0, 3, $f5
|
|
+ vextf $f1, 0, $f6
|
|
+ vextf $f1, 2, $f7
|
|
+
|
|
+ vextf $f2, 1, $f12
|
|
+ vextf $f2, 3, $f13
|
|
+ vextf $f3, 0, $f14
|
|
+ vextf $f3, 2, $f15
|
|
+
|
|
+ vinsf $f4, $f1, 0, $f1
|
|
+ vinsf $f5, $f1, 2, $f1
|
|
+ vinsf $f6, $f0, 1, $f0
|
|
+ vinsf $f7, $f0, 3, $f0
|
|
+
|
|
+ vinsf $f12, $f3, 0, $f3
|
|
+ vinsf $f13, $f3, 2, $f3
|
|
+ vinsf $f14, $f2, 1, $f2
|
|
+ vinsf $f15, $f2, 3, $f2
|
|
+
|
|
+ VMUL $f29, $f0, $f20
|
|
+ VMUL $f30, $f0, $f21
|
|
+ VMUL $f29, $f2, $f22
|
|
+ VMUL $f30, $f2, $f23
|
|
+
|
|
+ VMAD1 $f30, $f1, $f20, $f16
|
|
+ VMAD2 $f29, $f1, $f21, $f17
|
|
+ VMAD1 $f30, $f3, $f22, $f18
|
|
+ VMAD2 $f29, $f3, $f23, $f19
|
|
+
|
|
+/*combine the real (f16,f18) & imaginary (f17,f19) vectors back into complex vectors*/
|
|
+ vextf $f16, 1, $f24
|
|
+ vextf $f16, 3, $f25
|
|
+ vextf $f17, 0, $f26
|
|
+ vextf $f17, 2, $f27
|
|
+
|
|
+ vextf $f18, 1, $f12
|
|
+ vextf $f18, 3, $f13
|
|
+ vextf $f19, 0, $f14
|
|
+ vextf $f19, 2, $f15
|
|
+
|
|
+ vinsf $f24, $f17, 0, $f17
|
|
+ vinsf $f25, $f17, 2, $f17
|
|
+ vinsf $f26, $f16, 1, $f16
|
|
+ vinsf $f27, $f16, 3, $f16
|
|
+
|
|
+ vinsf $f12, $f19, 0, $f19
|
|
+ vinsf $f13, $f19, 2, $f19
|
|
+ vinsf $f14, $f18, 1, $f18
|
|
+ vinsf $f15, $f18, 3, $f18
|
|
+
|
|
+ VADD $f16, $f8, $f16
|
|
+ VADD $f17, $f28, $f17
|
|
+ VADD $f18, $f10, $f18
|
|
+ VADD $f19, $f11, $f19
|
|
+
|
|
+ VST_UL $f16, 0*VEC_LEN*SIZE($20)
|
|
+ VST_UH $f16, 1*VEC_LEN*SIZE($20)
|
|
+ VST_UL $f17, 1*VEC_LEN*SIZE($20)
|
|
+ VST_UH $f17, 2*VEC_LEN*SIZE($20)
|
|
+
|
|
+ VST_UL $f18, 2*VEC_LEN*SIZE($20)
|
|
+ VST_UH $f18, 3*VEC_LEN*SIZE($20)
|
|
+ VST_UL $f19, 3*VEC_LEN*SIZE($20)
|
|
+ VST_UH $f19, 4*VEC_LEN*SIZE($20)
|
|
+
|
|
+ addl $20, 16*SIZE, $20
|
|
+ ble $5, $End
|
|
+
|
|
+ jmp $Remain
|
|
+ .align 4
|
|
+/* unrolled by 4: 4 complex = 8 float/double elements */
|
|
+$Sub:
|
|
+ sra $16, 2, $4
|
|
+ and $16, 3, $5
|
|
+ SXSUBL $16, SIZE, $22
|
|
+ addl $22, $22, $22 # Complex
|
|
+ .align 4
|
|
+
|
|
+ addl $19, $19, $19 # Complex
|
|
+ addl $21, $21, $21 # Complex
|
|
+
|
|
+ ble $4, $SubRemain
|
|
+ LD $f0, 0*SIZE($18)
|
|
+ LD $f1, 1*SIZE($18)
|
|
+ SXADDQ $19, $18, $18
|
|
+
|
|
+ LD $f2, 0*SIZE($18)
|
|
+ LD $f3, 1*SIZE($18)
|
|
+ SXADDQ $19, $18, $18
|
|
+
|
|
+ LD $f4, 0*SIZE($18)
|
|
+ LD $f5, 1*SIZE($18)
|
|
+ SXADDQ $19, $18, $18
|
|
+
|
|
+ LD $f6, 0*SIZE($18)
|
|
+ LD $f7, 1*SIZE($18)
|
|
+ SXADDQ $19, $18, $18
|
|
+
|
|
+ LD $f8, 0*SIZE($20)
|
|
+ LD $f28, 1*SIZE($20)
|
|
+ SXADDQ $21, $20, $24
|
|
+
|
|
+ LD $f10, 0*SIZE($24)
|
|
+ LD $f11, 1*SIZE($24)
|
|
+ SXADDQ $21, $24, $24
|
|
+
|
|
+ LD $f12, 0*SIZE($24)
|
|
+ LD $f13, 1*SIZE($24)
|
|
+ SXADDQ $21, $24, $24
|
|
+
|
|
+ LD $f14, 0*SIZE($24)
|
|
+ LD $f15, 1*SIZE($24)
|
|
+ SXADDQ $21, $24, $24
|
|
+
|
|
+ subl $4, 1, $4
|
|
+ ble $4, $SubMainLoopEnd
|
|
+ .align 4
|
|
+
|
|
+$SubMainLoop:
|
|
+ MUL $f29, $f0, $f20
|
|
+ unop
|
|
+ MUL $f30, $f1, $f21
|
|
+ unop
|
|
+
|
|
+ MUL $f30, $f0, $f22
|
|
+ LD $f0, 0*SIZE($18)
|
|
+ MUL $f29, $f1, $f23
|
|
+ LD $f1, 1*SIZE($18)
|
|
+
|
|
+ MUL $f29, $f2, $f24
|
|
+ SXADDQ $19, $18, $18
|
|
+ MUL $f30, $f3, $f25
|
|
+ unop
|
|
+
|
|
+ MUL $f30, $f2, $f26
|
|
+ LD $f2, 0*SIZE($18)
|
|
+ MUL $f29, $f3, $f27
|
|
+ LD $f3, 1*SIZE($18)
|
|
+
|
|
+ ADD1 $f20, $f21, $f16
|
|
+ SXADDQ $19, $18, $18
|
|
+ MUL $f29, $f4, $f20
|
|
+ unop
|
|
+
|
|
+ ADD2 $f22, $f23, $f17
|
|
+ unop
|
|
+ MUL $f30, $f5, $f21
|
|
+ unop
|
|
+
|
|
+ ADD1 $f24, $f25, $f18
|
|
+ unop
|
|
+ MUL $f30, $f4, $f22
|
|
+ LD $f4, 0*SIZE($18)
|
|
+
|
|
+ ADD2 $f26, $f27, $f19
|
|
+ unop
|
|
+ MUL $f29, $f5, $f23
|
|
+ LD $f5, 1*SIZE($18)
|
|
+
|
|
+ ADD $f16, $f8, $f16
|
|
+ LD $f8, 0*SIZE($24)
|
|
+ MUL $f29, $f6, $f24
|
|
+ SXADDQ $19, $18, $18
|
|
+
|
|
+ ADD $f17, $f28, $f17
|
|
+ LD $f28, 1*SIZE($24)
|
|
+ MUL $f30, $f7, $f25
|
|
+ SXADDQ $21, $24, $24
|
|
+
|
|
+ ADD $f18, $f10, $f18
|
|
+ LD $f10, 0*SIZE($24)
|
|
+ MUL $f30, $f6, $f26
|
|
+ LD $f6, 0*SIZE($18)
|
|
+
|
|
+ ADD $f19, $f11, $f19
|
|
+ LD $f11, 1*SIZE($24)
|
|
+ MUL $f29, $f7, $f27
|
|
+ LD $f7, 1*SIZE($18)
|
|
+
|
|
+ ST $f16, 0*SIZE($20)
|
|
+ SXADDQ $19, $18, $18
|
|
+ ADD1 $f20, $f21, $f16
|
|
+ unop
|
|
+
|
|
+ ST $f17, 1*SIZE($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+ ADD2 $f22, $f23, $f17
|
|
+ unop
|
|
+
|
|
+ ST $f18, 0*SIZE($20)
|
|
+ SXADDQ $21, $24, $24
|
|
+ ADD1 $f24, $f25, $f18
|
|
+ unop
|
|
+
|
|
+ ST $f19, 1*SIZE($20)
|
|
+ unop
|
|
+ ADD2 $f26, $f27, $f19
|
|
+ SXADDQ $21, $20, $20
|
|
+
|
|
+ ADD $f16, $f12, $f16
|
|
+ unop
|
|
+ LD $f12, 0*SIZE($24)
|
|
+ unop
|
|
+
|
|
+ ADD $f17, $f13, $f17
|
|
+ unop
|
|
+ LD $f13, 1*SIZE($24)
|
|
+ SXADDQ $21, $24, $24
|
|
+
|
|
+ ADD $f18, $f14, $f18
|
|
+ subl $4, 1, $4
|
|
+ LD $f14, 0*SIZE($24)
|
|
+ unop
|
|
+
|
|
+ ADD $f19, $f15, $f19
|
|
+ unop
|
|
+ LD $f15, 1*SIZE($24)
|
|
+ SXADDQ $21, $24, $24
|
|
+
|
|
+ ST $f16, 0*SIZE($20)
|
|
+ ST $f17, 1*SIZE($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+ unop
|
|
+
|
|
+ ST $f18, 0*SIZE($20)
|
|
+ ST $f19, 1*SIZE($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+ bgt $4, $SubMainLoop
|
|
+ .align 4
|
|
+
|
|
+$SubMainLoopEnd:
|
|
+ MUL $f29, $f0, $f20
|
|
+ MUL $f30, $f1, $f21
|
|
+ MUL $f30, $f0, $f22
|
|
+ MUL $f29, $f1, $f23
|
|
+
|
|
+ MUL $f29, $f2, $f24
|
|
+ MUL $f30, $f3, $f25
|
|
+ MUL $f30, $f2, $f26
|
|
+ MUL $f29, $f3, $f27
|
|
+
|
|
+ ADD1 $f20, $f21, $f16
|
|
+ MUL $f29, $f4, $f20
|
|
+ ADD2 $f22, $f23, $f17
|
|
+ MUL $f30, $f5, $f21
|
|
+
|
|
+ ADD1 $f24, $f25, $f18
|
|
+ MUL $f30, $f4, $f22
|
|
+ ADD2 $f26, $f27, $f19
|
|
+ MUL $f29, $f5, $f23
|
|
+
|
|
+ ADD $f16, $f8, $f16
|
|
+ MUL $f29, $f6, $f24
|
|
+ ADD $f17, $f28, $f17
|
|
+ MUL $f30, $f7, $f25
|
|
+
|
|
+ ADD $f18, $f10, $f18
|
|
+ MUL $f30, $f6, $f26
|
|
+ ADD $f19, $f11, $f19
|
|
+ MUL $f29, $f7, $f27
|
|
+
|
|
+ ST $f16, 0*SIZE($20)
|
|
+ ADD1 $f20, $f21, $f16
|
|
+ ST $f17, 1*SIZE($20)
|
|
+ ADD2 $f22, $f23, $f17
|
|
+
|
|
+ SXADDQ $21, $20, $20
|
|
+ nop
|
|
+ ST $f18, 0*SIZE($20)
|
|
+ ADD1 $f24, $f25, $f18
|
|
+
|
|
+ ST $f19, 1*SIZE($20)
|
|
+ ADD2 $f26, $f27, $f19
|
|
+ SXADDQ $21, $20, $20
|
|
+ ADD $f16, $f12, $f16
|
|
+
|
|
+ ADD $f17, $f13, $f17
|
|
+ ADD $f18, $f14, $f18
|
|
+ ADD $f19, $f15, $f19
|
|
+
|
|
+ ST $f16, 0*SIZE($20)
|
|
+ ST $f17, 1*SIZE($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+
|
|
+ ST $f18, 0*SIZE($20)
|
|
+ ST $f19, 1*SIZE($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+ ble $5, $SubEnd
|
|
+ .align 4
|
|
+
|
|
+$SubRemain:
|
|
+ subl $5, 1, $6
|
|
+ ble $5, $SubEnd
|
|
+ LD $f0, 0*SIZE($18)
|
|
+ LD $f1, 1*SIZE($18)
|
|
+
|
|
+ LD $f8, 0*SIZE($20)
|
|
+ LD $f28, 1*SIZE($20)
|
|
+ SXADDQ $19, $18, $18
|
|
+ SXADDQ $21, $20, $24
|
|
+ ble $6, $SubRemainLoopEnd
|
|
+ .align 4
|
|
+
|
|
+$SubRemainLoop:
|
|
+ MUL $f29, $f0, $f20
|
|
+ MUL $f30, $f1, $f21
|
|
+ MUL $f30, $f0, $f22
|
|
+ LD $f0, 0*SIZE($18)
|
|
+
|
|
+ MUL $f29, $f1, $f23
|
|
+ LD $f1, 1*SIZE($18)
|
|
+ ADD1 $f20, $f21, $f16
|
|
+ SXADDQ $19, $18, $18
|
|
+
|
|
+ ADD2 $f22, $f23, $f17
|
|
+ nop
|
|
+ ADD $f16, $f8, $f16
|
|
+ LD $f8, 0*SIZE($24)
|
|
+
|
|
+ ADD $f17, $f28, $f17
|
|
+ LD $f28, 1*SIZE($24)
|
|
+ SXADDQ $21, $24, $24
|
|
+ subl $6, 1, $6
|
|
+
|
|
+ ST $f16, 0*SIZE($20)
|
|
+ ST $f17, 1*SIZE($20)
|
|
+ SXADDQ $21, $20, $20
|
|
+ bgt $6, $SubRemainLoop
|
|
+ .align 4
|
|
+
|
|
+$SubRemainLoopEnd:
|
|
+ MUL $f29, $f0, $f20
|
|
+ MUL $f30, $f1, $f21
|
|
+ MUL $f30, $f0, $f22
|
|
+ MUL $f29, $f1, $f23
|
|
+
|
|
+ ADD1 $f20, $f21, $f16
|
|
+ ADD2 $f22, $f23, $f17
|
|
+ ADD $f16, $f8, $f16
|
|
+ ADD $f17, $f28, $f17
|
|
+
|
|
+ ST $f16, 0*SIZE($20)
|
|
+ nop
|
|
+ ST $f17, 1*SIZE($20)
|
|
+ nop
|
|
+ .align 4
|
|
+
|
|
+$SubEnd:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 8($sp)
|
|
+ fldd $f4, 16($sp)
|
|
+ fldd $f5, 24($sp)
|
|
+ fldd $f6, 32($sp)
|
|
+ fldd $f7, 40($sp)
|
|
+ fldd $f8, 48($sp)
|
|
+ ldi $sp, 64($sp)
|
|
+ ret
|
|
+ EPILOGUE
|
|
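
Note: zaxpy_simd.S above handles 8 complex elements per iteration. Each pair of vector registers holding interleaved (re,im) data is de-interleaved with vextf/vinsf into one register of real parts and one of imaginary parts (in a matching lane order), the arithmetic is done on whole vectors with VMUL/VMAD1/VMAD2, and the result is re-interleaved before the VADD with y; the unaligned paths assemble each vector from VLD_UL/VLD_UH half loads merged with vbisw. A plain-C model of the split/compute/recombine step, using scalar arrays in place of the SW6 vector instructions (VEC_LEN value and function name are assumptions for illustration):

    #define VEC_LEN 4   /* lanes per vector register assumed here */

    /* One vectorized step of y += alpha*x on VEC_LEN/2 complex numbers:
     * de-interleave, multiply-add lane-wise, re-interleave (a scalar
     * stand-in for vextf/vinsf + VMUL/VMAD1/VMAD2, non-CONJ signs). */
    static void zaxpy_simd_step(const double x[VEC_LEN], double y[VEC_LEN],
                                double alpha_r, double alpha_i)
    {
        double xr[VEC_LEN / 2], xi[VEC_LEN / 2];

        for (int k = 0; k < VEC_LEN / 2; k++) {   /* split */
            xr[k] = x[2 * k];
            xi[k] = x[2 * k + 1];
        }
        for (int k = 0; k < VEC_LEN / 2; k++) {   /* compute + recombine */
            y[2 * k]     += alpha_r * xr[k] - alpha_i * xi[k];
            y[2 * k + 1] += alpha_i * xr[k] + alpha_r * xi[k];
        }
    }
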
diff --git a/kernel/sw_64/zdot.S b/kernel/sw_64/zdot.S
|
|
new file mode 100644
|
|
index 0000000..114a7a3
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/zdot.S
|
|
@@ -0,0 +1,583 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCHSIZE 88
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+#define Y $19
|
|
+#define INCY $20
|
|
+#define XX $21
|
|
+#define YY $23
|
|
+
|
|
+#define I $5
|
|
+
|
|
+#define s0 $f0
|
|
+#define s1 $f1
|
|
+#define s2 $f2
|
|
+#define s3 $f30
|
|
+#define s4 $f3
|
|
+
|
|
+#define a0 $f10
|
|
+#define a1 $f11
|
|
+#define a2 $f12
|
|
+#define a3 $f13
|
|
+#define a4 $f14
|
|
+#define a5 $f15
|
|
+#define a6 $f16
|
|
+#define a7 $f17
|
|
+
|
|
+#define b0 $f18
|
|
+#define b1 $f19
|
|
+#define b2 $f20
|
|
+#define b3 $f21
|
|
+#define b4 $f22
|
|
+#define b5 $f23
|
|
+#define b6 $f24
|
|
+#define b7 $f25
|
|
+
|
|
+#define t0 $f26
|
|
+#define t1 $f27
|
|
+#define t2 $f28
|
|
+#define t3 $f29
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+ .frame $sp, 24, $26, 0
|
|
+
|
|
+ ldi $sp, -24($sp)
|
|
+ fclr s0
|
|
+ fstd $f2, 0($sp)
|
|
+ fstd $f3, 16($sp)
|
|
+ fclr s1
|
|
+
|
|
+ fclr s2
|
|
+ addl INCX, INCX, INCX
|
|
+ fclr s3
|
|
+ ble N, $L999
|
|
+
|
|
+ addl INCY, INCY, INCY
|
|
+ fclr t0
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+
|
|
+ srl N, 3, I
|
|
+ ble I, $L25
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ LD a1, 1 * SIZE(X)
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ LD b1, 1 * SIZE(Y)
|
|
+
|
|
+ SXADDQ INCX, X, X
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ LD a3, 1 * SIZE(X)
|
|
+ LD b2, 0 * SIZE(Y)
|
|
+ LD b3, 1 * SIZE(Y)
|
|
+
|
|
+ SXADDQ INCX, X, X
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ LD a5, 1 * SIZE(X)
|
|
+ LD b4, 0 * SIZE(Y)
|
|
+ LD b5, 1 * SIZE(Y)
|
|
+
|
|
+ SXADDQ INCX, X, X
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ LD b6, 0 * SIZE(Y)
|
|
+
|
|
+ subl I, 1, I
|
|
+ ble I, $L23
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ ADD s0, t0, s4
|
|
+ fmov s4,s0
|
|
+ LD a7, 1 * SIZE(X)
|
|
+ MUL a0, b0, t0
|
|
+ LD b7, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s4
|
|
+ fmov s4,s1
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ MUL a0, b1, t1
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s2, t2, s4
|
|
+ fmov s4,s2
|
|
+ fillcs PREFETCHSIZE * SIZE(Y)
|
|
+ MUL a1, b0, t2
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s3, t3, s4
|
|
+ fmov s4,s3
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ MUL a1, b1, t3
|
|
+ LD a1, 1 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s4
|
|
+ fmov s4,s0
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ MUL a2, b2, t0
|
|
+ LD b1, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s4
|
|
+ fmov s4,s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a2, b3, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s4
|
|
+ fmov s4,s2
|
|
+ #unop
|
|
+ MUL a3, b2, t2
|
|
+ unop
|
|
+
|
|
+ ADD s3, t3, s4
|
|
+ fmov s4,s3
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ MUL a3, b3, t3
|
|
+ LD a3, 1 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s4
|
|
+ fmov s4,s0
|
|
+ LD b2, 0 * SIZE(Y)
|
|
+ MUL a4, b4, t0
|
|
+ LD b3, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s4
|
|
+ fmov s4,s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a4, b5, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s4
|
|
+ fmov s4,s2
|
|
+ unop
|
|
+ MUL a5, b4, t2
|
|
+ unop
|
|
+
|
|
+ ADD s3, t3, s4
|
|
+ fmov s4,s3
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ MUL a5, b5, t3
|
|
+ LD a5, 1 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s4
|
|
+ fmov s4,s0
|
|
+ LD b4, 0 * SIZE(Y)
|
|
+ MUL a6, b6, t0
|
|
+ LD b5, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s4
|
|
+ fmov s4,s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a6, b7, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s4
|
|
+ fmov s4,s2
|
|
+ unop
|
|
+ MUL a7, b6, t2
|
|
+ unop
|
|
+
|
|
+ ADD s3, t3, s4
|
|
+ fmov s4,s3
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ MUL a7, b7, t3
|
|
+ LD a7, 1 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s4
|
|
+ fmov s4,s0
|
|
+ LD b6, 0 * SIZE(Y)
|
|
+ MUL a0, b0, t0
|
|
+ LD b7, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s4
|
|
+ fmov s4,s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a0, b1, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s4
|
|
+ fmov s4,s2
|
|
+ unop
|
|
+ MUL a1, b0, t2
|
|
+ unop
|
|
+
|
|
+ ADD s3, t3, s4
|
|
+ fmov s4,s3
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ MUL a1, b1, t3
|
|
+ LD a1, 1 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s4
|
|
+ fmov s4,s0
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ MUL a2, b2, t0
|
|
+ LD b1, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s4
|
|
+ fmov s4,s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a2, b3, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s4
|
|
+ fmov s4,s2
|
|
+ unop
|
|
+ MUL a3, b2, t2
|
|
+ unop
|
|
+
|
|
+ ADD s3, t3, s4
|
|
+ fmov s4,s3
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ MUL a3, b3, t3
|
|
+ LD a3, 1 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s4
|
|
+ fmov s4,s0
|
|
+ LD b2, 0 * SIZE(Y)
|
|
+ MUL a4, b4, t0
|
|
+ LD b3, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s4
|
|
+ fmov s4,s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a4, b5, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s4
|
|
+ fmov s4,s2
|
|
+ unop
|
|
+ MUL a5, b4, t2
|
|
+ subl I, 1, I
|
|
+
|
|
+ ADD s3, t3, s4
|
|
+ fmov s4,s3
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ MUL a5, b5, t3
|
|
+ LD a5, 1 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s4
|
|
+ fmov s4,s0
|
|
+ LD b4, 0 * SIZE(Y)
|
|
+ MUL a6, b6, t0
|
|
+ LD b5, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s4
|
|
+ fmov s4,s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a6, b7, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s4
|
|
+ fmov s4,s2
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ MUL a7, b6, t2
|
|
+ unop
|
|
+
|
|
+ ADD s3, t3, s4
|
|
+ fmov s4,s3
|
|
+ LD b6, 0 * SIZE(Y)
|
|
+ MUL a7, b7, t3
|
|
+ bgt I, $L22
|
|
+ .align 4
|
|
+
|
|
+$L23:
|
|
+ ADD s0, t0, s4
|
|
+ fmov s4,s0
|
|
+ LD a7, 1 * SIZE(X)
|
|
+ MUL a0, b0, t0
|
|
+ LD b7, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s4
|
|
+ fmov s4,s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a0, b1, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s4
|
|
+ fmov s4,s2
|
|
+ unop
|
|
+ MUL a1, b0, t2
|
|
+ unop
|
|
+
|
|
+ ADD s3, t3, s4
|
|
+ fmov s4,s3
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ MUL a1, b1, t3
|
|
+ LD a1, 1 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s4
|
|
+ fmov s4,s0
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ MUL a2, b2, t0
|
|
+ LD b1, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s4
|
|
+ fmov s4,s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a2, b3, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s4
|
|
+ fmov s4,s2
|
|
+ unop
|
|
+ MUL a3, b2, t2
|
|
+ unop
|
|
+
|
|
+ ADD s3, t3, s4
|
|
+ fmov s4,s3
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ MUL a3, b3, t3
|
|
+ LD a3, 1 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s4
|
|
+ fmov s4,s0
|
|
+ LD b2, 0 * SIZE(Y)
|
|
+ MUL a4, b4, t0
|
|
+ LD b3, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s4
|
|
+ fmov s4,s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a4, b5, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s4
|
|
+ fmov s4,s2
|
|
+ unop
|
|
+ MUL a5, b4, t2
|
|
+ unop
|
|
+
|
|
+ ADD s3, t3, s4
|
|
+ fmov s4,s3
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ MUL a5, b5, t3
|
|
+ LD a5, 1 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s4
|
|
+ fmov s4,s0
|
|
+ LD b4, 0 * SIZE(Y)
|
|
+ MUL a6, b6, t0
|
|
+ LD b5, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s4
|
|
+ fmov s4,s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a6, b7, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s4
|
|
+ fmov s4,s2
|
|
+ unop
|
|
+ MUL a7, b6, t2
|
|
+ unop
|
|
+
|
|
+ ADD s3, t3, s4
|
|
+ fmov s4,s3
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ MUL a7, b7, t3
|
|
+ LD a7, 1 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s4
|
|
+ fmov s4,s0
|
|
+ LD b6, 0 * SIZE(Y)
|
|
+ MUL a0, b0, t0
|
|
+ LD b7, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s4
|
|
+ fmov s4,s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a0, b1, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s4
|
|
+ fmov s4,s2
|
|
+ MUL a1, b0, t2
|
|
+ ADD s3, t3, s4
|
|
+ fmov s4,s3
|
|
+ MUL a1, b1, t3
|
|
+
|
|
+ ADD s0, t0, s4
|
|
+ fmov s4,s0
|
|
+ MUL a2, b2, t0
|
|
+ ADD s1, t1, s4
|
|
+ fmov s4,s1
|
|
+ MUL a2, b3, t1
|
|
+
|
|
+ ADD s2, t2, s4
|
|
+ fmov s4,s2
|
|
+ MUL a3, b2, t2
|
|
+ ADD s3, t3, s4
|
|
+ fmov s4,s3
|
|
+ MUL a3, b3, t3
|
|
+
|
|
+ ADD s0, t0, s4
|
|
+ fmov s4,s0
|
|
+ MUL a4, b4, t0
|
|
+ ADD s1, t1, s4
|
|
+ fmov s4,s1
|
|
+ MUL a4, b5, t1
|
|
+
|
|
+ ADD s2, t2, s4
|
|
+ fmov s4,s2
|
|
+ MUL a5, b4, t2
|
|
+ ADD s3, t3, s4
|
|
+ fmov s4,s3
|
|
+ MUL a5, b5, t3
|
|
+
|
|
+ ADD s0, t0, s4
|
|
+ fmov s4,s0
|
|
+ MUL a6, b6, t0
|
|
+ ADD s1, t1, s4
|
|
+ fmov s4,s1
|
|
+ MUL a6, b7, t1
|
|
+
|
|
+ ADD s2, t2, s4
|
|
+ fmov s4,s2
|
|
+ MUL a7, b6, t2
|
|
+ ADD s3, t3, s4
|
|
+ fmov s4,s3
|
|
+ MUL a7, b7, t3
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ and N, 7, I
|
|
+ unop
|
|
+ unop
|
|
+ ble I, $L998
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ LD a1, 1 * SIZE(X)
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ LD b1, 1 * SIZE(Y)
|
|
+
|
|
+ SXADDQ INCX, X, X
|
|
+ subl I, 1, I
|
|
+ SXADDQ INCY, Y, Y
|
|
+ ble I, $L28
|
|
+ .align 4
|
|
+
|
|
+$L26:
|
|
+ ADD s0, t0, s4
|
|
+ fmov s4,s0
|
|
+ mov X, XX
|
|
+ MUL a0, b0, t0
|
|
+ mov Y, YY
|
|
+
|
|
+ ADD s1, t1, s4
|
|
+ fmov s4,s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a0, b1, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s4
|
|
+ fmov s4,s2
|
|
+ LD a0, 0 * SIZE(XX)
|
|
+ MUL a1, b0, t2
|
|
+ LD b0, 0 * SIZE(YY)
|
|
+
|
|
+ ADD s3, t3, s4
|
|
+ fmov s4,s3
|
|
+ subl I, 1, I
|
|
+ MUL a1, b1, t3
|
|
+ LD a1, 1 * SIZE(XX)
|
|
+
|
|
+ LD b1, 1 * SIZE(YY)
|
|
+ bgt I, $L26
|
|
+ .align 4
|
|
+
|
|
+$L28:
|
|
+ ADD s0, t0, s4
|
|
+ fmov s4,s0
|
|
+ MUL a0, b0, t0
|
|
+ ADD s1, t1, s4
|
|
+ fmov s4,s1
|
|
+ MUL a0, b1, t1
|
|
+
|
|
+ ADD s2, t2, s4
|
|
+ fmov s4,s2
|
|
+ MUL a1, b0, t2
|
|
+ ADD s3, t3, s4
|
|
+ fmov s4,s3
|
|
+ MUL a1, b1, t3
|
|
+ .align 4
|
|
+
|
|
+$L998:
|
|
+ ADD s0, t0, s4
|
|
+ fmov s4,s0
|
|
+ ADD s1, t1, s4
|
|
+ fmov s4,s1
|
|
+ ADD s2, t2, s4
|
|
+ fmov s4,s2
|
|
+ ADD s3, t3, s4
|
|
+ fmov s4,s3
|
|
+
|
|
+#ifndef CONJ
|
|
+ SUB s0, s3, s4
|
|
+ fmov s4,s0
|
|
+ ADD s1, s2, s4
|
|
+ fmov s4,s1
|
|
+#else
|
|
+ ADD s0, s3, s4
|
|
+ fmov s4,s0
|
|
+ SUB s1, s2, s4
|
|
+ fmov s4,s1
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 16($sp)
|
|
+ ldi $sp, 24($sp)
|
|
+ ret
|
|
+
|
|
+ EPILOGUE
|
|
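
Note: zdot.S accumulates four partial sums per element pair: s0 gathers x_r*y_r, s1 x_r*y_i, s2 x_i*y_r and s3 x_i*y_i, each ADD being staged through the scratch register s4. The block at $L998 then combines them as real = s0 - s3, imag = s1 + s2 for the unconjugated product, or real = s0 + s3, imag = s1 - s2 when CONJ is defined (conj(x) dot y). A C sketch of that accumulation, with illustrative names and strides in complex elements, not the kernel ABI:

    #include <stddef.h>
    #include <complex.h>

    /* Four-accumulator complex dot product mirroring s0..s3/t0..t3 in
     * zdot.S; conj != 0 selects the conjugated (ZDOTC-style) combination. */
    static double complex zdot_sketch(size_t n,
                                      const double *x, ptrdiff_t incx,
                                      const double *y, ptrdiff_t incy,
                                      int conj)
    {
        double s0 = 0, s1 = 0, s2 = 0, s3 = 0;

        for (size_t i = 0; i < n; i++) {
            double xr = x[0], xi = x[1], yr = y[0], yi = y[1];
            s0 += xr * yr;   /* t0 */
            s1 += xr * yi;   /* t1 */
            s2 += xi * yr;   /* t2 */
            s3 += xi * yi;   /* t3 */
            x += 2 * incx;
            y += 2 * incy;
        }
        return conj ? CMPLX(s0 + s3, s1 - s2)    /* conj(x) . y */
                    : CMPLX(s0 - s3, s1 + s2);   /* x . y       */
    }
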
diff --git a/kernel/sw_64/zdot.S.bak b/kernel/sw_64/zdot.S.bak
|
|
new file mode 100644
|
|
index 0000000..d10673c
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/zdot.S.bak
|
|
@@ -0,0 +1,500 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCHSIZE 88
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+#define Y $19
|
|
+#define INCY $20
|
|
+#define XX $21
|
|
+#define YY $23
|
|
+
|
|
+#define I $5
|
|
+
|
|
+#define s0 $f0
|
|
+#define s1 $f1
|
|
+#define s2 $f2
|
|
+#define s3 $f30
|
|
+
|
|
+#define a0 $f10
|
|
+#define a1 $f11
|
|
+#define a2 $f12
|
|
+#define a3 $f13
|
|
+#define a4 $f14
|
|
+#define a5 $f15
|
|
+#define a6 $f16
|
|
+#define a7 $f17
|
|
+
|
|
+#define b0 $f18
|
|
+#define b1 $f19
|
|
+#define b2 $f20
|
|
+#define b3 $f21
|
|
+#define b4 $f22
|
|
+#define b5 $f23
|
|
+#define b6 $f24
|
|
+#define b7 $f25
|
|
+
|
|
+#define t0 $f26
|
|
+#define t1 $f27
|
|
+#define t2 $f28
|
|
+#define t3 $f29
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+ .frame $sp, 16, $26, 0
|
|
+
|
|
+ ldi $sp, -16($sp)
|
|
+ fclr s0
|
|
+ fstd $f2, 0($sp)
|
|
+ fclr s1
|
|
+
|
|
+ fclr s2
|
|
+ addl INCX, INCX, INCX
|
|
+ fclr s3
|
|
+ ble N, $L999
|
|
+
|
|
+ addl INCY, INCY, INCY
|
|
+ fclr t0
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+
|
|
+ srl N, 3, I
|
|
+ ble I, $L25
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ LD a1, 1 * SIZE(X)
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ LD b1, 1 * SIZE(Y)
|
|
+
|
|
+ SXADDQ INCX, X, X
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ LD a3, 1 * SIZE(X)
|
|
+ LD b2, 0 * SIZE(Y)
|
|
+ LD b3, 1 * SIZE(Y)
|
|
+
|
|
+ SXADDQ INCX, X, X
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ LD a5, 1 * SIZE(X)
|
|
+ LD b4, 0 * SIZE(Y)
|
|
+ LD b5, 1 * SIZE(Y)
|
|
+
|
|
+ SXADDQ INCX, X, X
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ LD b6, 0 * SIZE(Y)
|
|
+
|
|
+ subl I, 1, I
|
|
+ ble I, $L23
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ ADD s0, t0, s0
|
|
+ LD a7, 1 * SIZE(X)
|
|
+ MUL a0, b0, t0
|
|
+ LD b7, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ MUL a0, b1, t1
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ fillcs PREFETCHSIZE * SIZE(Y)
|
|
+ MUL a1, b0, t2
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ MUL a1, b1, t3
|
|
+ LD a1, 1 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ MUL a2, b2, t0
|
|
+ LD b1, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a2, b3, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ unop
|
|
+ MUL a3, b2, t2
|
|
+ unop
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ MUL a3, b3, t3
|
|
+ LD a3, 1 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD b2, 0 * SIZE(Y)
|
|
+ MUL a4, b4, t0
|
|
+ LD b3, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a4, b5, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ unop
|
|
+ MUL a5, b4, t2
|
|
+ unop
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ MUL a5, b5, t3
|
|
+ LD a5, 1 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD b4, 0 * SIZE(Y)
|
|
+ MUL a6, b6, t0
|
|
+ LD b5, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a6, b7, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ unop
|
|
+ MUL a7, b6, t2
|
|
+ unop
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ MUL a7, b7, t3
|
|
+ LD a7, 1 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD b6, 0 * SIZE(Y)
|
|
+ MUL a0, b0, t0
|
|
+ LD b7, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a0, b1, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ unop
|
|
+ MUL a1, b0, t2
|
|
+ unop
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ MUL a1, b1, t3
|
|
+ LD a1, 1 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ MUL a2, b2, t0
|
|
+ LD b1, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a2, b3, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ unop
|
|
+ MUL a3, b2, t2
|
|
+ unop
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ MUL a3, b3, t3
|
|
+ LD a3, 1 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD b2, 0 * SIZE(Y)
|
|
+ MUL a4, b4, t0
|
|
+ LD b3, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a4, b5, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ unop
|
|
+ MUL a5, b4, t2
|
|
+ subl I, 1, I
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ MUL a5, b5, t3
|
|
+ LD a5, 1 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD b4, 0 * SIZE(Y)
|
|
+ MUL a6, b6, t0
|
|
+ LD b5, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a6, b7, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ MUL a7, b6, t2
|
|
+ unop
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD b6, 0 * SIZE(Y)
|
|
+ MUL a7, b7, t3
|
|
+ bgt I, $L22
|
|
+ .align 4
|
|
+
|
|
+$L23:
|
|
+ ADD s0, t0, s0
|
|
+ LD a7, 1 * SIZE(X)
|
|
+ MUL a0, b0, t0
|
|
+ LD b7, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a0, b1, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ unop
|
|
+ MUL a1, b0, t2
|
|
+ unop
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ MUL a1, b1, t3
|
|
+ LD a1, 1 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ MUL a2, b2, t0
|
|
+ LD b1, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a2, b3, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ unop
|
|
+ MUL a3, b2, t2
|
|
+ unop
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ MUL a3, b3, t3
|
|
+ LD a3, 1 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD b2, 0 * SIZE(Y)
|
|
+ MUL a4, b4, t0
|
|
+ LD b3, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a4, b5, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ unop
|
|
+ MUL a5, b4, t2
|
|
+ unop
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ MUL a5, b5, t3
|
|
+ LD a5, 1 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD b4, 0 * SIZE(Y)
|
|
+ MUL a6, b6, t0
|
|
+ LD b5, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a6, b7, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ unop
|
|
+ MUL a7, b6, t2
|
|
+ unop
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ MUL a7, b7, t3
|
|
+ LD a7, 1 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD b6, 0 * SIZE(Y)
|
|
+ MUL a0, b0, t0
|
|
+ LD b7, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a0, b1, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ MUL a1, b0, t2
|
|
+ ADD s3, t3, s3
|
|
+ MUL a1, b1, t3
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ MUL a2, b2, t0
|
|
+ ADD s1, t1, s1
|
|
+ MUL a2, b3, t1
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ MUL a3, b2, t2
|
|
+ ADD s3, t3, s3
|
|
+ MUL a3, b3, t3
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ MUL a4, b4, t0
|
|
+ ADD s1, t1, s1
|
|
+ MUL a4, b5, t1
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ MUL a5, b4, t2
|
|
+ ADD s3, t3, s3
|
|
+ MUL a5, b5, t3
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ MUL a6, b6, t0
|
|
+ ADD s1, t1, s1
|
|
+ MUL a6, b7, t1
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ MUL a7, b6, t2
|
|
+ ADD s3, t3, s3
|
|
+ MUL a7, b7, t3
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ and N, 7, I
|
|
+ unop
|
|
+ unop
|
|
+ ble I, $L998
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ LD a1, 1 * SIZE(X)
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ LD b1, 1 * SIZE(Y)
|
|
+
|
|
+ SXADDQ INCX, X, X
|
|
+ subl I, 1, I
|
|
+ SXADDQ INCY, Y, Y
|
|
+ ble I, $L28
|
|
+ .align 4
|
|
+
|
|
+$L26:
|
|
+ ADD s0, t0, s0
|
|
+ mov X, XX
|
|
+ MUL a0, b0, t0
|
|
+ mov Y, YY
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a0, b1, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ LD a0, 0 * SIZE(XX)
|
|
+ MUL a1, b0, t2
|
|
+ LD b0, 0 * SIZE(YY)
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ subl I, 1, I
|
|
+ MUL a1, b1, t3
|
|
+ LD a1, 1 * SIZE(XX)
|
|
+
|
|
+ LD b1, 1 * SIZE(YY)
|
|
+ bgt I, $L26
|
|
+ .align 4
|
|
+
|
|
+$L28:
|
|
+ ADD s0, t0, s0
|
|
+ MUL a0, b0, t0
|
|
+ ADD s1, t1, s1
|
|
+ MUL a0, b1, t1
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ MUL a1, b0, t2
|
|
+ ADD s3, t3, s3
|
|
+ MUL a1, b1, t3
|
|
+ .align 4
|
|
+
|
|
+$L998:
|
|
+ ADD s0, t0, s0
|
|
+ ADD s1, t1, s1
|
|
+ ADD s2, t2, s2
|
|
+ ADD s3, t3, s3
|
|
+
|
|
+#ifndef CONJ
|
|
+ SUB s0, s3, s0
|
|
+ ADD s1, s2, s1
|
|
+#else
|
|
+ ADD s0, s3, s0
|
|
+ SUB s1, s2, s1
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ fldd $f2, 0($sp)
|
|
+ ldi $sp, 16($sp)
|
|
+ ret
|
|
+
|
|
+ EPILOGUE
|
|
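
Note: this .bak copy implements the same scalar ZDOT algorithm as zdot.S above; the main difference is that it accumulates directly into s0..s3 instead of staging every ADD through the scratch register s4. Both share the same loop shape: srl N, 3, I gives the number of unrolled iterations (8 complex elements each, with loads hoisted one iteration ahead), and the low three bits of N are handled by the remainder loop at $L26. A rough C skeleton of that unroll-plus-remainder structure, assuming unit stride for brevity (all names illustrative):

    #include <stddef.h>

    /* acc() stands in for the four multiply-accumulates per complex
     * element; zdot_loop_shape() shows only the trip-count handling. */
    static void acc(double s[4], const double *x, const double *y)
    {
        s[0] += x[0] * y[0];
        s[1] += x[0] * y[1];
        s[2] += x[1] * y[0];
        s[3] += x[1] * y[1];
    }

    static void zdot_loop_shape(size_t n, const double *x, const double *y,
                                double s[4])
    {
        for (size_t i = n >> 3; i > 0; i--) {      /* srl N, 3, I */
            for (int k = 0; k < 8; k++)            /* unrolled body */
                acc(s, x + 2 * k, y + 2 * k);
            x += 16;
            y += 16;
        }
        for (size_t i = n & 7; i > 0; i--) {       /* and N, 7, I ($L26) */
            acc(s, x, y);
            x += 2;
            y += 2;
        }
    }
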
diff --git a/kernel/sw_64/zdot_simd.S b/kernel/sw_64/zdot_simd.S
|
|
new file mode 100644
|
|
index 0000000..ed775e6
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/zdot_simd.S
|
|
@@ -0,0 +1,699 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCHSIZE 80
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+#define Y $19
|
|
+#define INCY $20
|
|
+#define XX $21
|
|
+#define YY $23
|
|
+
|
|
+#define I $5
|
|
+
|
|
+#define s0 $f0
|
|
+#define s1 $f1
|
|
+#define s2 $f2
|
|
+#define s3 $f30
|
|
+
|
|
+#define a0 $f10
|
|
+#define a1 $f11
|
|
+#define a2 $f12
|
|
+#define a3 $f13
|
|
+#define a4 $f14
|
|
+#define a5 $f15
|
|
+#define a6 $f16
|
|
+#define a7 $f17
|
|
+
|
|
+#define b0 $f18
|
|
+#define b1 $f19
|
|
+#define b2 $f20
|
|
+#define b3 $f21
|
|
+#define b4 $f22
|
|
+#define b5 $f23
|
|
+#define b6 $f24
|
|
+#define b7 $f25
|
|
+
|
|
+#define t0 $f26
|
|
+#define t1 $f27
|
|
+#define t2 $f28
|
|
+#define t3 $f29
|
|
+
|
|
+#define t4 $f3
|
|
+#define t5 $f4
|
|
+#define t6 $f5
|
|
+#define t7 $f6
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+ .frame $sp, 40, $26, 0
|
|
+
|
|
+ ldi $sp, -40($sp)
|
|
+ fclr s0
|
|
+ fstd $f2, 0($sp)
|
|
+ fclr s1
|
|
+
|
|
+ fstd $f3, 8($sp)
|
|
+ fstd $f4, 16($sp)
|
|
+ fstd $f5, 24($sp)
|
|
+ fstd $f6, 32($sp)
|
|
+
|
|
+ fclr s2
|
|
+ addl INCX, INCX, INCX
|
|
+ fclr s3
|
|
+ ble N, $L999
|
|
+
|
|
+ addl INCY, INCY, INCY
|
|
+ fclr t0
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+
|
|
+ cmpeq INCX, 2, $21
|
|
+ cmpeq INCY, 2, $22
|
|
+ and $21, $22, $22
|
|
+ beq $22, $Sub
|
|
+
|
|
+/*
|
|
+ test whether the addresses of X and Y are both vector-aligned
|
|
+*/
|
|
+ and Y, (VEC_LEN*SIZE-1), $4
|
|
+ and X, (VEC_LEN*SIZE-1), $3
|
|
+ or $3, $4, $4
|
|
+ bne $4, $UnAlign_ACCESS
|
|
+
|
|
+/*Aligned access path*/
|
|
+/*Unrolled by 8*/
|
|
+ srl N, 3, I
|
|
+ ble I, $Remain
|
|
+ .align 4
|
|
+ vcpys $f31, $f31, s0 #clear s0 vector
|
|
+ vcpys $f31, $f31, s1 #clear s1 vector
|
|
+ vcpys $f31, $f31, s2 #clear s2 vector
|
|
+ vcpys $f31, $f31, s3 #clear s3 vector
|
|
+
|
|
+ vcpys $f31, $f31, t0
|
|
+ vcpys $f31, $f31, t1
|
|
+ vcpys $f31, $f31, t2
|
|
+ vcpys $f31, $f31, t3
|
|
+
|
|
+$MainLoop:
|
|
+ VLD a0, 0*VEC_LEN*SIZE(X)
|
|
+ VLD a1, 1*VEC_LEN*SIZE(X)
|
|
+ VLD a2, 2*VEC_LEN*SIZE(X)
|
|
+ VLD a3, 3*VEC_LEN*SIZE(X)
|
|
+
|
|
+ VLD b0, 0*VEC_LEN*SIZE(Y)
|
|
+ VADD s0, t0, s0
|
|
+ VLD b1, 1*VEC_LEN*SIZE(Y)
|
|
+ VADD s1, t1, s1
|
|
+
|
|
+ VLD b2, 2*VEC_LEN*SIZE(Y)
|
|
+ VADD s2, t2, s2
|
|
+ VLD b3, 3*VEC_LEN*SIZE(Y)
|
|
+ VADD s3, t3, s3
|
|
+
|
|
+/*split the X complex vector into a real vector (a0, a2) and an imaginary vector (a1, a3),
|
|
+ and the Y complex vector into a real vector (b0, b2) and an imaginary vector (b1, b3)
|
|
+*/
|
|
+ vextf a0, 1, a4
|
|
+ vextf a0, 3, a5
|
|
+ vextf a1, 0, a6
|
|
+ vextf a1, 2, a7
|
|
+
|
|
+ vextf a2, 1, t0
|
|
+ vextf a2, 3, t1
|
|
+ vextf a3, 0, t2
|
|
+ vextf a3, 2, t3
|
|
+
|
|
+ vextf b0, 1, b4
|
|
+ vextf b0, 3, b5
|
|
+ vextf b1, 0, b6
|
|
+ vextf b1, 2, b7
|
|
+
|
|
+ vextf b2, 1, t4
|
|
+ vextf b2, 3, t5
|
|
+ vextf b3, 0, t6
|
|
+ vextf b3, 2, t7
|
|
+
|
|
+ vinsf a4, a1, 0, a1
|
|
+ vinsf a6, a0, 1, a0
|
|
+ vinsf t0, a3, 0, a3
|
|
+ vinsf t2, a2, 1, a2
|
|
+
|
|
+ vinsf b4, b1, 0, b1
|
|
+ addl X, 16 * SIZE, X
|
|
+ vinsf b6, b0, 1, b0
|
|
+ addl Y, 16 * SIZE, Y
|
|
+
|
|
+ vinsf t4, b3, 0, b3
|
|
+ subl I, 1, I
|
|
+ vinsf t6, b2, 1, b2
|
|
+ nop
|
|
+
|
|
+ vinsf a5, a1, 2, a1
|
|
+ vinsf a7, a0, 3, a0
|
|
+ vinsf t1, a3, 2, a3
|
|
+ vinsf t3, a2, 3, a2
|
|
+
|
|
+ vinsf b5, b1, 2, b1
|
|
+ vinsf b7, b0, 3, b0
|
|
+ vinsf t5, b3, 2, b3
|
|
+ vinsf t7, b2, 3, b2
|
|
+
|
|
+ /*Computing*/
|
|
+
|
|
+
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ VMAD a0, b0, s0, s0
|
|
+ fillcs PREFETCHSIZE * SIZE(Y)
|
|
+ VMAD a0, b1, s1, s1
|
|
+
|
|
+ VMAD a1, b0, s2, s2
|
|
+ VMAD a1, b1, s3, s3
|
|
+ VMUL a2, b2, t0 /*Just multiply; accumulated on the next iteration.*/
|
|
+ VMUL a2, b3, t1
|
|
+
|
|
+ VMUL a3, b2, t2
|
|
+ VMUL a3, b3, t3
|
|
+ nop
|
|
+ bgt I, $MainLoop
|
|
+ .align 4
|
|
+$MainLoopEnd:
|
|
+ VADD s0, t0, s0
|
|
+ VADD s1, t1, s1
|
|
+ VADD s2, t2, s2
|
|
+ VADD s3, t3, s3
|
|
+
|
|
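+/* Per lane, s0..s3 now hold sum(xr*yr), sum(xr*yi), sum(xi*yr) and sum(xi*yi).
+   The block below combines them per lane into real (s0) and imaginary (s1)
+   parts according to CONJ, and the vextf/ADD sequence that follows reduces
+   the four lanes to scalars. */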
+#ifndef CONJ
|
|
+ VSUB s0, s3, s0
|
|
+ VADD s1, s2, s1
|
|
+#else
|
|
+ VADD s0, s3, s0
|
|
+ VSUB s1, s2, s1
|
|
+#endif
|
|
+ vcpys $f31, $f31, s2 #clear s2 vector
|
|
+ vcpys $f31, $f31, s3 #clear s3 vector
|
|
+
|
|
+ vextf s0, 1, t1
|
|
+ vextf s0, 2, t2
|
|
+ vextf s0, 3, t3
|
|
+ vextf s1, 1, t5
|
|
+
|
|
+ vextf s1, 2, t6
|
|
+ vextf s1, 3, t7
|
|
+ ADD s0, t1, s0
|
|
+ ADD t2, t3, t0
|
|
+
|
|
+ ADD s1, t5, s1
|
|
+ ADD t6, t7, t4
|
|
+ ADD s0, t0, s0
|
|
+ ADD s1, t4, s1
|
|
+$Remain:
|
|
+ and N, 7, I
|
|
+ ble I, $End
|
|
+ .align 4
|
|
+$RemainLoop:
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ LD a1, 1 * SIZE(X)
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ LD b1, 1 * SIZE(Y)
|
|
+
|
|
+ SXADDQ INCX, X, X
|
|
+ subl I, 1, I
|
|
+ SXADDQ INCY, Y, Y
|
|
+ MAD a0, b0, s0, s0
|
|
+
|
|
+ MAD a0, b1, s1, s1
|
|
+ MAD a1, b0, s2, s2
|
|
+ MAD a1, b1, s3, s3
|
|
+ bgt I, $RemainLoop
|
|
+ .align 4
|
|
+
|
|
+#ifndef CONJ
|
|
+ SUB s0, s3, s0
|
|
+ ADD s1, s2, s1
|
|
+#else
|
|
+ ADD s0, s3, s0
|
|
+ SUB s1, s2, s1
|
|
+#endif
|
|
+
|
|
+$End:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 8($sp)
|
|
+ fldd $f4, 16($sp)
|
|
+ fldd $f5, 24($sp)
|
|
+
|
|
+ fldd $f6, 32($sp)
|
|
+ ldi $sp, 40($sp)
|
|
+ ret
|
|
+
|
|
+ .align 4
|
|
+
|
|
+$UnAlign_ACCESS:
|
|
+$Sub:
|
|
+ srl N, 3, I
|
|
+ ble I, $L25
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ LD a1, 1 * SIZE(X)
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ LD b1, 1 * SIZE(Y)
|
|
+
|
|
+ SXADDQ INCX, X, X
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ LD a3, 1 * SIZE(X)
|
|
+ LD b2, 0 * SIZE(Y)
|
|
+ LD b3, 1 * SIZE(Y)
|
|
+
|
|
+ SXADDQ INCX, X, X
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ LD a5, 1 * SIZE(X)
|
|
+ LD b4, 0 * SIZE(Y)
|
|
+ LD b5, 1 * SIZE(Y)
|
|
+
|
|
+ SXADDQ INCX, X, X
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ LD b6, 0 * SIZE(Y)
|
|
+
|
|
+ subl I, 1, I
|
|
+ ble I, $L23
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ ADD s0, t0, s0
|
|
+ LD a7, 1 * SIZE(X)
|
|
+ MUL a0, b0, t0
|
|
+ LD b7, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ MUL a0, b1, t1
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ fillcs PREFETCHSIZE * SIZE(Y)
|
|
+ MUL a1, b0, t2
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ MUL a1, b1, t3
|
|
+ LD a1, 1 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ MUL a2, b2, t0
|
|
+ LD b1, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a2, b3, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ unop
|
|
+ MUL a3, b2, t2
|
|
+ unop
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ MUL a3, b3, t3
|
|
+ LD a3, 1 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD b2, 0 * SIZE(Y)
|
|
+ MUL a4, b4, t0
|
|
+ LD b3, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a4, b5, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ unop
|
|
+ MUL a5, b4, t2
|
|
+ unop
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ MUL a5, b5, t3
|
|
+ LD a5, 1 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD b4, 0 * SIZE(Y)
|
|
+ MUL a6, b6, t0
|
|
+ LD b5, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a6, b7, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ unop
|
|
+ MUL a7, b6, t2
|
|
+ unop
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ MUL a7, b7, t3
|
|
+ LD a7, 1 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD b6, 0 * SIZE(Y)
|
|
+ MUL a0, b0, t0
|
|
+ LD b7, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a0, b1, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ unop
|
|
+ MUL a1, b0, t2
|
|
+ unop
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ MUL a1, b1, t3
|
|
+ LD a1, 1 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ MUL a2, b2, t0
|
|
+ LD b1, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a2, b3, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ unop
|
|
+ MUL a3, b2, t2
|
|
+ unop
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ MUL a3, b3, t3
|
|
+ LD a3, 1 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD b2, 0 * SIZE(Y)
|
|
+ MUL a4, b4, t0
|
|
+ LD b3, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a4, b5, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ unop
|
|
+ MUL a5, b4, t2
|
|
+ subl I, 1, I
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ MUL a5, b5, t3
|
|
+ LD a5, 1 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD b4, 0 * SIZE(Y)
|
|
+ MUL a6, b6, t0
|
|
+ LD b5, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a6, b7, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ MUL a7, b6, t2
|
|
+ unop
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD b6, 0 * SIZE(Y)
|
|
+ MUL a7, b7, t3
|
|
+ bgt I, $L22
|
|
+ .align 4
|
|
+
|
|
+$L23:
|
|
+ ADD s0, t0, s0
|
|
+ LD a7, 1 * SIZE(X)
|
|
+ MUL a0, b0, t0
|
|
+ LD b7, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a0, b1, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ unop
|
|
+ MUL a1, b0, t2
|
|
+ unop
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ MUL a1, b1, t3
|
|
+ LD a1, 1 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ MUL a2, b2, t0
|
|
+ LD b1, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a2, b3, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ unop
|
|
+ MUL a3, b2, t2
|
|
+ unop
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ MUL a3, b3, t3
|
|
+ LD a3, 1 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD b2, 0 * SIZE(Y)
|
|
+ MUL a4, b4, t0
|
|
+ LD b3, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a4, b5, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ unop
|
|
+ MUL a5, b4, t2
|
|
+ unop
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ MUL a5, b5, t3
|
|
+ LD a5, 1 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD b4, 0 * SIZE(Y)
|
|
+ MUL a6, b6, t0
|
|
+ LD b5, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a6, b7, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ unop
|
|
+ MUL a7, b6, t2
|
|
+ unop
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ MUL a7, b7, t3
|
|
+ LD a7, 1 * SIZE(X)
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ LD b6, 0 * SIZE(Y)
|
|
+ MUL a0, b0, t0
|
|
+ LD b7, 1 * SIZE(Y)
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a0, b1, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ MUL a1, b0, t2
|
|
+ ADD s3, t3, s3
|
|
+ MUL a1, b1, t3
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ MUL a2, b2, t0
|
|
+ ADD s1, t1, s1
|
|
+ MUL a2, b3, t1
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ MUL a3, b2, t2
|
|
+ ADD s3, t3, s3
|
|
+ MUL a3, b3, t3
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ MUL a4, b4, t0
|
|
+ ADD s1, t1, s1
|
|
+ MUL a4, b5, t1
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ MUL a5, b4, t2
|
|
+ ADD s3, t3, s3
|
|
+ MUL a5, b5, t3
|
|
+
|
|
+ ADD s0, t0, s0
|
|
+ MUL a6, b6, t0
|
|
+ ADD s1, t1, s1
|
|
+ MUL a6, b7, t1
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ MUL a7, b6, t2
|
|
+ ADD s3, t3, s3
|
|
+ MUL a7, b7, t3
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ and N, 7, I
|
|
+ unop
|
|
+ unop
|
|
+ ble I, $L998
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ LD a1, 1 * SIZE(X)
|
|
+ LD b0, 0 * SIZE(Y)
|
|
+ LD b1, 1 * SIZE(Y)
|
|
+
|
|
+ SXADDQ INCX, X, X
|
|
+ subl I, 1, I
|
|
+ SXADDQ INCY, Y, Y
|
|
+ ble I, $L28
|
|
+ .align 4
|
|
+
|
|
+$L26:
|
|
+ ADD s0, t0, s0
|
|
+ mov X, XX
|
|
+ MUL a0, b0, t0
|
|
+ mov Y, YY
|
|
+
|
|
+ ADD s1, t1, s1
|
|
+ SXADDQ INCX, X, X
|
|
+ MUL a0, b1, t1
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ LD a0, 0 * SIZE(XX)
|
|
+ MUL a1, b0, t2
|
|
+ LD b0, 0 * SIZE(YY)
|
|
+
|
|
+ ADD s3, t3, s3
|
|
+ subl I, 1, I
|
|
+ MUL a1, b1, t3
|
|
+ LD a1, 1 * SIZE(XX)
|
|
+
|
|
+ LD b1, 1 * SIZE(YY)
|
|
+ bgt I, $L26
|
|
+ .align 4
|
|
+
|
|
+$L28:
|
|
+ ADD s0, t0, s0
|
|
+ MUL a0, b0, t0
|
|
+ ADD s1, t1, s1
|
|
+ MUL a0, b1, t1
|
|
+
|
|
+ ADD s2, t2, s2
|
|
+ MUL a1, b0, t2
|
|
+ ADD s3, t3, s3
|
|
+ MUL a1, b1, t3
|
|
+ .align 4
|
|
+
|
|
+$L998:
|
|
+ ADD s0, t0, s0
|
|
+ ADD s1, t1, s1
|
|
+ ADD s2, t2, s2
|
|
+ ADD s3, t3, s3
|
|
+
|
|
+#ifndef CONJ
|
|
+ SUB s0, s3, s0
|
|
+ ADD s1, s2, s1
|
|
+#else
|
|
+ ADD s0, s3, s0
|
|
+ SUB s1, s2, s1
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 8($sp)
|
|
+ fldd $f4, 16($sp)
|
|
+ fldd $f5, 24($sp)
|
|
+
|
|
+ fldd $f6, 32($sp)
|
|
+ ldi $sp, 40($sp)
|
|
+ ret
|
|
+
|
|
+ EPILOGUE
|
|
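A rough C picture of the aligned, unit-stride fast path in zdot_simd.S: eight complex elements per iteration feeding four partial-product accumulators, with a scalar tail for the remainder. The lane assignment and all names below are illustrative only.

/* Sketch of the blocking used by zdot_simd: four accumulator "vectors"
   (modeled as 4-wide arrays), scalar tail for n % 8 ($RemainLoop). */
static void zdot_simd_shape(long n, const double *x, const double *y,
                            double s0[4], double s1[4],
                            double s2[4], double s3[4])
{
    long main = n - (n % 8), i;
    for (i = 0; i < main; i++) {
        int lane = (int)(i % 4);       /* illustrative; any partition sums the same */
        double xr = x[2 * i], xi = x[2 * i + 1];
        double yr = y[2 * i], yi = y[2 * i + 1];
        s0[lane] += xr * yr;           /* VMAD a0, b0, s0 */
        s1[lane] += xr * yi;           /* VMAD a0, b1, s1 */
        s2[lane] += xi * yr;           /* VMAD a1, b0, s2 */
        s3[lane] += xi * yi;           /* VMUL/VADD pair  */
    }
    for (; i < n; i++) {               /* scalar remainder, as in $RemainLoop */
        double xr = x[2 * i], xi = x[2 * i + 1];
        double yr = y[2 * i], yi = y[2 * i + 1];
        s0[0] += xr * yr;  s1[0] += xr * yi;
        s2[0] += xi * yr;  s3[0] += xi * yi;
    }
}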
diff --git a/kernel/sw_64/zgemm_beta.S b/kernel/sw_64/zgemm_beta.S
|
|
new file mode 100644
|
|
index 0000000..18f845c
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/zgemm_beta.S
|
|
@@ -0,0 +1,192 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+ .set noat
|
|
+ .set noreorder
|
|
+.text
|
|
+ .align 5
|
|
+ .globl CNAME
|
|
+ .ent CNAME
|
|
+CNAME:
|
|
+ .frame $sp, 0, $26, 0
|
|
+
|
|
+#ifdef PROFILE
|
|
+ ldgp $gp, 0($27)
|
|
+ ldi $28, _mcount
|
|
+ jsr $28, ($28), _mcount
|
|
+ .prologue 1
|
|
+#else
|
|
+ .prologue 0
|
|
+#endif
|
|
+
|
|
+ ldl $18, 24($sp)
|
|
+ ble $16, $End
|
|
+ ldl $19, 32($sp)
|
|
+ ble $17, $End
|
|
+
|
|
+ addl $19, $19, $19
|
|
+ fbne $f19,$Main
|
|
+ fbne $f20,$Main
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ mov $18, $1
|
|
+ ldi $17, -1($17)
|
|
+ SXADDQ $19, $18, $18
|
|
+ mov $16, $2
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ ST $f31, 0*SIZE($1)
|
|
+ ST $f31, 1*SIZE($1)
|
|
+ ldi $2, -1($2)
|
|
+ ldi $1, 2*SIZE($1)
|
|
+ bgt $2, $L12
|
|
+ bgt $17,$L13
|
|
+ clr $0
|
|
+ ret
|
|
+ .align 4
|
|
+
|
|
+/* Main Routine */
|
|
+$Main:
|
|
+ sra $16, 1, $2 # $2 = (m >> 1)
|
|
+ mov $18, $1 # c_offset = c
|
|
+ ldi $17, -1($17) # n --
|
|
+ SXADDQ $19, $18, $18 # c += ldc
|
|
+ beq $2, $L18
|
|
+
|
|
+ LD $f14, 0*SIZE($1)
|
|
+ LD $f15, 1*SIZE($1)
|
|
+ LD $f24, 2*SIZE($1)
|
|
+ LD $f25, 3*SIZE($1)
|
|
+ ldi $2, -1($2) # $2 --
|
|
+ ble $2, $L19
|
|
+ .align 4
|
|
+
|
|
+
|
|
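+/* Each pass of $L23 rescales two complex elements of C by the scalar in
+   $f19/$f20: new_re = beta_r*c_re - beta_i*c_im,
+              new_im = beta_r*c_im + beta_i*c_re. */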
+$L23:
|
|
+ MUL $f19, $f14, $f10
|
|
+ fillcs 9*SIZE($1)
|
|
+ MUL $f20, $f15, $f11
|
|
+ ldi $2, -1($2)
|
|
+
|
|
+ MUL $f19, $f15, $f12
|
|
+ LD $f15, 5*SIZE($1)
|
|
+ MUL $f20, $f14, $f13
|
|
+ LD $f14, 4*SIZE($1)
|
|
+
|
|
+ MUL $f19, $f24, $f16
|
|
+ unop
|
|
+ MUL $f20, $f25, $f17
|
|
+ unop
|
|
+
|
|
+ MUL $f19, $f25, $f18
|
|
+ LD $f25, 7*SIZE($1)
|
|
+ SUB $f10, $f11, $f22
|
|
+ unop
|
|
+
|
|
+ MUL $f20, $f24, $f21
|
|
+ LD $f24, 6*SIZE($1)
|
|
+ ADD $f12, $f13, $f23
|
|
+ ldi $1, 4*SIZE($1)
|
|
+
|
|
+ SUB $f16, $f17, $f26
|
|
+ ADD $f18, $f21, $f27
|
|
+ ST $f22,-4*SIZE($1)
|
|
+ ST $f23,-3*SIZE($1)
|
|
+
|
|
+ ST $f26,-2*SIZE($1)
|
|
+ ST $f27,-1*SIZE($1)
|
|
+ unop
|
|
+ bgt $2,$L23
|
|
+ .align 4
|
|
+
|
|
+$L19:
|
|
+ MUL $f19, $f14, $f10
|
|
+ MUL $f20, $f15, $f11
|
|
+ MUL $f19, $f15, $f12
|
|
+ MUL $f20, $f14, $f13
|
|
+
|
|
+ MUL $f19, $f24, $f16
|
|
+ MUL $f20, $f25, $f17
|
|
+ MUL $f19, $f25, $f18
|
|
+ MUL $f20, $f24, $f21
|
|
+
|
|
+ SUB $f10, $f11, $f22
|
|
+ ADD $f12, $f13, $f23
|
|
+ SUB $f16, $f17, $f26
|
|
+ ADD $f18, $f21, $f27
|
|
+ ldi $1, 4*SIZE($1)
|
|
+
|
|
+ ST $f22, -4*SIZE($1)
|
|
+ ST $f23, -3*SIZE($1)
|
|
+ ST $f26, -2*SIZE($1)
|
|
+ ST $f27, -1*SIZE($1)
|
|
+
|
|
+ blbs $16, $L18
|
|
+ bgt $17, $Main
|
|
+ clr $0
|
|
+ ret
|
|
+ .align 4
|
|
+
|
|
+$L18:
|
|
+ LD $f14, 0*SIZE($1)
|
|
+ LD $f15, 1*SIZE($1)
|
|
+ MUL $f19, $f15, $f13
|
|
+ MUL $f20, $f14, $f10
|
|
+
|
|
+ MUL $f19, $f14, $f12
|
|
+ MUL $f20, $f15, $f11
|
|
+ ADD $f13, $f10, $f26
|
|
+ SUB $f12, $f11, $f27
|
|
+
|
|
+ ST $f26, 1*SIZE($1)
|
|
+ ST $f27, 0*SIZE($1)
|
|
+ ldi $1, 2*SIZE($1)
|
|
+ bgt $17, $Main
|
|
+ .align 4
|
|
+
|
|
+$End:
|
|
+ clr $0
|
|
+ ret
|
|
+ .ident VERSION
|
|
+ .end CNAME
|
|
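The semantics of the beta kernel above, sketched in plain C for reference (illustrative signature, not the kernel's actual interface): scale the m x n complex matrix C by beta column by column, storing zeros outright when beta is exactly zero.

#include <complex.h>

/* Illustrative reference for zgemm_beta: C (m x n, leading dimension ldc)
   is scaled by the complex scalar beta; beta == 0 writes zeros directly. */
static void zgemm_beta_ref(long m, long n, double _Complex beta,
                           double _Complex *c, long ldc)
{
    for (long j = 0; j < n; j++) {
        double _Complex *cj = c + j * ldc;
        if (beta == 0.0) {
            for (long i = 0; i < m; i++) cj[i] = 0.0;   /* the $L13/$L12 zero-fill loops */
        } else {
            for (long i = 0; i < m; i++) cj[i] *= beta; /* the $Main/$L23/$L18 path      */
        }
    }
}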
diff --git a/kernel/sw_64/zgemm_kernel_2x2.S b/kernel/sw_64/zgemm_kernel_2x2.S
|
|
new file mode 100644
|
|
index 0000000..6cf954b
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/zgemm_kernel_2x2.S
|
|
@@ -0,0 +1,1949 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#if !defined(EV4) && !defined(EV5) && !defined(SW6)
|
|
+#error "Architecture is not specified."
|
|
+#endif
|
|
+
|
|
+#ifdef SW6
|
|
+#define PREFETCHSIZE 56
|
|
+#define UNOP unop
|
|
+#endif
|
|
+
|
|
+#ifdef EV5
|
|
+#define PREFETCHSIZE 48
|
|
+#define UNOP
|
|
+#endif
|
|
+
|
|
+#ifdef EV4
|
|
+#define UNOP
|
|
+#endif
|
|
+
|
|
+ .set noat
|
|
+ .set noreorder
|
|
+ .arch sw6a
|
|
+
|
|
+.text
|
|
+ .align 5
|
|
+ .globl CNAME
|
|
+ .ent CNAME
|
|
+
|
|
+#define STACKSIZE 88
|
|
+
|
|
+#define M $16
|
|
+#define N $17
|
|
+#define K $18
|
|
+#define A $21
|
|
+#define B $22
|
|
+#define C $20
|
|
+#define LDC $23
|
|
+
|
|
+#define C1 $19
|
|
+#define C2 $24
|
|
+
|
|
+#define AO $at
|
|
+#define BO $5
|
|
+#define I $6
|
|
+#define J $7
|
|
+#define L $8
|
|
+
|
|
+#define a1 $f16
|
|
+#define a2 $f17
|
|
+#define a3 $f18
|
|
+#define a4 $f19
|
|
+
|
|
+#define b1 $f20
|
|
+#define b2 $f21
|
|
+#define b3 $f22
|
|
+#define b4 $f23
|
|
+
|
|
+#define t1 $f24
|
|
+#define t2 $f25
|
|
+#define t3 $f26
|
|
+#define t4 $f27
|
|
+
|
|
+#define a5 $f28
|
|
+#define a6 $f30
|
|
+#define b5 $f29
|
|
+
|
|
+#define alpha_i $f29
|
|
+#define alpha_r $f30
|
|
+
|
|
+#define c01 $f0
|
|
+#define c02 $f1
|
|
+#define c03 $f2
|
|
+#define c04 $f3
|
|
+
|
|
+#define c05 $f4
|
|
+#define c06 $f5
|
|
+#define c07 $f6
|
|
+#define c08 $f7
|
|
+
|
|
+#define c09 $f8
|
|
+#define c10 $f9
|
|
+#define c11 $f10
|
|
+#define c12 $f11
|
|
+
|
|
+#define c13 $f12
|
|
+#define c14 $f13
|
|
+#define c15 $f14
|
|
+#define c16 $f15
|
|
+
|
|
+#define TMP1 $0
|
|
+#define TMP2 $1
|
|
+#define KK $2
|
|
+#define BB $3
|
|
+#define OFFSET $4
|
|
+
|
|
+#define tmp $9
|
|
+
|
|
+#define ALPHA_R 64($sp)
|
|
+#define ALPHA_I 72($sp)
|
|
+
|
|
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
|
+#define ADD1 ADD
|
|
+#define ADD2 SUB
|
|
+#define ADD3 ADD
|
|
+#define ADD4 ADD
|
|
+#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
|
+#define ADD1 ADD
|
|
+#define ADD2 ADD
|
|
+#define ADD3 SUB
|
|
+#define ADD4 ADD
|
|
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
|
+#define ADD1 ADD
|
|
+#define ADD2 ADD
|
|
+#define ADD3 ADD
|
|
+#define ADD4 SUB
|
|
+#else
|
|
+#define ADD1 ADD
|
|
+#define ADD2 SUB
|
|
+#define ADD3 SUB
|
|
+#define ADD4 SUB
|
|
+#endif
|
|
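+/* ADD1..ADD4 pick the signs with which the four real products of each complex
+   multiply are accumulated, so one loop body covers every conjugation
+   combination (N/T versus R/C) of the two input matrices. */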
+
|
|
+CNAME:
|
|
+ .frame $sp, STACKSIZE, $26, 0
|
|
+
|
|
+#ifdef PROFILE
|
|
+ ldgp $gp, 0($27)
|
|
+ ldi $at, _mcount
|
|
+ jsr $at, ($at), _mcount
|
|
+#endif
|
|
+
|
|
+#ifndef PROFILE
|
|
+ .prologue 0
|
|
+#else
|
|
+ .prologue 1
|
|
+#endif
|
|
+
|
|
+ ldi $sp, -STACKSIZE($sp)
|
|
+
|
|
+ ldl B, 0 + STACKSIZE($sp)
|
|
+ ldl C, 8 + STACKSIZE($sp)
|
|
+ ldl LDC, 16 + STACKSIZE($sp)
|
|
+#ifdef TRMMKERNEL
|
|
+ ldl OFFSET, 24 + STACKSIZE($sp)
|
|
+#endif
|
|
+
|
|
+ sll LDC, ZBASE_SHIFT, LDC
|
|
+
|
|
+ fstd $f2, 0($sp)
|
|
+ fstd $f3, 8($sp)
|
|
+ fstd $f4, 16($sp)
|
|
+ fstd $f5, 24($sp)
|
|
+ fstd $f6, 32($sp)
|
|
+ fstd $f7, 40($sp)
|
|
+ fstd $f8, 48($sp)
|
|
+ fstd $f9, 56($sp)
|
|
+ fstd $f19, ALPHA_R
|
|
+ fstd $f20, ALPHA_I
|
|
+ stl tmp, 80($sp)
|
|
+ cmple M, 0, $0
|
|
+ cmple N, 0, $1
|
|
+ cmple K, 0, $2
|
|
+
|
|
+ or $0, $1, $0
|
|
+ or $0, $2, $0
|
|
+ bne $0, $L999
|
|
+
|
|
+#if defined(TRMMKERNEL) && !defined(LEFT)
|
|
+ subl $31, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+ sra N, 1, J
|
|
+ ble J, $L30
|
|
+ .align 4
|
|
+
|
|
+$L01:
|
|
+ mov C, C1
|
|
+ addl C, LDC, C2
|
|
+ mov A, AO
|
|
+ s4addl K, 0, BB
|
|
+
|
|
+
|
|
+#if defined(TRMMKERNEL) && defined(LEFT)
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+ SXADDQ BB, B, BB
|
|
+ addl C2, LDC, C
|
|
+ unop
|
|
+
|
|
+ sra M, 1, I
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+ fclr c01
|
|
+ fclr c05
|
|
+
|
|
+ ble I, $L20
|
|
+ .align 4
|
|
+
|
|
+$L11:
|
|
+#ifndef EV4
|
|
+ fillcs 0 * SIZE(BB)
|
|
+ fillcs 8 * SIZE(BB)
|
|
+ unop
|
|
+ ldi BB, 16 * SIZE(BB)
|
|
+#endif
|
|
+
|
|
+#if !defined(TRMMKERNEL) || \
|
|
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+
|
|
+#ifdef TRMMKERNEL
|
|
+#ifdef LEFT
|
|
+ addl KK, 2, TMP1
|
|
+#else
|
|
+ addl KK, 2, TMP1
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c10
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c14
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c07
|
|
+
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+ fclr c11
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c15
|
|
+
|
|
+ fillcs 4 * SIZE(C1)
|
|
+ fclr c04
|
|
+#ifndef TRMMKERNEL
|
|
+ ldi L, -2(K)
|
|
+#else
|
|
+ ldi L, -2(TMP1)
|
|
+#endif
|
|
+ fclr c08
|
|
+
|
|
+ fillcs 4 * SIZE(C2)
|
|
+ fclr c12
|
|
+ fclr c16
|
|
+ ble L, $L15
|
|
+#else
|
|
+ sll KK, ZBASE_SHIFT + 1, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ addl B, TMP1, BO
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c10
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c14
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c07
|
|
+
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ fclr c11
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c15
|
|
+
|
|
+ fillcs 4 * SIZE(C1)
|
|
+ fclr c04
|
|
+ ldi L, -2(TMP1)
|
|
+ fclr c08
|
|
+
|
|
+ fillcs 4 * SIZE(C2)
|
|
+ fclr c12
|
|
+ fclr c16
|
|
+ ble L, $L15
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
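+/* Main k-loop: unrolled by two and software pipelined; the numbered groups
+   interleave next-iteration loads with the current multiplies and adds, and
+   the value loaded into a6 is parked in an integer register (FIMOVD/IFMOVD)
+   because a6 also serves as the scratch target of the ADD/fmov pairs. */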
+$L12:
|
|
+/* 1 */
|
|
+ ADD1 c11, t1, a6
|
|
+ fmov a6, c11
|
|
+#ifndef EV4
|
|
+ fillcs PREFETCHSIZE * SIZE(AO)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+ MUL b1, a1, t1
|
|
+#ifndef EV4
|
|
+ fillcs PREFETCHSIZE * SIZE(BO)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD3 c12, t2, a6
|
|
+ fmov a6, c12
|
|
+ unop
|
|
+ MUL b1, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c16, t3, a6
|
|
+ fmov a6, c16
|
|
+ unop
|
|
+ MUL b2, a2, t3
|
|
+ LD a5, 0 * SIZE(AO)
|
|
+
|
|
+ ADD4 c15, t4, a6
|
|
+ fmov a6, c15
|
|
+ unop
|
|
+ MUL b2, a1, t4
|
|
+ LD b5, 0 * SIZE(BO)
|
|
+
|
|
+/* 2 */
|
|
+ ADD1 c01, t1, a6
|
|
+ fmov a6, c01
|
|
+ UNOP
|
|
+ MUL b1, a3, t1
|
|
+ UNOP
|
|
+
|
|
+ ADD3 c02, t2, a6
|
|
+ fmov a6, c02
|
|
+ UNOP
|
|
+ MUL b1, a4, t2
|
|
+ UNOP
|
|
+
|
|
+ ADD2 c06, t3, a6
|
|
+ fmov a6, c06
|
|
+ unop
|
|
+ MUL b2, a4, t3
|
|
+ unop
|
|
+
|
|
+ ADD4 c05, t4, a6
|
|
+ fmov a6, c05
|
|
+ unop
|
|
+ MUL b4, a1, t4
|
|
+ unop
|
|
+
|
|
+/* 3 */
|
|
+ ADD1 c03, t1, a6
|
|
+ fmov a6, c03
|
|
+ unop
|
|
+ MUL b3, a1, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c04, t2, a6
|
|
+ fmov a6, c04
|
|
+ unop
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c08, t3, a6
|
|
+ fmov a6, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD4 c13, t4, a6
|
|
+ fmov a6, c13
|
|
+ unop
|
|
+ MUL b2, a3, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+/* 4 */
|
|
+ ADD1 c09, t1, a6
|
|
+ fmov a6, c09
|
|
+ unop
|
|
+ MUL b3, a3, t1
|
|
+ LD a6, 2 * SIZE(AO)
|
|
+ FIMOVD a6, tmp
|
|
+
|
|
+ ADD3 c10, t2, a6
|
|
+ fmov a6, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD2 c14, t3, a6
|
|
+ fmov a6, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ ADD4 c07, t4, a6
|
|
+ fmov a6, c07
|
|
+ unop
|
|
+ MUL b4, a3, t4
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+
|
|
+/* 5 */
|
|
+ ADD1 c11, t1, a6
|
|
+ fmov a6, c11
|
|
+ unop
|
|
+ MUL b5, a5, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD3 c12, t2, a6
|
|
+ fmov a6, c12
|
|
+ ldi L, -2(L)
|
|
+ MUL b5, a2, t2
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD2 c16, t3, a6
|
|
+ fmov a6, c16
|
|
+ unop
|
|
+ MUL b2, a2, t3
|
|
+ unop
|
|
+
|
|
+ ADD4 c15, t4, a6
|
|
+ fmov a6, c15
|
|
+ unop
|
|
+ MUL b2, a5, t4
|
|
+ unop
|
|
+
|
|
+/* 6 */
|
|
+ ADD1 c01, t1, a6
|
|
+ fmov a6, c01
|
|
+ unop
|
|
+ IFMOVD tmp, a6
|
|
+ MUL b5, a6, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c02, t2, a6
|
|
+ fmov a6, c02
|
|
+ unop
|
|
+ MUL b5, a4, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c06, t3, a6
|
|
+ fmov a6, c06
|
|
+ unop
|
|
+ MUL b2, a4, t3
|
|
+ unop
|
|
+
|
|
+ ADD4 c05, t4, a6
|
|
+ fmov a6, c05
|
|
+ unop
|
|
+ MUL b4, a5, t4
|
|
+ unop
|
|
+
|
|
+/* 7 */
|
|
+ ADD1 c03, t1, a6
|
|
+ fmov a6, c03
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+ MUL b3, a5, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c04, t2, a6
|
|
+ fmov a6, c04
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c08, t3, a6
|
|
+ fmov a6, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, -3 * SIZE(AO)
|
|
+
|
|
+ ADD4 c13, t4, a6
|
|
+ fmov a6, c13
|
|
+ unop
|
|
+ IFMOVD tmp, a6
|
|
+ MUL b2, a6, t4
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+/* 8 */
|
|
+ ADD1 c09, t1, a6
|
|
+ fmov a6, c09
|
|
+ unop
|
|
+ IFMOVD tmp, a6
|
|
+ MUL b3, a6, t1
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c10, t2, a6
|
|
+ fmov a6, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD2 c14, t3, a6
|
|
+ fmov a6, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+
|
|
+ ADD4 c07, t4, a6
|
|
+ fmov a6, c07
|
|
+ IFMOVD tmp, a6
|
|
+ MUL b4, a6, t4
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+ bgt L, $L12
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ ADD1 c11, t1, a6
|
|
+ fmov a6, c11
|
|
+ fldd alpha_r, ALPHA_R
|
|
+ FIMOVD alpha_r, tmp
|
|
+ MUL b1, a1, t1
|
|
+#ifndef TRMMKERNEL
|
|
+ blbs K, $L18
|
|
+#else
|
|
+ blbs TMP1, $L18
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD3 c12, t2, a6
|
|
+ fmov a6, c12
|
|
+ MUL b1, a2, t2
|
|
+ ADD2 c16, t3, a6
|
|
+ fmov a6, c16
|
|
+ MUL b2, a2, t3
|
|
+
|
|
+ ADD4 c15, t4, a6
|
|
+ fmov a6, c15
|
|
+ MUL b2, a1, t4
|
|
+ ADD1 c01, t1, a6
|
|
+ fmov a6, c01
|
|
+ MUL b1, a3, t1
|
|
+
|
|
+ ADD3 c02, t2, a6
|
|
+ fmov a6, c02
|
|
+ unop
|
|
+ MUL b1, a4, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD2 c06, t3, a6
|
|
+ fmov a6, c06
|
|
+ MUL b2, a4, t3
|
|
+ ADD4 c05, t4, a6
|
|
+ fmov a6, c05
|
|
+ MUL b4, a1, t4
|
|
+
|
|
+ ADD1 c03, t1, a6
|
|
+ fmov a6, c03
|
|
+ unop
|
|
+ MUL b3, a1, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD3 c04, t2, a6
|
|
+ fmov a6, c04
|
|
+ unop
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c08, t3, a6
|
|
+ fmov a6, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD4 c13, t4, a6
|
|
+ fmov a6, c13
|
|
+ unop
|
|
+ MUL b2, a3, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD1 c09, t1, a6
|
|
+ fmov a6, c09
|
|
+ unop
|
|
+ MUL b3, a3, t1
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD3 c10, t2, a6
|
|
+ fmov a6, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD2 c14, t3, a6
|
|
+ fmov a6, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+
|
|
+ ADD4 c07, t4, a6
|
|
+ fmov a6, c07
|
|
+ unop
|
|
+ MUL b4, a3, t4
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD1 c11, t1, a6
|
|
+ fmov a6, c11
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL b1, a1, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L18:
|
|
+ ADD3 c12, t2, a6
|
|
+ fmov a6, c12
|
|
+ unop
|
|
+ MUL b1, a2, t2
|
|
+ fldd alpha_i, ALPHA_I
|
|
+
|
|
+ ADD2 c16, t3, a6
|
|
+ fmov a6, c16
|
|
+ unop
|
|
+ MUL b2, a2, t3
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a5, 0 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD4 c15, t4, a6
|
|
+ fmov a6, c15
|
|
+ MUL b2, a1, t4
|
|
+ ADD1 c01, t1, a6
|
|
+ fmov a6, c01
|
|
+ MUL b1, a3, t1
|
|
+
|
|
+ ADD3 c02, t2, a6
|
|
+ fmov a6, c02
|
|
+ unop
|
|
+ MUL b1, a4, t2
|
|
+#ifndef TRMMKERNEL
|
|
+ LD b1, 1 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD2 c06, t3, a6
|
|
+ fmov a6, c06
|
|
+ MUL b2, a4, t3
|
|
+ ADD4 c05, t4, a6
|
|
+ fmov a6, c05
|
|
+ MUL b4, a1, t4
|
|
+
|
|
+ ADD1 c03, t1, a6
|
|
+ fmov a6, c03
|
|
+ unop
|
|
+ MUL b3, a1, t1
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a1, 2 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD3 c04, t2, a6
|
|
+ fmov a6, c04
|
|
+ unop
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c08, t3, a6
|
|
+ fmov a6, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a2, 3 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD4 c13, t4, a6
|
|
+ fmov a6, c13
|
|
+ unop
|
|
+ MUL b2, a3, t4
|
|
+#ifndef TRMMKERNEL
|
|
+ LD b2, 0 * SIZE(C2)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD1 c09, t1, a6
|
|
+ fmov a6, c09
|
|
+ ldi I, -1(I)
|
|
+ MUL b3, a3, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c10, t2, a6
|
|
+ fmov a6, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+#ifndef TRMMKERNEL
|
|
+ LD b3, 1 * SIZE(C2)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD2 c14, t3, a6
|
|
+ fmov a6, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a4, 2 * SIZE(C2)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD4 c07, t4, a6
|
|
+ fmov a6, c07
|
|
+ unop
|
|
+ MUL b4, a3, t4
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a3, 3 * SIZE(C2)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD1 c11, t1, a6
|
|
+ fmov a6, c11
|
|
+ ADD3 c12, t2, a6
|
|
+ fmov a6, c12
|
|
+ ADD2 c16, t3, a6
|
|
+ fmov a6, c16
|
|
+ ADD4 c15, t4, a6
|
|
+ fmov a6, c15
|
|
+
|
|
+ ADD c01, c06, a6
|
|
+ fmov a6, c01
|
|
+ ADD c02, c05, a6
|
|
+ fmov a6, c02
|
|
+ ADD c03, c08, a6
|
|
+ fmov a6, c03
|
|
+ ADD c04, c07, a6
|
|
+ fmov a6, c04
|
|
+
|
|
+ ADD c09, c14, a6
|
|
+ fmov a6, c09
|
|
+ IFMOVD tmp, alpha_r
|
|
+ MUL alpha_r, c01, t1
|
|
+ ADD c10, c13, a6
|
|
+ fmov a6, c10
|
|
+ IFMOVD tmp, alpha_r
|
|
+ MUL alpha_r, c02, t2
|
|
+
|
|
+ ADD c11, c16, a6
|
|
+ fmov a6, c11
|
|
+ IFMOVD tmp, alpha_r
|
|
+ MUL alpha_r, c03, t3
|
|
+ ADD c12, c15, a6
|
|
+ fmov a6, c12
|
|
+ IFMOVD tmp, alpha_r
|
|
+ MUL alpha_r, c04, t4
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD a5, t1, a6
|
|
+ fmov a6, a5
|
|
+ MUL alpha_i, c02, t1
|
|
+ ADD b1, t2, a6
|
|
+ fmov a6, b1
|
|
+ MUL alpha_i, c01, t2
|
|
+
|
|
+ ADD a1, t3, a6
|
|
+ fmov a6, a1
|
|
+ MUL alpha_i, c04, t3
|
|
+ ADD a2, t4, a6
|
|
+ fmov a6, a2
|
|
+ MUL alpha_i, c03, t4
|
|
+#else
|
|
+ ADD $f31, t1, a5
|
|
+ MUL alpha_i, c02, t1
|
|
+ ADD $f31, t2, b1
|
|
+ MUL alpha_i, c01, t2
|
|
+
|
|
+ ADD $f31, t3, a1
|
|
+ MUL alpha_i, c04, t3
|
|
+ ADD $f31, t4, a2
|
|
+ MUL alpha_i, c03, t4
|
|
+#endif
|
|
+
|
|
+ SUB a5, t1, a6
|
|
+ fmov a6, a5
|
|
+ IFMOVD tmp, alpha_r
|
|
+ MUL alpha_r, c09, t1
|
|
+ ADD b1, t2, a6
|
|
+ fmov a6, b1
|
|
+ IFMOVD tmp, alpha_r
|
|
+ MUL alpha_r, c10, t2
|
|
+
|
|
+ SUB a1, t3, a6
|
|
+ fmov a6, a1
|
|
+ IFMOVD tmp, alpha_r
|
|
+ MUL alpha_r, c11, t3
|
|
+ ADD a2, t4, a6
|
|
+ fmov a6, a2
|
|
+ IFMOVD tmp, alpha_r
|
|
+ MUL alpha_r, c12, t4
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD b2, t1, a6
|
|
+ fmov a6, b2
|
|
+ MUL alpha_i, c10, t1
|
|
+ ADD b3, t2, a6
|
|
+ fmov a6, b3
|
|
+ MUL alpha_i, c09, t2
|
|
+
|
|
+ ADD a4, t3, a6
|
|
+ fmov a6, a4
|
|
+ MUL alpha_i, c12, t3
|
|
+ ADD a3, t4, a6
|
|
+ fmov a6, a3
|
|
+ MUL alpha_i, c11, t4
|
|
+#else
|
|
+ ADD $f31, t1, b2
|
|
+ MUL alpha_i, c10, t1
|
|
+ ADD $f31, t2, b3
|
|
+ MUL alpha_i, c09, t2
|
|
+
|
|
+ ADD $f31, t3, a4
|
|
+ MUL alpha_i, c12, t3
|
|
+ ADD $f31, t4, a3
|
|
+ MUL alpha_i, c11, t4
|
|
+#endif
|
|
+
|
|
+ SUB b2, t1, a6
|
|
+ fmov a6, b2
|
|
+ ST a5, 0 * SIZE(C1)
|
|
+ fclr t1
|
|
+ unop
|
|
+
|
|
+ ADD b3, t2, a6
|
|
+ fmov a6, b3
|
|
+ ST b1, 1 * SIZE(C1)
|
|
+ fclr t2
|
|
+ unop
|
|
+
|
|
+ SUB a4, t3, a6
|
|
+ fmov a6, a4
|
|
+ ST a1, 2 * SIZE(C1)
|
|
+ fclr t3
|
|
+ unop
|
|
+
|
|
+ ADD a3, t4, a6
|
|
+ fmov a6, a3
|
|
+ ST a2, 3 * SIZE(C1)
|
|
+ fclr t4
|
|
+ unop
|
|
+
|
|
+ ST b2, 0 * SIZE(C2)
|
|
+ fclr c01
|
|
+ ST b3, 1 * SIZE(C2)
|
|
+ fclr c05
|
|
+
|
|
+ ST a4, 2 * SIZE(C2)
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+ ST a3, 3 * SIZE(C2)
|
|
+ ldi C2, 4 * SIZE(C2)
|
|
+
|
|
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+ subl K, KK, TMP1
|
|
+#ifdef LEFT
|
|
+ subl TMP1, 2, TMP1
|
|
+#else
|
|
+ subl TMP1, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ addl BO, TMP1, BO
|
|
+#endif
|
|
+
|
|
+#if defined(TRMMKERNEL) && defined(LEFT)
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+ bgt I, $L11
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ and M, 1, I
|
|
+ ble I, $L29
|
|
+
|
|
+#if !defined(TRMMKERNEL) || \
|
|
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+
|
|
+#ifdef TRMMKERNEL
|
|
+#ifdef LEFT
|
|
+ addl KK, 1, TMP1
|
|
+#else
|
|
+ addl KK, 2, TMP1
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c10
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c14
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ldi L, -2(K)
|
|
+#else
|
|
+ ldi L, -2(TMP1)
|
|
+#endif
|
|
+ ble L, $L25
|
|
+#else
|
|
+ sll KK, ZBASE_SHIFT + 0, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ sll KK, ZBASE_SHIFT + 1, TMP1
|
|
+ addl B, TMP1, BO
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c10
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c14
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+ ble L, $L25
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L22:
|
|
+ ADD1 c09, t1, a6
|
|
+ fmov a6, c09
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c10, t2, a6
|
|
+ fmov a6, c10
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD4 c13, t3, a6
|
|
+ fmov a6, c13
|
|
+ unop
|
|
+ MUL a1, b2, t3
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+
|
|
+ ADD2 c14, t4, a6
|
|
+ fmov a6, c14
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD b2, -7 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, a6
|
|
+ fmov a6, c01
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c02, t2, a6
|
|
+ fmov a6, c02
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ LD b3, -6 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, a6
|
|
+ fmov a6, c05
|
|
+ unop
|
|
+ MUL a1, b4, t3
|
|
+ LD a1, 2 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, a6
|
|
+ fmov a6, c06
|
|
+ MUL a2, b4, t4
|
|
+ LD b5, -5 * SIZE(BO)
|
|
+
|
|
+ ADD1 c09, t1, a6
|
|
+ fmov a6, c09
|
|
+ unop
|
|
+ MUL a3, b1, t1
|
|
+ LD a2, 3 * SIZE(AO)
|
|
+
|
|
+ ADD3 c10, t2, a6
|
|
+ fmov a6, c10
|
|
+ unop
|
|
+ MUL a4, b1, t2
|
|
+ LD b1, -4 * SIZE(BO)
|
|
+
|
|
+ ADD4 c13, t3, a6
|
|
+ fmov a6, c13
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD2 c14, t4, a6
|
|
+ fmov a6, c14
|
|
+ MUL a4, b2, t4
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, a6
|
|
+ fmov a6, c01
|
|
+ ldi L, -2(L)
|
|
+ MUL a3, b3, t1
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+
|
|
+ ADD3 c02, t2, a6
|
|
+ fmov a6, c02
|
|
+ unop
|
|
+ MUL a4, b3, t2
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, a6
|
|
+ fmov a6, c05
|
|
+ unop
|
|
+ MUL a3, b5, t3
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, a6
|
|
+ fmov a6, c06
|
|
+ MUL a4, b5, t4
|
|
+ LD a4, 1 * SIZE(AO)
|
|
+ bgt L, $L22
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ ADD1 c09, t1, a6
|
|
+ fmov a6, c09
|
|
+ fldd alpha_r, ALPHA_R
|
|
+ FIMOVD alpha_r, tmp
|
|
+ MUL a1, b1, t1
|
|
+#ifndef TRMMKERNEL
|
|
+ blbs K, $L28
|
|
+#else
|
|
+ blbs TMP1, $L28
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD3 c10, t2, a6
|
|
+ fmov a6, c10
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD4 c13, t3, a6
|
|
+ fmov a6, c13
|
|
+ unop
|
|
+ MUL a1, b2, t3
|
|
+ unop
|
|
+
|
|
+ ADD2 c14, t4, a6
|
|
+ fmov a6, c14
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, a6
|
|
+ fmov a6, c01
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c02, t2, a6
|
|
+ fmov a6, c02
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, a6
|
|
+ fmov a6, c05
|
|
+ unop
|
|
+ MUL a1, b4, t3
|
|
+ LD a1, -2 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, a6
|
|
+ fmov a6, c06
|
|
+ unop
|
|
+ MUL a2, b4, t4
|
|
+ LD a2, -1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c09, t1, a6
|
|
+ fmov a6, c09
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L28:
|
|
+ ADD3 c10, t2, a6
|
|
+ fmov a6, c10
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ fldd alpha_i, ALPHA_I
|
|
+
|
|
+ ADD4 c13, t3, a6
|
|
+ fmov a6, c13
|
|
+ unop
|
|
+ MUL a1, b2, t3
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c03, 0 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD2 c14, t4, a6
|
|
+ fmov a6, c14
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c04, 1 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD1 c01, t1, a6
|
|
+ fmov a6, c01
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c11, 0 * SIZE(C2)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD3 c02, t2, a6
|
|
+ fmov a6, c02
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c12, 1 * SIZE(C2)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD4 c05, t3, a6
|
|
+ fmov a6, c05
|
|
+ MUL a1, b4, t3
|
|
+ ADD2 c06, t4, a6
|
|
+ fmov a6, c06
|
|
+ MUL a2, b4, t4
|
|
+
|
|
+ ADD1 c09, t1, a6
|
|
+ fmov a6, c09
|
|
+ ADD3 c10, t2, a6
|
|
+ fmov a6, c10
|
|
+ ADD4 c13, t3, a6
|
|
+ fmov a6, c13
|
|
+ ADD2 c14, t4, a6
|
|
+ fmov a6, c14
|
|
+
|
|
+ ADD c01, c06, a6
|
|
+ fmov a6, c01
|
|
+ ADD c02, c05, a6
|
|
+ fmov a6, c02
|
|
+ ADD c09, c14, a6
|
|
+ fmov a6, c09
|
|
+ ADD c10, c13, a6
|
|
+ fmov a6, c10
|
|
+
|
|
+ IFMOVD tmp, alpha_r
|
|
+ MUL alpha_r, c01, t1
|
|
+ MUL alpha_r, c02, t2
|
|
+ MUL alpha_r, c09, t3
|
|
+ MUL alpha_r, c10, t4
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c03, t1, a6
|
|
+ fmov a6, c03
|
|
+ MUL alpha_i, c02, t1
|
|
+ ADD c04, t2, a6
|
|
+ fmov a6, c04
|
|
+ MUL alpha_i, c01, t2
|
|
+
|
|
+ ADD c11, t3, a6
|
|
+ fmov a6, c11
|
|
+ MUL alpha_i, c10, t3
|
|
+ ADD c12, t4, a6
|
|
+ fmov a6, c12
|
|
+ MUL alpha_i, c09, t4
|
|
+#else
|
|
+ ADD $f31, t1, c03
|
|
+ MUL alpha_i, c02, t1
|
|
+ ADD $f31, t2, c04
|
|
+ MUL alpha_i, c01, t2
|
|
+
|
|
+ ADD $f31, t3, c11
|
|
+ MUL alpha_i, c10, t3
|
|
+ ADD $f31, t4, c12
|
|
+ MUL alpha_i, c09, t4
|
|
+#endif
|
|
+
|
|
+ SUB c03, t1, a6
|
|
+ fmov a6, c03
|
|
+ ADD c04, t2, a6
|
|
+ fmov a6, c04
|
|
+ SUB c11, t3, a6
|
|
+ fmov a6, c11
|
|
+ ADD c12, t4, a6
|
|
+ fmov a6, c12
|
|
+
|
|
+ ST c03, 0 * SIZE(C1)
|
|
+ ST c04, 1 * SIZE(C1)
|
|
+ ST c11, 0 * SIZE(C2)
|
|
+ ST c12, 1 * SIZE(C2)
|
|
+
|
|
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+ subl K, KK, TMP1
|
|
+#ifdef LEFT
|
|
+ subl TMP1, 1, TMP1
|
|
+#else
|
|
+ subl TMP1, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, ZBASE_SHIFT + 0, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#if defined(TRMMKERNEL) && defined(LEFT)
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L29:
|
|
+ mov BO, B
|
|
+ ldi J, -1(J)
|
|
+#if defined(TRMMKERNEL) && !defined(LEFT)
|
|
+ addl KK, 2, KK
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+ bgt J, $L01
|
|
+ .align 4
|
|
+
|
|
+$L30:
|
|
+ and N, 1, J
|
|
+ ble J, $L999
|
|
+
|
|
+ mov C, C1
|
|
+ mov A, AO
|
|
+
|
|
+#if defined(TRMMKERNEL) && defined(LEFT)
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+ sra M, 1, I
|
|
+ ble I, $L50
|
|
+ .align 4
|
|
+
|
|
+$L41:
|
|
+#if !defined(TRMMKERNEL) || \
|
|
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+
|
|
+#ifdef TRMMKERNEL
|
|
+#ifdef LEFT
|
|
+ addl KK, 2, TMP1
|
|
+#else
|
|
+ addl KK, 1, TMP1
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c05
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c06
|
|
+
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c07
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ldi L, -2(K)
|
|
+#else
|
|
+ ldi L, -2(TMP1)
|
|
+#endif
|
|
+ fclr c04
|
|
+ fclr c08
|
|
+ ble L, $L45
|
|
+#else
|
|
+ sll KK, ZBASE_SHIFT + 1, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ sll KK, ZBASE_SHIFT + 0, TMP1
|
|
+ addl B, TMP1, BO
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c05
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c06
|
|
+
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c07
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+ fclr c04
|
|
+ fclr c08
|
|
+ ble L, $L45
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L42:
|
|
+ ADD4 c05, t1, a6
|
|
+ fmov a6, c05
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD2 c06, t2, a6
|
|
+ fmov a6, c06
|
|
+ ldi L, -2(L)
|
|
+ MUL a2, b1, t2
|
|
+ unop
|
|
+
|
|
+ ADD4 c07, t3, a6
|
|
+ fmov a6, c07
|
|
+ unop
|
|
+ MUL a3, b1, t3
|
|
+ unop
|
|
+
|
|
+ ADD2 c08, t4, a6
|
|
+ fmov a6, c08
|
|
+ unop
|
|
+ MUL a4, b1, t4
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, a6
|
|
+ fmov a6, c01
|
|
+ unop
|
|
+ MUL a1, b2, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD3 c02, t2, a6
|
|
+ fmov a6, c02
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ MUL a2, b2, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c03, t3, a6
|
|
+ fmov a6, c03
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c04, t4, a6
|
|
+ fmov a6, c04
|
|
+ unop
|
|
+ MUL a4, b2, t4
|
|
+ LD a5, 3 * SIZE(AO)
|
|
+
|
|
+ ADD4 c05, t1, a6
|
|
+ fmov a6, c05
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ LD b2, -1 * SIZE(BO)
|
|
+
|
|
+ ADD2 c06, t2, a6
|
|
+ fmov a6, c06
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ unop
|
|
+
|
|
+ ADD4 c07, t3, a6
|
|
+ fmov a6, c07
|
|
+ unop
|
|
+ MUL a3, b3, t3
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+
|
|
+ ADD2 c08, t4, a6
|
|
+ fmov a6, c08
|
|
+ unop
|
|
+ MUL a5, b3, t4
|
|
+ LD b3, 0 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, a6
|
|
+ fmov a6, c01
|
|
+ unop
|
|
+ MUL a1, b4, t1
|
|
+ LD a1, -4 * SIZE(AO)
|
|
+
|
|
+ ADD3 c02, t2, a6
|
|
+ fmov a6, c02
|
|
+ unop
|
|
+ MUL a2, b4, t2
|
|
+ LD a2, -3 * SIZE(AO)
|
|
+
|
|
+ ADD1 c03, t3, a6
|
|
+ fmov a6, c03
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+ MUL a3, b4, t3
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c04, t4, a6
|
|
+ fmov a6, c04
|
|
+ MUL a5, b4, t4
|
|
+ LD b4, 1 * SIZE(BO)
|
|
+ bgt L, $L42
|
|
+ .align 4
|
|
+
|
|
+$L45:
|
|
+ ADD4 c05, t1, a6
|
|
+ fmov a6, c05
|
|
+ fldd alpha_r, ALPHA_R
|
|
+ FIMOVD alpha_r, tmp
|
|
+ MUL b1, a1, t1
|
|
+#ifndef TRMMKERNEL
|
|
+ blbs K, $L48
|
|
+#else
|
|
+ blbs TMP1, $L48
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD2 c06, t2, a6
|
|
+ fmov a6, c06
|
|
+ MUL a2, b1, t2
|
|
+ ADD4 c07, t3, a6
|
|
+ fmov a6, c07
|
|
+ MUL a3, b1, t3
|
|
+
|
|
+ ADD2 c08, t4, a6
|
|
+ fmov a6, c08
|
|
+ unop
|
|
+ MUL a4, b1, t4
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, a6
|
|
+ fmov a6, c01
|
|
+ unop
|
|
+ MUL a1, b2, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD3 c02, t2, a6
|
|
+ fmov a6, c02
|
|
+ unop
|
|
+ MUL a2, b2, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c03, t3, a6
|
|
+ fmov a6, c03
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c04, t4, a6
|
|
+ fmov a6, c04
|
|
+ MUL a4, b2, t4
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD4 c05, t1, a6
|
|
+ fmov a6, c05
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L48:
|
|
+ ADD2 c06, t2, a6
|
|
+ fmov a6, c06
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ fldd alpha_i, ALPHA_I
|
|
+
|
|
+ ADD4 c07, t3, a6
|
|
+ fmov a6, c07
|
|
+ ldi I, -1(I)
|
|
+ MUL a3, b1, t3
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c09, 0 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD2 c08, t4, a6
|
|
+ fmov a6, c08
|
|
+ unop
|
|
+ MUL a4, b1, t4
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c10, 1 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD1 c01, t1, a6
|
|
+ fmov a6, c01
|
|
+ unop
|
|
+ MUL a1, b2, t1
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c11, 2 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD3 c02, t2, a6
|
|
+ fmov a6, c02
|
|
+ unop
|
|
+ MUL a2, b2, t2
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c12, 3 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD1 c03, t3, a6
|
|
+ fmov a6, c03
|
|
+ MUL a3, b2, t3
|
|
+ ADD3 c04, t4, a6
|
|
+ fmov a6, c04
|
|
+ MUL a4, b2, t4
|
|
+
|
|
+ ADD4 c05, t1, a6
|
|
+ fmov a6, c05
|
|
+ ADD2 c06, t2, a6
|
|
+ fmov a6, c06
|
|
+ ADD4 c07, t3, a6
|
|
+ fmov a6, c07
|
|
+ ADD2 c08, t4, a6
|
|
+ fmov a6, c08
|
|
+
|
|
+ ADD c01, c06, a6
|
|
+ fmov a6, c01
|
|
+ ADD c02, c05, a6
|
|
+ fmov a6, c02
|
|
+ ADD c03, c08, a6
|
|
+ fmov a6, c03
|
|
+ ADD c04, c07, a6
|
|
+ fmov a6, c04
|
|
+
|
|
+ IFMOVD tmp, alpha_r
|
|
+ MUL alpha_r, c01, t1
|
|
+ MUL alpha_r, c02, t2
|
|
+ MUL alpha_r, c03, t3
|
|
+ MUL alpha_r, c04, t4
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c09, t1, a6
|
|
+ fmov a6, c09
|
|
+ MUL alpha_i, c02, t1
|
|
+ ADD c10, t2, a6
|
|
+ fmov a6, c10
|
|
+ MUL alpha_i, c01, t2
|
|
+
|
|
+ ADD c11, t3, a6
|
|
+ fmov a6, c11
|
|
+ MUL alpha_i, c04, t3
|
|
+ ADD c12, t4, a6
|
|
+ fmov a6, c12
|
|
+ MUL alpha_i, c03, t4
|
|
+#else
|
|
+ ADD $f31, t1, c09
|
|
+ MUL alpha_i, c02, t1
|
|
+ ADD $f31, t2, c10
|
|
+ MUL alpha_i, c01, t2
|
|
+
|
|
+ ADD $f31, t3, c11
|
|
+ MUL alpha_i, c04, t3
|
|
+ ADD $f31, t4, c12
|
|
+ MUL alpha_i, c03, t4
|
|
+#endif
|
|
+
|
|
+ SUB c09, t1, a6
|
|
+ fmov a6, c09
|
|
+ ADD c10, t2, a6
|
|
+ fmov a6, c10
|
|
+ SUB c11, t3, a6
|
|
+ fmov a6, c11
|
|
+ ADD c12, t4, a6
|
|
+ fmov a6, c12
|
|
+
|
|
+ ST c09, 0 * SIZE(C1)
|
|
+ ST c10, 1 * SIZE(C1)
|
|
+ ST c11, 2 * SIZE(C1)
|
|
+ ST c12, 3 * SIZE(C1)
|
|
+
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+
|
|
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+ subl K, KK, TMP1
|
|
+#ifdef LEFT
|
|
+ subl TMP1, 2, TMP1
|
|
+#else
|
|
+ subl TMP1, 1, TMP1
|
|
+#endif
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT + 0, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#if defined(TRMMKERNEL) && defined(LEFT)
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+ bgt I, $L41
|
|
+ .align 4
|
|
+
|
|
+$L50:
|
|
+ and M, 1, I
|
|
+ ble I, $L999
|
|
+
|
|
+#if !defined(TRMMKERNEL) || \
|
|
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+
|
|
+#ifdef TRMMKERNEL
|
|
+#ifdef LEFT
|
|
+ addl KK, 1, TMP1
|
|
+#else
|
|
+ addl KK, 1, TMP1
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c05
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c06
|
|
+
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ldi L, -2(K)
|
|
+#else
|
|
+ ldi L, -2(TMP1)
|
|
+#endif
|
|
+ ble L, $L55
|
|
+#else
|
|
+ sll KK, ZBASE_SHIFT + 0, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ addl B, TMP1, BO
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c06
|
|
+
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+ ble L, $L55
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L52:
|
|
+ ADD1 c01, t1, a6
|
|
+ fmov a6, c01
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c02, t2, a6
|
|
+ fmov a6, c02
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, a6
|
|
+ fmov a6, c05
|
|
+ ldi L, -2(L)
|
|
+ MUL a1, b2, t3
|
|
+ LD a1, -2 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, a6
|
|
+ fmov a6, c06
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD a2, -1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c01, t1, a6
|
|
+ fmov a6, c01
|
|
+ LD b2, 3 * SIZE(BO)
|
|
+ MUL a3, b3, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD3 c02, t2, a6
|
|
+ fmov a6, c02
|
|
+ unop
|
|
+ MUL a4, b3, t2
|
|
+ LD b3, 0 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, a6
|
|
+ fmov a6, c05
|
|
+ unop
|
|
+ MUL a3, b4, t3
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, a6
|
|
+ fmov a6, c06
|
|
+ MUL a4, b4, t4
|
|
+ LD b4, 1 * SIZE(BO)
|
|
+ unop
|
|
+
|
|
+ LD a4, 1 * SIZE(AO)
|
|
+ unop
|
|
+ unop
|
|
+ bgt L, $L52
|
|
+ .align 4
|
|
+
|
|
+$L55:
|
|
+ ADD1 c01, t1, a6
|
|
+ fmov a6, c01
|
|
+ fldd alpha_r, ALPHA_R
|
|
+ FIMOVD alpha_r, tmp
|
|
+ MUL a1, b1, t1
|
|
+#ifndef TRMMKERNEL
|
|
+ blbs K, $L58
|
|
+#else
|
|
+ blbs TMP1, $L58
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD3 c02, t2, a6
|
|
+ fmov a6, c02
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, a6
|
|
+ fmov a6, c05
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ MUL a1, b2, t3
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, a6
|
|
+ fmov a6, c06
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c01, t1, a6
|
|
+ fmov a6, c01
|
|
+ LD b2, -1 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ .align 4
|
|
+
|
|
+$L58:
|
|
+ ADD3 c02, t2, a6
|
|
+ fmov a6, c02
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ fldd alpha_i, ALPHA_I
|
|
+
|
|
+ ADD4 c05, t3, a6
|
|
+ fmov a6, c05
|
|
+ unop
|
|
+ MUL a1, b2, t3
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c03, 0 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD2 c06, t4, a6
|
|
+ fmov a6, c06
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c04, 1 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD1 c01, t1, a6
|
|
+ fmov a6, c01
|
|
+ ADD3 c02, t2, a6
|
|
+ fmov a6, c02
|
|
+ ADD4 c05, t3, a6
|
|
+ fmov a6, c05
|
|
+ ADD2 c06, t4, a6
|
|
+ fmov a6, c06
|
|
+
|
|
+ ADD c01, c06, a6
|
|
+ fmov a6, c01
|
|
+ ADD c02, c05, a6
|
|
+ fmov a6, c02
|
|
+
|
|
+ IFMOVD tmp, alpha_r
|
|
+ MUL alpha_r, c01, t1
|
|
+ MUL alpha_r, c02, t2
|
|
+ MUL alpha_i, c02, t3
|
|
+ MUL alpha_i, c01, t4
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c03, t1, a6
|
|
+ fmov a6, c03
|
|
+ ADD c04, t2, a6
|
|
+ fmov a6, c04
|
|
+#else
|
|
+ ADD $f31, t1, c03
|
|
+ ADD $f31, t2, c04
|
|
+#endif
|
|
+
|
|
+ SUB c03, t3, a6
|
|
+ fmov a6, c03
|
|
+ ADD c04, t4, a6
|
|
+ fmov a6, c04
|
|
+
|
|
+ ST c03, 0 * SIZE(C1)
|
|
+ ST c04, 1 * SIZE(C1)
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 8($sp)
|
|
+ fldd $f4, 16($sp)
|
|
+ fldd $f5, 24($sp)
|
|
+ fldd $f6, 32($sp)
|
|
+ fldd $f7, 40($sp)
|
|
+ fldd $f8, 48($sp)
|
|
+ fldd $f9, 56($sp)
|
|
+ ldl $9, 80($sp)
|
|
+ clr $0
|
|
+ ldi $sp, STACKSIZE($sp)
|
|
+ ret
|
|
+ .ident VERSION
|
|
+ .end CNAME
|
|
diff --git a/kernel/sw_64/zgemm_kernel_2x2.S.bak b/kernel/sw_64/zgemm_kernel_2x2.S.bak
|
|
new file mode 100644
|
|
index 0000000..2133673
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/zgemm_kernel_2x2.S.bak
|
|
@@ -0,0 +1,1704 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#if !defined(SW2B)
|
|
+#error "Architecture is not specified."
|
|
+#endif
|
|
+
|
|
+#ifdef SW2B
|
|
+#define PREFETCHSIZE 56
|
|
+#define UNOP unop
|
|
+#endif
|
|
+
|
|
+
|
|
+ .set noat
|
|
+ .set noreorder
|
|
+ .arch ev6
|
|
+
|
|
+.text
|
|
+ .align 5
|
|
+ .globl CNAME
|
|
+ .ent CNAME
|
|
+
|
|
+#define STACKSIZE 80
|
|
+
|
|
+#define M $16
|
|
+#define N $17
|
|
+#define K $18
|
|
+#define A $21
|
|
+#define B $22
|
|
+#define C $20
|
|
+#define LDC $23
|
|
+
|
|
+#define C1 $19
|
|
+#define C2 $24
|
|
+
|
|
+#define AO $at
|
|
+#define BO $5
|
|
+#define I $6
|
|
+#define J $7
|
|
+#define L $8
|
|
+
|
|
+#define a1 $f16
|
|
+#define a2 $f17
|
|
+#define a3 $f18
|
|
+#define a4 $f19
|
|
+
|
|
+#define b1 $f20
|
|
+#define b2 $f21
|
|
+#define b3 $f22
|
|
+#define b4 $f23
|
|
+
|
|
+#define t1 $f24
|
|
+#define t2 $f25
|
|
+#define t3 $f26
|
|
+#define t4 $f27
|
|
+
|
|
+#define a5 $f28
|
|
+#define a6 $f30
|
|
+#define b5 $f29
|
|
+
|
|
+#define alpha_i $f29
|
|
+#define alpha_r $f30
|
|
+
|
|
+#define c01 $f0
|
|
+#define c02 $f1
|
|
+#define c03 $f2
|
|
+#define c04 $f3
|
|
+
|
|
+#define c05 $f4
|
|
+#define c06 $f5
|
|
+#define c07 $f6
|
|
+#define c08 $f7
|
|
+
|
|
+#define c09 $f8
|
|
+#define c10 $f9
|
|
+#define c11 $f10
|
|
+#define c12 $f11
|
|
+
|
|
+#define c13 $f12
|
|
+#define c14 $f13
|
|
+#define c15 $f14
|
|
+#define c16 $f15
|
|
+
|
|
+#define TMP1 $0
|
|
+#define TMP2 $1
|
|
+#define KK $2
|
|
+#define BB $3
|
|
+#define OFFSET $4
|
|
+
|
|
+#define ALPHA_R 64($sp)
|
|
+#define ALPHA_I 72($sp)
|
|
+
|
|
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
|
+#define ADD1 ADD
|
|
+#define ADD2 SUB
|
|
+#define ADD3 ADD
|
|
+#define ADD4 ADD
|
|
+#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
|
+#define ADD1 ADD
|
|
+#define ADD2 ADD
|
|
+#define ADD3 SUB
|
|
+#define ADD4 ADD
|
|
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
|
+#define ADD1 ADD
|
|
+#define ADD2 ADD
|
|
+#define ADD3 ADD
|
|
+#define ADD4 SUB
|
|
+#else
|
|
+#define ADD1 ADD
|
|
+#define ADD2 SUB
|
|
+#define ADD3 SUB
|
|
+#define ADD4 SUB
|
|
+#endif
|
|
+
|
|
+CNAME:
|
|
+ .frame $sp, STACKSIZE, $26, 0
|
|
+
|
|
+#ifdef PROFILE
|
|
+ ldgp $gp, 0($27)
|
|
+ ldi $at, _mcount
|
|
+ jsr $at, ($at), _mcount
|
|
+#endif
|
|
+
|
|
+#ifndef PROFILE
|
|
+ .prologue 0
|
|
+#else
|
|
+ .prologue 1
|
|
+#endif
|
|
+
|
|
+ ldi $sp, -STACKSIZE($sp)
|
|
+
|
|
+ ldl B, 0 + STACKSIZE($sp)
|
|
+ ldl C, 8 + STACKSIZE($sp)
|
|
+ ldl LDC, 16 + STACKSIZE($sp)
|
|
+#ifdef TRMMKERNEL
|
|
+ ldl OFFSET, 24 + STACKSIZE($sp)
|
|
+#endif
|
|
+
|
|
+ sll LDC, ZBASE_SHIFT, LDC
|
|
+
|
|
+ fstd $f2, 0($sp)
|
|
+ fstd $f3, 8($sp)
|
|
+ fstd $f4, 16($sp)
|
|
+ fstd $f5, 24($sp)
|
|
+ fstd $f6, 32($sp)
|
|
+ fstd $f7, 40($sp)
|
|
+ fstd $f8, 48($sp)
|
|
+ fstd $f9, 56($sp)
|
|
+ fstd $f19, ALPHA_R
|
|
+ fstd $f20, ALPHA_I
|
|
+
|
|
+ cmple M, 0, $0
|
|
+ cmple N, 0, $1
|
|
+ cmple K, 0, $2
|
|
+
|
|
+ or $0, $1, $0
|
|
+ or $0, $2, $0
|
|
+ bne $0, $L999
|
|
+
|
|
+#if defined(TRMMKERNEL) && !defined(LEFT)
|
|
+ subl $31, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+ sra N, 1, J
|
|
+ ble J, $L30
|
|
+ .align 4
|
|
+
|
|
+$L01:
|
|
+ mov C, C1
|
|
+ addl C, LDC, C2
|
|
+ mov A, AO
|
|
+ s4addl K, 0, BB
|
|
+
|
|
+
|
|
+#if defined(TRMMKERNEL) && defined(LEFT)
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+ SXADDQ BB, B, BB
|
|
+ addl C2, LDC, C
|
|
+ unop
|
|
+
|
|
+ sra M, 1, I
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+ fclr c01
|
|
+ fclr c05
|
|
+
|
|
+ ble I, $L20
|
|
+ .align 4
|
|
+
|
|
+$L11:
|
|
+#ifndef EV4
|
|
+ fillcs 0 * SIZE(BB)
|
|
+ fillcs 8 * SIZE(BB)
|
|
+ unop
|
|
+ ldi BB, 16 * SIZE(BB)
|
|
+#endif
|
|
+
|
|
+#if !defined(TRMMKERNEL) || \
|
|
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+
|
|
+#ifdef TRMMKERNEL
|
|
+#ifdef LEFT
|
|
+ addl KK, 2, TMP1
|
|
+#else
|
|
+ addl KK, 2, TMP1
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c10
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c14
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c07
|
|
+
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+ fclr c11
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c15
|
|
+
|
|
+ fillcs 4 * SIZE(C1)
|
|
+ fclr c04
|
|
+#ifndef TRMMKERNEL
|
|
+ ldi L, -2(K)
|
|
+#else
|
|
+ ldi L, -2(TMP1)
|
|
+#endif
|
|
+ fclr c08
|
|
+
|
|
+ fillcs 4 * SIZE(C2)
|
|
+ fclr c12
|
|
+ fclr c16
|
|
+ ble L, $L15
|
|
+#else
|
|
+ sll KK, ZBASE_SHIFT + 1, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ addl B, TMP1, BO
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c10
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c14
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c07
|
|
+
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ fclr c11
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c15
|
|
+
|
|
+ fillcs 4 * SIZE(C1)
|
|
+ fclr c04
|
|
+ ldi L, -2(TMP1)
|
|
+ fclr c08
|
|
+
|
|
+ fillcs 4 * SIZE(C2)
|
|
+ fclr c12
|
|
+ fclr c16
|
|
+ ble L, $L15
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L12:
|
|
+/* 1 */
|
|
+ ADD1 c11, t1, c11
|
|
+#ifndef EV4
|
|
+ fillcs PREFETCHSIZE * SIZE(AO)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+ MUL b1, a1, t1
|
|
+#ifndef EV4
|
|
+ fillcs PREFETCHSIZE * SIZE(BO)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD3 c12, t2, c12
|
|
+ unop
|
|
+ MUL b1, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c16, t3, c16
|
|
+ unop
|
|
+ MUL b2, a2, t3
|
|
+ LD a5, 0 * SIZE(AO)
|
|
+
|
|
+ ADD4 c15, t4, c15
|
|
+ unop
|
|
+ MUL b2, a1, t4
|
|
+ LD b5, 0 * SIZE(BO)
|
|
+
|
|
+/* 2 */
|
|
+ ADD1 c01, t1, c01
|
|
+ UNOP
|
|
+ MUL b1, a3, t1
|
|
+ UNOP
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ UNOP
|
|
+ MUL b1, a4, t2
|
|
+ UNOP
|
|
+
|
|
+ ADD2 c06, t3, c06
|
|
+ unop
|
|
+ MUL b2, a4, t3
|
|
+ unop
|
|
+
|
|
+ ADD4 c05, t4, c05
|
|
+ unop
|
|
+ MUL b4, a1, t4
|
|
+ unop
|
|
+
|
|
+/* 3 */
|
|
+ ADD1 c03, t1, c03
|
|
+ unop
|
|
+ MUL b3, a1, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c04, t2, c04
|
|
+ unop
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c08, t3, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD4 c13, t4, c13
|
|
+ unop
|
|
+ MUL b2, a3, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+/* 4 */
|
|
+ ADD1 c09, t1, c09
|
|
+ unop
|
|
+ MUL b3, a3, t1
|
|
+ LD a6, 2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c10, t2, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD2 c14, t3, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ ADD4 c07, t4, c07
|
|
+ unop
|
|
+ MUL b4, a3, t4
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+
|
|
+/* 5 */
|
|
+ ADD1 c11, t1, c11
|
|
+ unop
|
|
+ MUL b5, a5, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD3 c12, t2, c12
|
|
+ ldi L, -2(L)
|
|
+ MUL b5, a2, t2
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD2 c16, t3, c16
|
|
+ unop
|
|
+ MUL b2, a2, t3
|
|
+ unop
|
|
+
|
|
+ ADD4 c15, t4, c15
|
|
+ unop
|
|
+ MUL b2, a5, t4
|
|
+ unop
|
|
+
|
|
+/* 6 */
|
|
+ ADD1 c01, t1, c01
|
|
+ unop
|
|
+ MUL b5, a6, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL b5, a4, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c06, t3, c06
|
|
+ unop
|
|
+ MUL b2, a4, t3
|
|
+ unop
|
|
+
|
|
+ ADD4 c05, t4, c05
|
|
+ unop
|
|
+ MUL b4, a5, t4
|
|
+ unop
|
|
+
|
|
+/* 7 */
|
|
+ ADD1 c03, t1, c03
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+ MUL b3, a5, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c04, t2, c04
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c08, t3, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, -3 * SIZE(AO)
|
|
+
|
|
+ ADD4 c13, t4, c13
|
|
+ unop
|
|
+ MUL b2, a6, t4
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+/* 8 */
|
|
+ ADD1 c09, t1, c09
|
|
+ unop
|
|
+ MUL b3, a6, t1
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c10, t2, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD2 c14, t3, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+
|
|
+ ADD4 c07, t4, c07
|
|
+ MUL b4, a6, t4
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+ bgt L, $L12
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ ADD1 c11, t1, c11
|
|
+ fldd alpha_r, ALPHA_R
|
|
+ MUL b1, a1, t1
|
|
+#ifndef TRMMKERNEL
|
|
+ blbs K, $L18
|
|
+#else
|
|
+ blbs TMP1, $L18
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD3 c12, t2, c12
|
|
+ MUL b1, a2, t2
|
|
+ ADD2 c16, t3, c16
|
|
+ MUL b2, a2, t3
|
|
+
|
|
+ ADD4 c15, t4, c15
|
|
+ MUL b2, a1, t4
|
|
+ ADD1 c01, t1, c01
|
|
+ MUL b1, a3, t1
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL b1, a4, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD2 c06, t3, c06
|
|
+ MUL b2, a4, t3
|
|
+ ADD4 c05, t4, c05
|
|
+ MUL b4, a1, t4
|
|
+
|
|
+ ADD1 c03, t1, c03
|
|
+ unop
|
|
+ MUL b3, a1, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD3 c04, t2, c04
|
|
+ unop
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c08, t3, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD4 c13, t4, c13
|
|
+ unop
|
|
+ MUL b2, a3, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD1 c09, t1, c09
|
|
+ unop
|
|
+ MUL b3, a3, t1
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD3 c10, t2, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD2 c14, t3, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+
|
|
+ ADD4 c07, t4, c07
|
|
+ unop
|
|
+ MUL b4, a3, t4
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD1 c11, t1, c11
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL b1, a1, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L18:
|
|
+ ADD3 c12, t2, c12
|
|
+ unop
|
|
+ MUL b1, a2, t2
|
|
+ fldd alpha_i, ALPHA_I
|
|
+
|
|
+ ADD2 c16, t3, c16
|
|
+ unop
|
|
+ MUL b2, a2, t3
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a5, 0 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD4 c15, t4, c15
|
|
+ MUL b2, a1, t4
|
|
+ ADD1 c01, t1, c01
|
|
+ MUL b1, a3, t1
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL b1, a4, t2
|
|
+#ifndef TRMMKERNEL
|
|
+ LD b1, 1 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD2 c06, t3, c06
|
|
+ MUL b2, a4, t3
|
|
+ ADD4 c05, t4, c05
|
|
+ MUL b4, a1, t4
|
|
+
|
|
+ ADD1 c03, t1, c03
|
|
+ unop
|
|
+ MUL b3, a1, t1
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a1, 2 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD3 c04, t2, c04
|
|
+ unop
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c08, t3, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a2, 3 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD4 c13, t4, c13
|
|
+ unop
|
|
+ MUL b2, a3, t4
|
|
+#ifndef TRMMKERNEL
|
|
+ LD b2, 0 * SIZE(C2)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD1 c09, t1, c09
|
|
+ ldi I, -1(I)
|
|
+ MUL b3, a3, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c10, t2, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+#ifndef TRMMKERNEL
|
|
+ LD b3, 1 * SIZE(C2)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD2 c14, t3, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a4, 2 * SIZE(C2)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD4 c07, t4, c07
|
|
+ unop
|
|
+ MUL b4, a3, t4
|
|
+#ifndef TRMMKERNEL
|
|
+ LD a3, 3 * SIZE(C2)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD1 c11, t1, c11
|
|
+ ADD3 c12, t2, c12
|
|
+ ADD2 c16, t3, c16
|
|
+ ADD4 c15, t4, c15
|
|
+
|
|
+ ADD c01, c06, c01
|
|
+ ADD c02, c05, c02
|
|
+ ADD c03, c08, c03
|
|
+ ADD c04, c07, c04
|
|
+
|
|
+ ADD c09, c14, c09
|
|
+ MUL alpha_r, c01, t1
|
|
+ ADD c10, c13, c10
|
|
+ MUL alpha_r, c02, t2
|
|
+
|
|
+ ADD c11, c16, c11
|
|
+ MUL alpha_r, c03, t3
|
|
+ ADD c12, c15, c12
|
|
+ MUL alpha_r, c04, t4
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD a5, t1, a5
|
|
+ MUL alpha_i, c02, t1
|
|
+ ADD b1, t2, b1
|
|
+ MUL alpha_i, c01, t2
|
|
+
|
|
+ ADD a1, t3, a1
|
|
+ MUL alpha_i, c04, t3
|
|
+ ADD a2, t4, a2
|
|
+ MUL alpha_i, c03, t4
|
|
+#else
|
|
+ ADD $f31, t1, a5
|
|
+ MUL alpha_i, c02, t1
|
|
+ ADD $f31, t2, b1
|
|
+ MUL alpha_i, c01, t2
|
|
+
|
|
+ ADD $f31, t3, a1
|
|
+ MUL alpha_i, c04, t3
|
|
+ ADD $f31, t4, a2
|
|
+ MUL alpha_i, c03, t4
|
|
+#endif
|
|
+
|
|
+ SUB a5, t1, a5
|
|
+ MUL alpha_r, c09, t1
|
|
+ ADD b1, t2, b1
|
|
+ MUL alpha_r, c10, t2
|
|
+
|
|
+ SUB a1, t3, a1
|
|
+ MUL alpha_r, c11, t3
|
|
+ ADD a2, t4, a2
|
|
+ MUL alpha_r, c12, t4
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD b2, t1, b2
|
|
+ MUL alpha_i, c10, t1
|
|
+ ADD b3, t2, b3
|
|
+ MUL alpha_i, c09, t2
|
|
+
|
|
+ ADD a4, t3, a4
|
|
+ MUL alpha_i, c12, t3
|
|
+ ADD a3, t4, a3
|
|
+ MUL alpha_i, c11, t4
|
|
+#else
|
|
+ ADD $f31, t1, b2
|
|
+ MUL alpha_i, c10, t1
|
|
+ ADD $f31, t2, b3
|
|
+ MUL alpha_i, c09, t2
|
|
+
|
|
+ ADD $f31, t3, a4
|
|
+ MUL alpha_i, c12, t3
|
|
+ ADD $f31, t4, a3
|
|
+ MUL alpha_i, c11, t4
|
|
+#endif
|
|
+
|
|
+ SUB b2, t1, b2
|
|
+ ST a5, 0 * SIZE(C1)
|
|
+ fclr t1
|
|
+ unop
|
|
+
|
|
+ ADD b3, t2, b3
|
|
+ ST b1, 1 * SIZE(C1)
|
|
+ fclr t2
|
|
+ unop
|
|
+
|
|
+ SUB a4, t3, a4
|
|
+ ST a1, 2 * SIZE(C1)
|
|
+ fclr t3
|
|
+ unop
|
|
+
|
|
+ ADD a3, t4, a3
|
|
+ ST a2, 3 * SIZE(C1)
|
|
+ fclr t4
|
|
+ unop
|
|
+
|
|
+ ST b2, 0 * SIZE(C2)
|
|
+ fclr c01
|
|
+ ST b3, 1 * SIZE(C2)
|
|
+ fclr c05
|
|
+
|
|
+ ST a4, 2 * SIZE(C2)
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+ ST a3, 3 * SIZE(C2)
|
|
+ ldi C2, 4 * SIZE(C2)
|
|
+
|
|
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+ subl K, KK, TMP1
|
|
+#ifdef LEFT
|
|
+ subl TMP1, 2, TMP1
|
|
+#else
|
|
+ subl TMP1, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ addl BO, TMP1, BO
|
|
+#endif
|
|
+
|
|
+#if defined(TRMMKERNEL) && defined(LEFT)
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+ bgt I, $L11
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ and M, 1, I
|
|
+ ble I, $L29
|
|
+
|
|
+#if !defined(TRMMKERNEL) || \
|
|
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+
|
|
+#ifdef TRMMKERNEL
|
|
+#ifdef LEFT
|
|
+ addl KK, 1, TMP1
|
|
+#else
|
|
+ addl KK, 2, TMP1
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c10
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c14
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ldi L, -2(K)
|
|
+#else
|
|
+ ldi L, -2(TMP1)
|
|
+#endif
|
|
+ ble L, $L25
|
|
+#else
|
|
+ sll KK, ZBASE_SHIFT + 0, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ sll KK, ZBASE_SHIFT + 1, TMP1
|
|
+ addl B, TMP1, BO
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c10
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c14
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+ ble L, $L25
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L22:
|
|
+ ADD1 c09, t1, c09
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c10, t2, c10
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD4 c13, t3, c13
|
|
+ unop
|
|
+ MUL a1, b2, t3
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+
|
|
+ ADD2 c14, t4, c14
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD b2, -7 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ LD b3, -6 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, c05
|
|
+ unop
|
|
+ MUL a1, b4, t3
|
|
+ LD a1, 2 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, c06
|
|
+ MUL a2, b4, t4
|
|
+ LD b5, -5 * SIZE(BO)
|
|
+
|
|
+ ADD1 c09, t1, c09
|
|
+ unop
|
|
+ MUL a3, b1, t1
|
|
+ LD a2, 3 * SIZE(AO)
|
|
+
|
|
+ ADD3 c10, t2, c10
|
|
+ unop
|
|
+ MUL a4, b1, t2
|
|
+ LD b1, -4 * SIZE(BO)
|
|
+
|
|
+ ADD4 c13, t3, c13
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD2 c14, t4, c14
|
|
+ MUL a4, b2, t4
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ ldi L, -2(L)
|
|
+ MUL a3, b3, t1
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL a4, b3, t2
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, c05
|
|
+ unop
|
|
+ MUL a3, b5, t3
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, c06
|
|
+ MUL a4, b5, t4
|
|
+ LD a4, 1 * SIZE(AO)
|
|
+ bgt L, $L22
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ ADD1 c09, t1, c09
|
|
+ fldd alpha_r, ALPHA_R
|
|
+ MUL a1, b1, t1
|
|
+#ifndef TRMMKERNEL
|
|
+ blbs K, $L28
|
|
+#else
|
|
+ blbs TMP1, $L28
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD3 c10, t2, c10
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD4 c13, t3, c13
|
|
+ unop
|
|
+ MUL a1, b2, t3
|
|
+ unop
|
|
+
|
|
+ ADD2 c14, t4, c14
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, c05
|
|
+ unop
|
|
+ MUL a1, b4, t3
|
|
+ LD a1, -2 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, c06
|
|
+ unop
|
|
+ MUL a2, b4, t4
|
|
+ LD a2, -1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c09, t1, c09
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L28:
|
|
+ ADD3 c10, t2, c10
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ fldd alpha_i, ALPHA_I
|
|
+
|
|
+ ADD4 c13, t3, c13
|
|
+ unop
|
|
+ MUL a1, b2, t3
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c03, 0 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD2 c14, t4, c14
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c04, 1 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c11, 0 * SIZE(C2)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c12, 1 * SIZE(C2)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD4 c05, t3, c05
|
|
+ MUL a1, b4, t3
|
|
+ ADD2 c06, t4, c06
|
|
+ MUL a2, b4, t4
|
|
+
|
|
+ ADD1 c09, t1, c09
|
|
+ ADD3 c10, t2, c10
|
|
+ ADD4 c13, t3, c13
|
|
+ ADD2 c14, t4, c14
|
|
+
|
|
+ ADD c01, c06, c01
|
|
+ ADD c02, c05, c02
|
|
+ ADD c09, c14, c09
|
|
+ ADD c10, c13, c10
|
|
+
|
|
+ MUL alpha_r, c01, t1
|
|
+ MUL alpha_r, c02, t2
|
|
+ MUL alpha_r, c09, t3
|
|
+ MUL alpha_r, c10, t4
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c03, t1, c03
|
|
+ MUL alpha_i, c02, t1
|
|
+ ADD c04, t2, c04
|
|
+ MUL alpha_i, c01, t2
|
|
+
|
|
+ ADD c11, t3, c11
|
|
+ MUL alpha_i, c10, t3
|
|
+ ADD c12, t4, c12
|
|
+ MUL alpha_i, c09, t4
|
|
+#else
|
|
+ ADD $f31, t1, c03
|
|
+ MUL alpha_i, c02, t1
|
|
+ ADD $f31, t2, c04
|
|
+ MUL alpha_i, c01, t2
|
|
+
|
|
+ ADD $f31, t3, c11
|
|
+ MUL alpha_i, c10, t3
|
|
+ ADD $f31, t4, c12
|
|
+ MUL alpha_i, c09, t4
|
|
+#endif
|
|
+
|
|
+ SUB c03, t1, c03
|
|
+ ADD c04, t2, c04
|
|
+ SUB c11, t3, c11
|
|
+ ADD c12, t4, c12
|
|
+
|
|
+ ST c03, 0 * SIZE(C1)
|
|
+ ST c04, 1 * SIZE(C1)
|
|
+ ST c11, 0 * SIZE(C2)
|
|
+ ST c12, 1 * SIZE(C2)
|
|
+
|
|
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+ subl K, KK, TMP1
|
|
+#ifdef LEFT
|
|
+ subl TMP1, 1, TMP1
|
|
+#else
|
|
+ subl TMP1, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, ZBASE_SHIFT + 0, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#if defined(TRMMKERNEL) && defined(LEFT)
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L29:
|
|
+ mov BO, B
|
|
+ ldi J, -1(J)
|
|
+#if defined(TRMMKERNEL) && !defined(LEFT)
|
|
+ addl KK, 2, KK
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+ bgt J, $L01
|
|
+ .align 4
|
|
+
|
|
+$L30:
|
|
+ and N, 1, J
|
|
+ ble J, $L999
|
|
+
|
|
+ mov C, C1
|
|
+ mov A, AO
|
|
+
|
|
+#if defined(TRMMKERNEL) && defined(LEFT)
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+ sra M, 1, I
|
|
+ ble I, $L50
|
|
+ .align 4
|
|
+
|
|
+$L41:
|
|
+#if !defined(TRMMKERNEL) || \
|
|
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+
|
|
+#ifdef TRMMKERNEL
|
|
+#ifdef LEFT
|
|
+ addl KK, 2, TMP1
|
|
+#else
|
|
+ addl KK, 1, TMP1
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c05
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c06
|
|
+
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c07
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ldi L, -2(K)
|
|
+#else
|
|
+ ldi L, -2(TMP1)
|
|
+#endif
|
|
+ fclr c04
|
|
+ fclr c08
|
|
+ ble L, $L45
|
|
+#else
|
|
+ sll KK, ZBASE_SHIFT + 1, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ sll KK, ZBASE_SHIFT + 0, TMP1
|
|
+ addl B, TMP1, BO
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c05
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c06
|
|
+
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c07
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+ fclr c04
|
|
+ fclr c08
|
|
+ ble L, $L45
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L42:
|
|
+ ADD4 c05, t1, c05
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD2 c06, t2, c06
|
|
+ ldi L, -2(L)
|
|
+ MUL a2, b1, t2
|
|
+ unop
|
|
+
|
|
+ ADD4 c07, t3, c07
|
|
+ unop
|
|
+ MUL a3, b1, t3
|
|
+ unop
|
|
+
|
|
+ ADD2 c08, t4, c08
|
|
+ unop
|
|
+ MUL a4, b1, t4
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b2, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ MUL a2, b2, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c03, t3, c03
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c04, t4, c04
|
|
+ unop
|
|
+ MUL a4, b2, t4
|
|
+ LD a5, 3 * SIZE(AO)
|
|
+
|
|
+ ADD4 c05, t1, c05
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ LD b2, -1 * SIZE(BO)
|
|
+
|
|
+ ADD2 c06, t2, c06
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ unop
|
|
+
|
|
+ ADD4 c07, t3, c07
|
|
+ unop
|
|
+ MUL a3, b3, t3
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+
|
|
+ ADD2 c08, t4, c08
|
|
+ unop
|
|
+ MUL a5, b3, t4
|
|
+ LD b3, 0 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b4, t1
|
|
+ LD a1, -4 * SIZE(AO)
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b4, t2
|
|
+ LD a2, -3 * SIZE(AO)
|
|
+
|
|
+ ADD1 c03, t3, c03
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+ MUL a3, b4, t3
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c04, t4, c04
|
|
+ MUL a5, b4, t4
|
|
+ LD b4, 1 * SIZE(BO)
|
|
+ bgt L, $L42
|
|
+ .align 4
|
|
+
|
|
+$L45:
|
|
+ ADD4 c05, t1, c05
|
|
+ fldd alpha_r, ALPHA_R
|
|
+ MUL b1, a1, t1
|
|
+#ifndef TRMMKERNEL
|
|
+ blbs K, $L48
|
|
+#else
|
|
+ blbs TMP1, $L48
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD2 c06, t2, c06
|
|
+ MUL a2, b1, t2
|
|
+ ADD4 c07, t3, c07
|
|
+ MUL a3, b1, t3
|
|
+
|
|
+ ADD2 c08, t4, c08
|
|
+ unop
|
|
+ MUL a4, b1, t4
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b2, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b2, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c03, t3, c03
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c04, t4, c04
|
|
+ MUL a4, b2, t4
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD4 c05, t1, c05
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L48:
|
|
+ ADD2 c06, t2, c06
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ fldd alpha_i, ALPHA_I
|
|
+
|
|
+ ADD4 c07, t3, c07
|
|
+ ldi I, -1(I)
|
|
+ MUL a3, b1, t3
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c09, 0 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD2 c08, t4, c08
|
|
+ unop
|
|
+ MUL a4, b1, t4
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c10, 1 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b2, t1
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c11, 2 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b2, t2
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c12, 3 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD1 c03, t3, c03
|
|
+ MUL a3, b2, t3
|
|
+ ADD3 c04, t4, c04
|
|
+ MUL a4, b2, t4
|
|
+
|
|
+ ADD4 c05, t1, c05
|
|
+ ADD2 c06, t2, c06
|
|
+ ADD4 c07, t3, c07
|
|
+ ADD2 c08, t4, c08
|
|
+
|
|
+ ADD c01, c06, c01
|
|
+ ADD c02, c05, c02
|
|
+ ADD c03, c08, c03
|
|
+ ADD c04, c07, c04
|
|
+
|
|
+ MUL alpha_r, c01, t1
|
|
+ MUL alpha_r, c02, t2
|
|
+ MUL alpha_r, c03, t3
|
|
+ MUL alpha_r, c04, t4
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c09, t1, c09
|
|
+ MUL alpha_i, c02, t1
|
|
+ ADD c10, t2, c10
|
|
+ MUL alpha_i, c01, t2
|
|
+
|
|
+ ADD c11, t3, c11
|
|
+ MUL alpha_i, c04, t3
|
|
+ ADD c12, t4, c12
|
|
+ MUL alpha_i, c03, t4
|
|
+#else
|
|
+ ADD $f31, t1, c09
|
|
+ MUL alpha_i, c02, t1
|
|
+ ADD $f31, t2, c10
|
|
+ MUL alpha_i, c01, t2
|
|
+
|
|
+ ADD $f31, t3, c11
|
|
+ MUL alpha_i, c04, t3
|
|
+ ADD $f31, t4, c12
|
|
+ MUL alpha_i, c03, t4
|
|
+#endif
|
|
+
|
|
+ SUB c09, t1, c09
|
|
+ ADD c10, t2, c10
|
|
+ SUB c11, t3, c11
|
|
+ ADD c12, t4, c12
|
|
+
|
|
+ ST c09, 0 * SIZE(C1)
|
|
+ ST c10, 1 * SIZE(C1)
|
|
+ ST c11, 2 * SIZE(C1)
|
|
+ ST c12, 3 * SIZE(C1)
|
|
+
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+
|
|
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+ subl K, KK, TMP1
|
|
+#ifdef LEFT
|
|
+ subl TMP1, 2, TMP1
|
|
+#else
|
|
+ subl TMP1, 1, TMP1
|
|
+#endif
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT + 0, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#if defined(TRMMKERNEL) && defined(LEFT)
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+ bgt I, $L41
|
|
+ .align 4
|
|
+
|
|
+$L50:
|
|
+ and M, 1, I
|
|
+ ble I, $L999
|
|
+
|
|
+#if !defined(TRMMKERNEL) || \
|
|
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
|
|
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
|
|
+
|
|
+#ifdef TRMMKERNEL
|
|
+#ifdef LEFT
|
|
+ addl KK, 1, TMP1
|
|
+#else
|
|
+ addl KK, 1, TMP1
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c05
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c06
|
|
+
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ldi L, -2(K)
|
|
+#else
|
|
+ ldi L, -2(TMP1)
|
|
+#endif
|
|
+ ble L, $L55
|
|
+#else
|
|
+ sll KK, ZBASE_SHIFT + 0, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ addl B, TMP1, BO
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c06
|
|
+
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+ ble L, $L55
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L52:
|
|
+ ADD1 c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, c05
|
|
+ ldi L, -2(L)
|
|
+ MUL a1, b2, t3
|
|
+ LD a1, -2 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, c06
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD a2, -1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ LD b2, 3 * SIZE(BO)
|
|
+ MUL a3, b3, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL a4, b3, t2
|
|
+ LD b3, 0 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, c05
|
|
+ unop
|
|
+ MUL a3, b4, t3
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, c06
|
|
+ MUL a4, b4, t4
|
|
+ LD b4, 1 * SIZE(BO)
|
|
+ unop
|
|
+
|
|
+ LD a4, 1 * SIZE(AO)
|
|
+ unop
|
|
+ unop
|
|
+ bgt L, $L52
|
|
+ .align 4
|
|
+
|
|
+$L55:
|
|
+ ADD1 c01, t1, c01
|
|
+ fldd alpha_r, ALPHA_R
|
|
+ MUL a1, b1, t1
|
|
+#ifndef TRMMKERNEL
|
|
+ blbs K, $L58
|
|
+#else
|
|
+ blbs TMP1, $L58
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, c05
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ MUL a1, b2, t3
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, c06
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ LD b2, -1 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ .align 4
|
|
+
|
|
+$L58:
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ fldd alpha_i, ALPHA_I
|
|
+
|
|
+ ADD4 c05, t3, c05
|
|
+ unop
|
|
+ MUL a1, b2, t3
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c03, 0 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD2 c06, t4, c06
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+#ifndef TRMMKERNEL
|
|
+ LD c04, 1 * SIZE(C1)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ ADD3 c02, t2, c02
|
|
+ ADD4 c05, t3, c05
|
|
+ ADD2 c06, t4, c06
|
|
+
|
|
+ ADD c01, c06, c01
|
|
+ ADD c02, c05, c02
|
|
+
|
|
+ MUL alpha_r, c01, t1
|
|
+ MUL alpha_r, c02, t2
|
|
+ MUL alpha_i, c02, t3
|
|
+ MUL alpha_i, c01, t4
|
|
+
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD c03, t1, c03
|
|
+ ADD c04, t2, c04
|
|
+#else
|
|
+ ADD $f31, t1, c03
|
|
+ ADD $f31, t2, c04
|
|
+#endif
|
|
+
|
|
+ SUB c03, t3, c03
|
|
+ ADD c04, t4, c04
|
|
+
|
|
+ ST c03, 0 * SIZE(C1)
|
|
+ ST c04, 1 * SIZE(C1)
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 8($sp)
|
|
+ fldd $f4, 16($sp)
|
|
+ fldd $f5, 24($sp)
|
|
+ fldd $f6, 32($sp)
|
|
+ fldd $f7, 40($sp)
|
|
+ fldd $f8, 48($sp)
|
|
+ fldd $f9, 56($sp)
|
|
+ clr $0
|
|
+ ldi $sp, STACKSIZE($sp)
|
|
+ ret
|
|
+ .ident VERSION
|
|
+ .end CNAME
|
|
diff --git a/kernel/sw_64/zgemm_kernel_simd_8x2.S b/kernel/sw_64/zgemm_kernel_simd_8x2.S
|
|
new file mode 100644
|
|
index 0000000..f6a36fb
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/zgemm_kernel_simd_8x2.S
|
|
@@ -0,0 +1,3189 @@
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#if !defined(SW2B)
|
|
+#error "Architecture is not specified."
|
|
+#endif
|
|
+
|
|
+#define STACKSIZE 128
|
|
+
|
|
+#define M $16
|
|
+#define N $17
|
|
+#define K $18
|
|
+#define A $21
|
|
+#define B $22
|
|
+#define C $20
|
|
+#define LDC $23
|
|
+
|
|
+#define C1 $19
|
|
+#define C2 $24
|
|
+
|
|
+#define PREA $10
|
|
+#define PREB $11
|
|
+
|
|
+#define AO $9
|
|
+#define BO $5
|
|
+#define I $6
|
|
+#define J $7
|
|
+#define L $8
|
|
+
|
|
+#define a1 $f16
|
|
+#define a2 $f17
|
|
+#define a3 $f18
|
|
+#define a4 $f19
|
|
+
|
|
+#define b1 $f20
|
|
+#define b2 $f21
|
|
+#define b3 $f22
|
|
+#define b4 $f23
|
|
+
|
|
+#define a5 $f16
|
|
+#define a6 $f24
|
|
+#define a7 $f25
|
|
+#define a8 $f26
|
|
+
|
|
+#define b5 $f27
|
|
+#define b6 $f28
|
|
+#define b7 $f29
|
|
+#define b8 $f30
|
|
+
|
|
+#define alpha_i $f29
|
|
+#define alpha_r $f30
|
|
+
|
|
+#define c01 $f0
|
|
+#define c02 $f1
|
|
+#define c03 $f2
|
|
+#define c04 $f3
|
|
+
|
|
+#define c05 $f4
|
|
+#define c06 $f5
|
|
+#define c07 $f6
|
|
+#define c08 $f7
|
|
+
|
|
+#define c09 $f8
|
|
+#define c10 $f9
|
|
+#define c11 $f10
|
|
+#define c12 $f11
|
|
+
|
|
+#define c13 $f12
|
|
+#define c14 $f13
|
|
+#define c15 $f14
|
|
+#define c16 $f15
|
|
+
|
|
+#define TMP1 $0
|
|
+#define TEMP $1
|
|
+#define KK $2
|
|
+#define BB $3
|
|
+#define OFFSET $4
|
|
+
|
|
+#define ALPHA_R 64($sp)
|
|
+#define ALPHA_I 72($sp)
|
|
+
|
|
+/*
|
|
+ *===================
|
|
+ * (a+bi)*(c+di)
|
|
+ * ADD1 ac '+' bd
|
|
+ * ADD2 ad '+' bc
|
|
+ * FMAD5 a*alpha_r + real part
|
|
+ * FMAD6 a*alpha_i + imaginary part
|
|
+ * FMAD7 b*alpha_r + imaginary part
|
|
+ * FMAD8 b*alpha_i + real part
|
|
+
|
|
+ *===================
|
|
+ */
|
|
+
|
|
+/*
|
|
+ *===================
|
|
+ * (a+bi) * (c+di)
|
|
+ * (a+bi) * (alpha_r+alpha_i)
|
|
+ *===================
|
|
+ */
|
|
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
|
+#define ADD1 SUB
|
|
+#define ADD2 ADD
|
|
+#define FMAD5 MAD
|
|
+#define FMAD6 MAD
|
|
+#define FMAD7 MAD
|
|
+#define FMAD8 NMAD
|
|
+#endif
|
|
+
|
|
+/*
|
|
+ *===================
|
|
+ * (a-bi) * (c+di)
|
|
+ * (a+bi) * (alpha_r+alpha_i)
|
|
+ *===================
|
|
+ */
|
|
+
|
|
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
|
+#define ADD1 ADD
|
|
+#define ADD2 SUB
|
|
+#define FMAD5 MAD
|
|
+#define FMAD6 MAD
|
|
+#define FMAD7 MAD
|
|
+#define FMAD8 NMAD
|
|
+#endif
|
|
+
|
|
+/*
|
|
+ *===================
|
|
+ * (a+bi) * (c-di)
|
|
+ * (a-bi) * (alpha_r+alpha_i)
|
|
+ *===================
|
|
+ */
|
|
+
|
|
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
|
+#define ADD1 ADD
|
|
+#define ADD2 SUB
|
|
+#define FMAD5 MAD
|
|
+#define FMAD6 MAD
|
|
+#define FMAD7 NMAD
|
|
+#define FMAD8 MAD
|
|
+#endif
|
|
+
|
|
+/*
|
|
+ *===================
|
|
+ * (a-bi) * (c-di)
|
|
+ * (a-bi) * (alpha_r+alpha_i)
|
|
+ *===================
|
|
+ */
|
|
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
|
+#define ADD1 SUB
|
|
+#define ADD2 ADD
|
|
+#define FMAD5 MAD
|
|
+#define FMAD6 MAD
|
|
+#define FMAD7 NMAD
|
|
+#define FMAD8 MAD
|
|
+#endif
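+
+/* A minimal sketch (plain C, illustrative names only, not used by this kernel) of the
+ * per-element update the macros above encode in the NN case, C += alpha*(a+bi)*(c+di);
+ * it mirrors the FMAD5/FMAD7/FMAD8/FMAD6 sequence in the $L18 write-back below:
+ *
+ *   void zupdate_nn(double *c_re, double *c_im,
+ *                   double ac, double bc, double ad, double bd,
+ *                   double alpha_r, double alpha_i)
+ *   {
+ *       double re = ac - bd;                 // ADD1 (SUB here): real part of the product
+ *       double im = bc + ad;                 // ADD2 (ADD here): imaginary part
+ *       double t_re = re * alpha_r + *c_re;  // FMAD5
+ *       double t_im = im * alpha_r + *c_im;  // FMAD7
+ *       *c_re = t_re - im * alpha_i;         // FMAD8 (negated multiply-add)
+ *       *c_im = t_im + re * alpha_i;         // FMAD6
+ *   }
+ *
+ * The conjugated #if branches above only flip the corresponding signs.
+ */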
|
|
+
|
|
+
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+
|
|
+ .frame $30, STACKSIZE, $26, 0
|
|
+ ldi $sp, -STACKSIZE($sp)
|
|
+
|
|
+
|
|
+ ldl B, 0 + STACKSIZE($sp)
|
|
+ ldl C, 8 + STACKSIZE($sp)
|
|
+ ldl LDC, 16 + STACKSIZE($sp)
|
|
+#ifdef TRMMKERNEL
|
|
+ ldl OFFSET, 24 + STACKSIZE($sp)
|
|
+#endif
|
|
+
|
|
+ sll LDC, ZBASE_SHIFT, LDC # LDC*sizebyte
|
|
+
|
|
+ fstd $f2, 0($sp)
|
|
+ fstd $f3, 8($sp)
|
|
+ fstd $f4, 16($sp)
|
|
+ fstd $f5, 24($sp)
|
|
+ fstd $f6, 32($sp)
|
|
+ fstd $f7, 40($sp)
|
|
+ fstd $f8, 48($sp)
|
|
+ fstd $f9, 56($sp)
|
|
+ ST $f19, ALPHA_R
|
|
+ ST $f20, ALPHA_I
|
|
+
|
|
+ stl $9, 80($sp) # Integer Saved Register
|
|
+ stl $10,88($sp)
|
|
+ stl $11,96($sp)
|
|
+ stl $12,104($sp)
|
|
+ stl $13,112($sp)
|
|
+ stl $14,120($sp)
|
|
+
|
|
+
|
|
+ cmple M, 0, $0
|
|
+ cmple N, 0, $1
|
|
+ cmple K, 0, $2
|
|
+
|
|
+ or $0, $1, $0
|
|
+ or $0, $2, $0
|
|
+ bne $0, $L999
|
|
+
|
|
+#if defined(TRMMKERNEL) && !defined(LEFT)
|
|
+ subl $31, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+ sra N, 1, J # J=N/2
|
|
+ ble J, $L50
|
|
+ .align 4
|
|
+
|
|
+$L01:
|
|
+#if defined(TRMMKERNEL) && defined(LEFT)
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+ sra M, 3, I # I=M/8
|
|
+ sll K, ZBASE_SHIFT, PREB
|
|
+
|
|
+ sll K, 2+ZBASE_SHIFT, PREA
|
|
+ mov C, C1
|
|
+
|
|
+ addl C, LDC, C2
|
|
+ mov A, AO # Reset A
|
|
+
|
|
+ addl PREB, B, PREB
|
|
+ addl C2, LDC, C # Change C to next panel
|
|
+
|
|
+ addl PREA, A, PREA
|
|
+ beq I, $L20 # GEMM_MR=8
|
|
+
|
|
+$L11:
|
|
+#if defined(TRMMKERNEL)
|
|
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
|
+ mov B, BO # LL && RU reset B
|
|
+ nop
|
|
+#else
|
|
+ sll KK, 3 + ZBASE_SHIFT, L # KK*8mr
|
|
+ sll KK, 1 + ZBASE_SHIFT, TEMP # KK*2nr
|
|
+
|
|
+	addl AO, L, AO		# move AO to point to the data part
|
|
+	addl B, TEMP, BO	# move BO to point to the data part
|
|
+#endif
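+/* Sketch of the equivalent pointer adjustment in plain C (names illustrative; for
+ * double complex, ZBASE_SHIFT is log2 of the 16-byte element size):
+ *
+ *   double *ao = a + (size_t)kk * 8 * 2;   // sll KK, 3 + ZBASE_SHIFT ; addl AO, L, AO
+ *   double *bo = b + (size_t)kk * 2 * 2;   // sll KK, 1 + ZBASE_SHIFT ; addl B, TEMP, BO
+ *
+ * i.e. AO skips KK packed 8-wide complex columns of A and BO skips KK packed 2-wide
+ * complex columns of B before the multiply loop starts.
+ */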
|
|
+
|
|
+ vcpys $f31,$f31,c01 # Clear result regs
|
|
+ fillcs 0(C1)
|
|
+ fillcs 4*SIZE(C1)
|
|
+
|
|
+ vcpys $f31,$f31,c02
|
|
+ fillcs 8*SIZE(C1)
|
|
+ fillcs 12*SIZE(C1)
|
|
+
|
|
+ vcpys $f31,$f31,c03
|
|
+ fillcs 0(C2)
|
|
+ fillcs 4*SIZE(C2)
|
|
+
|
|
+ vcpys $f31,$f31,c04
|
|
+ fillcs 8*SIZE(C2)
|
|
+ fillcs 12*SIZE(C2)
|
|
+
|
|
+ vcpys $f31,$f31,c05
|
|
+ vcpys $f31,$f31,c06
|
|
+ vcpys $f31,$f31,c07
|
|
+ vcpys $f31,$f31,c08
|
|
+
|
|
+ vcpys $f31,$f31,c09
|
|
+ LDDE b1, 0 * SIZE(BO) # B1R
|
|
+ LDDE b2, 1 * SIZE(BO) # B1I
|
|
+
|
|
+ vcpys $f31,$f31,c10
|
|
+ VLD a1, 0 * SIZE(AO) # A1, A2
|
|
+ VLD a2, 4 * SIZE(AO) # A3, A4
|
|
+
|
|
+ vcpys $f31,$f31,c11
|
|
+ LDDE b3, 2 * SIZE(BO) # B2R
|
|
+ LDDE b4, 3 * SIZE(BO) # B2I
|
|
+
|
|
+ vcpys $f31,$f31,c12
|
|
+ VLD a3, 8 * SIZE(AO) # A5, A6
|
|
+ VLD a4,12 * SIZE(AO) # A7, A8
|
|
+
|
|
+ vcpys $f31,$f31,c13
|
|
+ vcpys $f31,$f31,c14
|
|
+ vcpys $f31,$f31,c15
|
|
+ vcpys $f31,$f31,c16
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+#if (defined(LEFT) && !defined(TRANSA)) \
|
|
+ ||(!defined(LEFT) && defined(TRANSA))
|
|
+ subl K, KK, TEMP # temp is the length of data part
|
|
+#elif defined(LEFT)
|
|
+	addl KK, 8, TEMP	# mr=8 (elements are counted as complex)
|
|
+#else
|
|
+ addl KK, 2, TEMP # nr=2
|
|
+#endif
|
|
+ sra TEMP, 1, L # L=TEMP/2
|
|
+ ble L, $L15
|
|
+
|
|
+#else
|
|
+ vcpys $f31,$f31,c01 # Clear result regs
|
|
+ mov B, BO # Set B, (block A x panel Bj)
|
|
+ sra K, 1, L # Unroll K as 2
|
|
+
|
|
+ vcpys $f31,$f31,c02
|
|
+ fillcs 0(C1)
|
|
+ fillcs 4*SIZE(C1)
|
|
+
|
|
+ vcpys $f31,$f31,c03
|
|
+ fillcs 8*SIZE(C1)
|
|
+ fillcs 12*SIZE(C1)
|
|
+
|
|
+ vcpys $f31,$f31,c04
|
|
+ fillcs 0(C2)
|
|
+ fillcs 4*SIZE(C2)
|
|
+
|
|
+ vcpys $f31,$f31,c05
|
|
+ fillcs 8*SIZE(C2)
|
|
+ fillcs 12*SIZE(C2)
|
|
+
|
|
+ vcpys $f31,$f31,c06
|
|
+ vcpys $f31,$f31,c07
|
|
+ vcpys $f31,$f31,c08
|
|
+ vcpys $f31,$f31,c09
|
|
+
|
|
+ vcpys $f31,$f31,c10
|
|
+ LDDE b1, 0 * SIZE(BO) # B1R
|
|
+ LDDE b2, 1 * SIZE(BO) # B1I
|
|
+
|
|
+ vcpys $f31,$f31,c11
|
|
+ VLD a1, 0 * SIZE(AO) # A1, A2
|
|
+ VLD a2, 4 * SIZE(AO) # A3, A4
|
|
+
|
|
+ vcpys $f31,$f31,c12
|
|
+ LDDE b3, 2 * SIZE(BO) # B2R
|
|
+ LDDE b4, 3 * SIZE(BO) # B2I
|
|
+
|
|
+ vcpys $f31,$f31,c13
|
|
+ VLD a3, 8 * SIZE(AO) # A5, A6
|
|
+ VLD a4,12 * SIZE(AO) # A7, A8
|
|
+
|
|
+ vcpys $f31,$f31,c14
|
|
+ vcpys $f31,$f31,c15
|
|
+
|
|
+ vcpys $f31,$f31,c16
|
|
+ ble L, $L15
|
|
+#endif
|
|
+
|
|
+ .align 4
|
|
+$L12:
|
|
+	addl AO, 16*SIZE, AO	# AO+=8mr*1kr*2cpx*SIZE
|
|
+ VMAD a1,b1,c01,c01 # C11(ac,bc), C21(ac,bc)
|
|
+ LDDE b5, 4 * SIZE(BO) # next B1R
|
|
+
|
|
+ VMAD a1,b2,c02,c02 # C11(ad,bd), C21(ad,bd)
|
|
+ LDDE b6, 5 * SIZE(BO) # next B1I
|
|
+
|
|
+ VMAD a2,b1,c05,c05 # C31, C41
|
|
+ VLD a8,12 * SIZE(AO) # next A7, A8
|
|
+
|
|
+ VMAD a2,b2,c06,c06 # C31, C41
|
|
+ VLD a7, 8 * SIZE(AO) # next A5, A6
|
|
+
|
|
+ VMAD a1,b3,c03,c03 # C12(ac,bc), C22(ac,bc)
|
|
+ VMAD a1,b4,c04,c04 # C12(ad,bd), C22(ad,bd)
|
|
+ VMAD a3,b1,c09,c09 # C51, C61
|
|
+ VMAD a3,b2,c10,c10 # C51, C61
|
|
+
|
|
+
|
|
+ VMAD a2,b3,c07,c07 # C32, C42
|
|
+ LDDE b7, 6 * SIZE(BO) # next B2R
|
|
+
|
|
+ VMAD a2,b4,c08,c08 # C32, C42
|
|
+ LDDE b8, 7 * SIZE(BO) # next B2I
|
|
+
|
|
+ VMAD a4,b1,c13,c13 # C71, C81
|
|
+	VLD a5, 0 * SIZE(AO)	# next A1, A2 (a5 aliases a1, both $f16)
|
|
+
|
|
+ VMAD a4,b2,c14,c14 # C71, C81
|
|
+ VLD a6, 4 * SIZE(AO) # next A3, A4
|
|
+ addl BO, 8*SIZE, BO # BO+=2nr*2kr*2cpx*SIZE
|
|
+
|
|
+
|
|
+ VMAD a3,b3,c11,c11 # C52, C62
|
|
+ fillcs 0(PREB)
|
|
+
|
|
+ VMAD a3,b4,c12,c12 # C52, C62
|
|
+ fillcs 0(PREA)
|
|
+
|
|
+ VMAD a4,b3,c15,c15 # C72, C82
|
|
+ fillcs 8*SIZE(PREA)
|
|
+
|
|
+ VMAD a4,b4,c16,c16 # C72, C82
|
|
+ subl L, 1, L #
|
|
+	addl AO, 16*SIZE, AO	# AO+=8mr*1kr*2cpx*SIZE
|
|
+
|
|
+ VMAD a8,b5,c13,c13
|
|
+ LDDE b1, 0 * SIZE(BO)
|
|
+
|
|
+ VMAD a8,b6,c14,c14
|
|
+ LDDE b2, 1 * SIZE(BO)
|
|
+
|
|
+ VMAD a7,b5,c09,c09
|
|
+ addl PREA, 16*SIZE, PREA
|
|
+ VLD a4,12 * SIZE(AO)
|
|
+
|
|
+ VMAD a7,b6,c10,c10
|
|
+ VLD a3, 8 * SIZE(AO)
|
|
+
|
|
+ VMAD a5,b5,c01,c01
|
|
+ VMAD a5,b6,c02,c02
|
|
+ VMAD a5,b7,c03,c03
|
|
+ VMAD a5,b8,c04,c04
|
|
+
|
|
+ VMAD a8,b7,c15,c15
|
|
+ LDDE b3, 2 * SIZE(BO)
|
|
+
|
|
+ VMAD a8,b8,c16,c16
|
|
+ LDDE b4, 3 * SIZE(BO)
|
|
+
|
|
+ VMAD a6,b5,c05,c05
|
|
+ VLD a1, 0 * SIZE(AO)
|
|
+
|
|
+ VMAD a6,b6,c06,c06
|
|
+ VLD a2, 4 * SIZE(AO)
|
|
+
|
|
+
|
|
+ VMAD a7,b7,c11,c11
|
|
+ fillcs 4*SIZE(PREB)
|
|
+
|
|
+ VMAD a7,b8,c12,c12
|
|
+ fillcs 0(PREA)
|
|
+
|
|
+ VMAD a6,b7,c07,c07
|
|
+ addl PREB, 8*SIZE, PREB
|
|
+ fillcs 8*SIZE(PREA)
|
|
+
|
|
+ VMAD a6,b8,c08,c08
|
|
+ addl PREA, 16*SIZE, PREA
|
|
+ bne L, $L12 # continue K
|
|
+
|
|
+$L15:
|
|
+ LD alpha_r, ALPHA_R # $f30==b8
|
|
+#ifndef TRMMKERNEL
|
|
+ blbc K, $L18 # if(K&1)
|
|
+#else
|
|
+ blbc TEMP, $L18
|
|
+#endif
|
|
+
|
|
+$L16:
|
|
+ VMAD a1,b1,c01,c01 # C11R C21R
|
|
+	addl AO, 16*SIZE, AO	# AO+=8mr*1kr*2cpx*SIZE
|
|
+
|
|
+ VMAD a1,b2,c02,c02 # C11I C21I
|
|
+ addl BO, 4*SIZE, BO
|
|
+
|
|
+ VMAD a1,b3,c03,c03 # C12R c22R
|
|
+ VMAD a1,b4,c04,c04 # C12I C22I
|
|
+
|
|
+ VMAD a2,b1,c05,c05 # C31R C41R
|
|
+ VMAD a2,b2,c06,c06 # C31I C41I
|
|
+ VMAD a2,b3,c07,c07 # C32R C42R
|
|
+ VMAD a2,b4,c08,c08 # C32I C42I
|
|
+
|
|
+ VMAD a3,b1,c09,c09 # C51R C61R
|
|
+ VMAD a3,b2,c10,c10 # C51I C61I
|
|
+ VMAD a3,b3,c11,c11 # C52R C62R
|
|
+ VMAD a3,b4,c12,c12 # C52I C62I
|
|
+
|
|
+ VMAD a4,b1,c13,c13 # C71R C81R
|
|
+ VMAD a4,b2,c14,c14 # C71I C81I
|
|
+ VMAD a4,b3,c15,c15 # C72R C82R
|
|
+ VMAD a4,b4,c16,c16 # C72I C82I
|
|
+
|
|
+$L18: # Write back
|
|
+ LD alpha_i, ALPHA_I # $f29==b7
|
|
+#ifndef TRMMKERNEL
|
|
+ vextf c01, 0, a1 # a1=C11R_ac
|
|
+ vextf c01, 1, a2 # a2=C11I_bc
|
|
+ vextf c01, 2, a3 # a3=C21R_ac
|
|
+ vextf c01, 3, a4 # a4=C21I_bc
|
|
+
|
|
+ vextf c02, 0, b1 # b1=C11I_ad
|
|
+ vextf c02, 1, b2 # b2=C11R_bd
|
|
+ vextf c02, 2, b3 # b3=C21I_ad
|
|
+ vextf c02, 3, b4 # b4=C21R_bd
|
|
+
|
|
+ ADD1 a1, b2, b5 # ac '+' bd
|
|
+ ADD1 a3, b4, a6
|
|
+ ADD2 a2, b1, a7 # bc '+' ad
|
|
+ ADD2 a4, b3, a8
|
|
+
|
|
+ LD a1, 0 * SIZE(C1)
|
|
+ LD a2, 1 * SIZE(C1)
|
|
+ LD a3, 2 * SIZE(C1)
|
|
+ LD a4, 3 * SIZE(C1)
|
|
+
|
|
+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1
|
|
+ FMAD5 a6, alpha_r, a3, a3
|
|
+ FMAD7 a7, alpha_r, a2, a2
|
|
+ FMAD7 a8, alpha_r, a4, a4
|
|
+
|
|
+ FMAD8 a7, alpha_i, a1, b4
|
|
+ FMAD8 a8, alpha_i, a3, b6
|
|
+ FMAD6 b5, alpha_i, a2, c01
|
|
+ FMAD6 a6, alpha_i, a4, c02
|
|
+
|
|
+ ST b4, 0 * SIZE(C1)
|
|
+ ST c01, 1 * SIZE(C1)
|
|
+ ST b6, 2 * SIZE(C1)
|
|
+ ST c02, 3 * SIZE(C1)
|
|
+
|
|
+ vextf c05, 0, a1 # a1=C11R_ac
|
|
+ vextf c05, 1, a2 # a2=C11I_bc
|
|
+ vextf c05, 2, a3 # a3=C21R_ac
|
|
+ vextf c05, 3, a4 # a4=C21I_bc
|
|
+
|
|
+ vextf c06, 0, b1 # b1=C11I_ad
|
|
+ vextf c06, 1, b2 # b2=C11R_bd
|
|
+ vextf c06, 2, b3 # b3=C21I_ad
|
|
+ vextf c06, 3, b4 # b4=C21R_bd
|
|
+
|
|
+ ADD1 a1, b2, b5 # ac '+' bd
|
|
+ ADD1 a3, b4, a6
|
|
+ ADD2 a2, b1, a7 # bc '+' ad
|
|
+ ADD2 a4, b3, a8
|
|
+
|
|
+ LD a1, 4 * SIZE(C1)
|
|
+ LD a2, 5 * SIZE(C1)
|
|
+ LD a3, 6 * SIZE(C1)
|
|
+ LD a4, 7 * SIZE(C1)
|
|
+
|
|
+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1
|
|
+ FMAD5 a6, alpha_r, a3, a3
|
|
+ FMAD7 a7, alpha_r, a2, a2
|
|
+ FMAD7 a8, alpha_r, a4, a4
|
|
+
|
|
+ FMAD8 a7, alpha_i, a1, b4
|
|
+ FMAD8 a8, alpha_i, a3, b6
|
|
+ FMAD6 b5, alpha_i, a2, c01
|
|
+ FMAD6 a6, alpha_i, a4, c02
|
|
+
|
|
+ ST b4, 4 * SIZE(C1)
|
|
+ ST c01, 5 * SIZE(C1)
|
|
+ ST b6, 6 * SIZE(C1)
|
|
+ ST c02, 7 * SIZE(C1)
|
|
+
|
|
+ vextf c09, 0, a1 # a1=C11R_ac
|
|
+ vextf c09, 1, a2 # a2=C11I_bc
|
|
+ vextf c09, 2, a3 # a3=C21R_ac
|
|
+ vextf c09, 3, a4 # a4=C21I_bc
|
|
+
|
|
+ vextf c10, 0, b1 # b1=C11I_ad
|
|
+ vextf c10, 1, b2 # b2=C11R_bd
|
|
+ vextf c10, 2, b3 # b3=C21I_ad
|
|
+ vextf c10, 3, b4 # b4=C21R_bd
|
|
+
|
|
+ ADD1 a1, b2, b5 # ac '+' bd
|
|
+ ADD1 a3, b4, a6
|
|
+ ADD2 a2, b1, a7 # bc '+' ad
|
|
+ ADD2 a4, b3, a8
|
|
+
|
|
+ LD a1, 8 * SIZE(C1)
|
|
+ LD a2, 9 * SIZE(C1)
|
|
+ LD a3, 10 * SIZE(C1)
|
|
+ LD a4, 11 * SIZE(C1)
|
|
+
|
|
+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1
|
|
+ FMAD5 a6, alpha_r, a3, a3
|
|
+ FMAD7 a7, alpha_r, a2, a2
|
|
+ FMAD7 a8, alpha_r, a4, a4
|
|
+
|
|
+ FMAD8 a7, alpha_i, a1, b4
|
|
+ FMAD8 a8, alpha_i, a3, b6
|
|
+ FMAD6 b5, alpha_i, a2, c01
|
|
+ FMAD6 a6, alpha_i, a4, c02
|
|
+
|
|
+ ST b4, 8 * SIZE(C1)
|
|
+ ST c01, 9 * SIZE(C1)
|
|
+ ST b6, 10 * SIZE(C1)
|
|
+ ST c02, 11 * SIZE(C1)
|
|
+
|
|
+ vextf c13, 0, a1 # a1=C11R_ac
|
|
+ vextf c13, 1, a2 # a2=C11I_bc
|
|
+ vextf c13, 2, a3 # a3=C21R_ac
|
|
+ vextf c13, 3, a4 # a4=C21I_bc
|
|
+
|
|
+ vextf c14, 0, b1 # b1=C11I_ad
|
|
+ vextf c14, 1, b2 # b2=C11R_bd
|
|
+ vextf c14, 2, b3 # b3=C21I_ad
|
|
+ vextf c14, 3, b4 # b4=C21R_bd
|
|
+
|
|
+ ADD1 a1, b2, b5 # ac '+' bd
|
|
+ ADD1 a3, b4, a6
|
|
+ ADD2 a2, b1, a7 # bc '+' ad
|
|
+ ADD2 a4, b3, a8
|
|
+
|
|
+ LD a1, 12 * SIZE(C1)
|
|
+ LD a2, 13 * SIZE(C1)
|
|
+ LD a3, 14 * SIZE(C1)
|
|
+ LD a4, 15 * SIZE(C1)
|
|
+
|
|
+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1
|
|
+ FMAD5 a6, alpha_r, a3, a3
|
|
+ FMAD7 a7, alpha_r, a2, a2
|
|
+ FMAD7 a8, alpha_r, a4, a4
|
|
+
|
|
+ FMAD8 a7, alpha_i, a1, b4
|
|
+ FMAD8 a8, alpha_i, a3, b6
|
|
+ FMAD6 b5, alpha_i, a2, c01
|
|
+ FMAD6 a6, alpha_i, a4, c02
|
|
+
|
|
+ ST b4, 12 * SIZE(C1)
|
|
+ ST c01, 13 * SIZE(C1)
|
|
+ ST b6, 14 * SIZE(C1)
|
|
+ ST c02, 15 * SIZE(C1)
|
|
+
|
|
+
|
|
+ vextf c03, 0, a1 # a1=C11R_ac
|
|
+ vextf c03, 1, a2 # a2=C11I_bc
|
|
+ vextf c03, 2, a3 # a3=C21R_ac
|
|
+ vextf c03, 3, a4 # a4=C21I_bc
|
|
+
|
|
+ vextf c04, 0, b1 # b1=C11I_ad
|
|
+ vextf c04, 1, b2 # b2=C11R_bd
|
|
+ vextf c04, 2, b3 # b3=C21I_ad
|
|
+ vextf c04, 3, b4 # b4=C21R_bd
|
|
+
|
|
+ ADD1 a1, b2, b5 # ac '+' bd
|
|
+ ADD1 a3, b4, a6
|
|
+ ADD2 a2, b1, a7 # bc '+' ad
|
|
+ ADD2 a4, b3, a8
|
|
+
|
|
+ LD a1, 0 * SIZE(C2)
|
|
+ LD a2, 1 * SIZE(C2)
|
|
+ LD a3, 2 * SIZE(C2)
|
|
+ LD a4, 3 * SIZE(C2)
|
|
+
|
|
+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1
|
|
+ FMAD5 a6, alpha_r, a3, a3
|
|
+ FMAD7 a7, alpha_r, a2, a2
|
|
+ FMAD7 a8, alpha_r, a4, a4
|
|
+
|
|
+ FMAD8 a7, alpha_i, a1, c01
|
|
+ FMAD8 a8, alpha_i, a3, c05
|
|
+ FMAD6 b5, alpha_i, a2, c02
|
|
+ FMAD6 a6, alpha_i, a4, c06
|
|
+
|
|
+ ST c01, 0 * SIZE(C2)
|
|
+ ST c02, 1 * SIZE(C2)
|
|
+ ST c05, 2 * SIZE(C2)
|
|
+ ST c06, 3 * SIZE(C2)
|
|
+
|
|
+ vextf c07, 0, a1 # a1=C11R_ac
|
|
+ vextf c07, 1, a2 # a2=C11I_bc
|
|
+ vextf c07, 2, a3 # a3=C21R_ac
|
|
+ vextf c07, 3, a4 # a4=C21I_bc
|
|
+
|
|
+ vextf c08, 0, b1 # b1=C11I_ad
|
|
+ vextf c08, 1, b2 # b2=C11R_bd
|
|
+ vextf c08, 2, b3 # b3=C21I_ad
|
|
+ vextf c08, 3, b4 # b4=C21R_bd
|
|
+
|
|
+ ADD1 a1, b2, b5 # ac '+' bd
|
|
+ ADD1 a3, b4, a6
|
|
+ ADD2 a2, b1, a7 # bc '+' ad
|
|
+ ADD2 a4, b3, a8
|
|
+
|
|
+ LD a1, 4 * SIZE(C2)
|
|
+ LD a2, 5 * SIZE(C2)
|
|
+ LD a3, 6 * SIZE(C2)
|
|
+ LD a4, 7 * SIZE(C2)
|
|
+
|
|
+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1
|
|
+ FMAD5 a6, alpha_r, a3, a3
|
|
+ FMAD7 a7, alpha_r, a2, a2
|
|
+ FMAD7 a8, alpha_r, a4, a4
|
|
+
|
|
+ FMAD8 a7, alpha_i, a1, c01
|
|
+ FMAD8 a8, alpha_i, a3, c05
|
|
+ FMAD6 b5, alpha_i, a2, c02
|
|
+ FMAD6 a6, alpha_i, a4, c06
|
|
+
|
|
+ ST c01, 4 * SIZE(C2)
|
|
+ ST c02, 5 * SIZE(C2)
|
|
+ ST c05, 6 * SIZE(C2)
|
|
+ ST c06, 7 * SIZE(C2)
|
|
+
|
|
+ vextf c11, 0, a1 # a1=C11R_ac
|
|
+ vextf c11, 1, a2 # a2=C11I_bc
|
|
+ vextf c11, 2, a3 # a3=C21R_ac
|
|
+ vextf c11, 3, a4 # a4=C21I_bc
|
|
+
|
|
+ vextf c12, 0, b1 # b1=C11I_ad
|
|
+ vextf c12, 1, b2 # b2=C11R_bd
|
|
+ vextf c12, 2, b3 # b3=C21I_ad
|
|
+ vextf c12, 3, b4 # b4=C21R_bd
|
|
+
|
|
+ ADD1 a1, b2, b5 # ac '+' bd
|
|
+ ADD1 a3, b4, a6
|
|
+ ADD2 a2, b1, a7 # bc '+' ad
|
|
+ ADD2 a4, b3, a8
|
|
+
|
|
+ LD a1, 8 * SIZE(C2)
|
|
+ LD a2, 9 * SIZE(C2)
|
|
+ LD a3, 10 * SIZE(C2)
|
|
+ LD a4, 11 * SIZE(C2)
|
|
+
|
|
+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1
|
|
+ FMAD5 a6, alpha_r, a3, a3
|
|
+ FMAD7 a7, alpha_r, a2, a2
|
|
+ FMAD7 a8, alpha_r, a4, a4
|
|
+
|
|
+ FMAD8 a7, alpha_i, a1, c01
|
|
+ FMAD8 a8, alpha_i, a3, c05
|
|
+ FMAD6 b5, alpha_i, a2, c02
|
|
+ FMAD6 a6, alpha_i, a4, c06
|
|
+
|
|
+ ST c01, 8 * SIZE(C2)
|
|
+ ST c02, 9 * SIZE(C2)
|
|
+ ST c05, 10 * SIZE(C2)
|
|
+ ST c06, 11 * SIZE(C2)
|
|
+
|
|
+ vextf c15, 0, a1 # a1=C11R_ac
|
|
+ vextf c15, 1, a2 # a2=C11I_bc
|
|
+ vextf c15, 2, a3 # a3=C21R_ac
|
|
+ vextf c15, 3, a4 # a4=C21I_bc
|
|
+
|
|
+ vextf c16, 0, b1 # b1=C11I_ad
|
|
+ vextf c16, 1, b2 # b2=C11R_bd
|
|
+ vextf c16, 2, b3 # b3=C21I_ad
|
|
+ vextf c16, 3, b4 # b4=C21R_bd
|
|
+
|
|
+ ADD1 a1, b2, b5 # ac '+' bd
|
|
+ ADD1 a3, b4, a6
|
|
+ ADD2 a2, b1, a7 # bc '+' ad
|
|
+ ADD2 a4, b3, a8
|
|
+
|
|
+ LD a1, 12 * SIZE(C2)
|
|
+ LD a2, 13 * SIZE(C2)
|
|
+ LD a3, 14 * SIZE(C2)
|
|
+ LD a4, 15 * SIZE(C2)
|
|
+
|
|
+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1
|
|
+ FMAD5 a6, alpha_r, a3, a3
|
|
+ FMAD7 a7, alpha_r, a2, a2
|
|
+ FMAD7 a8, alpha_r, a4, a4
|
|
+
|
|
+ FMAD8 a7, alpha_i, a1, c01
|
|
+ FMAD8 a8, alpha_i, a3, c05
|
|
+ FMAD6 b5, alpha_i, a2, c02
|
|
+ FMAD6 a6, alpha_i, a4, c06
|
|
+
|
|
+ ST c01, 12 * SIZE(C2)
|
|
+ ST c02, 13 * SIZE(C2)
|
|
+ ST c05, 14 * SIZE(C2)
|
|
+ ST c06, 15 * SIZE(C2)
|
|
+
|
|
+#else
|
|
+ vextf c01, 0, a1 # a1=C11R_ac
|
|
+ vextf c01, 1, a2 # a2=C11I_bc
|
|
+ vextf c01, 2, a3 # a3=C21R_ac
|
|
+ vextf c01, 3, a4 # a4=C21I_bc
|
|
+
|
|
+ vextf c02, 0, b1 # b1=C11I_ad
|
|
+ vextf c02, 1, b2 # b2=C11R_bd
|
|
+ vextf c02, 2, b3 # b3=C21I_ad
|
|
+ vextf c02, 3, b4 # b4=C21R_bd
|
|
+
|
|
+ ADD1 a1, b2, b5 # ac '+' bd
|
|
+ ADD1 a3, b4, a6
|
|
+ ADD2 a2, b1, a7 # bc '+' ad
|
|
+ ADD2 a4, b3, a8
|
|
+
|
|
+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1
|
|
+ FMAD5 a6, alpha_r, $f31, a3
|
|
+ FMAD7 a7, alpha_r, $f31, a2
|
|
+ FMAD7 a8, alpha_r, $f31, a4
|
|
+
|
|
+ FMAD8 a7, alpha_i, a1, b4
|
|
+ FMAD8 a8, alpha_i, a3, b6
|
|
+ FMAD6 b5, alpha_i, a2, c01
|
|
+ FMAD6 a6, alpha_i, a4, c02
|
|
+
|
|
+ ST b4, 0 * SIZE(C1)
|
|
+ ST c01, 1 * SIZE(C1)
|
|
+ ST b6, 2 * SIZE(C1)
|
|
+ ST c02, 3 * SIZE(C1)
|
|
+
|
|
+ vextf c05, 0, a1 # a1=C11R_ac
|
|
+ vextf c05, 1, a2 # a2=C11I_bc
|
|
+ vextf c05, 2, a3 # a3=C21R_ac
|
|
+ vextf c05, 3, a4 # a4=C21I_bc
|
|
+
|
|
+ vextf c06, 0, b1 # b1=C11I_ad
|
|
+ vextf c06, 1, b2 # b2=C11R_bd
|
|
+ vextf c06, 2, b3 # b3=C21I_ad
|
|
+ vextf c06, 3, b4 # b4=C21R_bd
|
|
+
|
|
+ ADD1 a1, b2, b5 # ac '+' bd
|
|
+ ADD1 a3, b4, a6
|
|
+ ADD2 a2, b1, a7 # bc '+' ad
|
|
+ ADD2 a4, b3, a8
|
|
+
|
|
+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1
|
|
+ FMAD5 a6, alpha_r, $f31, a3
|
|
+ FMAD7 a7, alpha_r, $f31, a2
|
|
+ FMAD7 a8, alpha_r, $f31, a4
|
|
+
|
|
+ FMAD8 a7, alpha_i, a1, b4
|
|
+ FMAD8 a8, alpha_i, a3, b6
|
|
+ FMAD6 b5, alpha_i, a2, c01
|
|
+ FMAD6 a6, alpha_i, a4, c02
|
|
+
|
|
+ ST b4, 4 * SIZE(C1)
|
|
+ ST c01, 5 * SIZE(C1)
|
|
+ ST b6, 6 * SIZE(C1)
|
|
+ ST c02, 7 * SIZE(C1)
|
|
+
|
|
+ vextf c09, 0, a1 # a1=C11R_ac
|
|
+ vextf c09, 1, a2 # a2=C11I_bc
|
|
+ vextf c09, 2, a3 # a3=C21R_ac
|
|
+ vextf c09, 3, a4 # a4=C21I_bc
|
|
+
|
|
+ vextf c10, 0, b1 # b1=C11I_ad
|
|
+ vextf c10, 1, b2 # b2=C11R_bd
|
|
+ vextf c10, 2, b3 # b3=C21I_ad
|
|
+ vextf c10, 3, b4 # b4=C21R_bd
|
|
+
|
|
+ ADD1 a1, b2, b5 # ac '+' bd
|
|
+ ADD1 a3, b4, a6
|
|
+ ADD2 a2, b1, a7 # bc '+' ad
|
|
+ ADD2 a4, b3, a8
|
|
+
|
|
+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1
|
|
+ FMAD5 a6, alpha_r, $f31, a3
|
|
+ FMAD7 a7, alpha_r, $f31, a2
|
|
+ FMAD7 a8, alpha_r, $f31, a4
|
|
+
|
|
+ FMAD8 a7, alpha_i, a1, b4
|
|
+ FMAD8 a8, alpha_i, a3, b6
|
|
+ FMAD6 b5, alpha_i, a2, c01
|
|
+ FMAD6 a6, alpha_i, a4, c02
|
|
+
|
|
+ ST b4, 8 * SIZE(C1)
|
|
+ ST c01, 9 * SIZE(C1)
|
|
+ ST b6, 10 * SIZE(C1)
|
|
+ ST c02, 11 * SIZE(C1)
|
|
+
|
|
+ vextf c13, 0, a1 # a1=C11R_ac
|
|
+ vextf c13, 1, a2 # a2=C11I_bc
|
|
+ vextf c13, 2, a3 # a3=C21R_ac
|
|
+ vextf c13, 3, a4 # a4=C21I_bc
|
|
+
|
|
+ vextf c14, 0, b1 # b1=C11I_ad
|
|
+ vextf c14, 1, b2 # b2=C11R_bd
|
|
+ vextf c14, 2, b3 # b3=C21I_ad
|
|
+ vextf c14, 3, b4 # b4=C21R_bd
|
|
+
|
|
+ ADD1 a1, b2, b5 # ac '+' bd
|
|
+ ADD1 a3, b4, a6
|
|
+ ADD2 a2, b1, a7 # bc '+' ad
|
|
+ ADD2 a4, b3, a8
|
|
+
|
|
+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1
|
|
+ FMAD5 a6, alpha_r, $f31, a3
|
|
+ FMAD7 a7, alpha_r, $f31, a2
|
|
+ FMAD7 a8, alpha_r, $f31, a4
|
|
+
|
|
+ FMAD8 a7, alpha_i, a1, b4
|
|
+ FMAD8 a8, alpha_i, a3, b6
|
|
+ FMAD6 b5, alpha_i, a2, c01
|
|
+ FMAD6 a6, alpha_i, a4, c02
|
|
+
|
|
+ ST b4, 12 * SIZE(C1)
|
|
+ ST c01, 13 * SIZE(C1)
|
|
+ ST b6, 14 * SIZE(C1)
|
|
+ ST c02, 15 * SIZE(C1)
|
|
+
|
|
+
|
|
+ vextf c03, 0, a1 # a1=C11R_ac
|
|
+ vextf c03, 1, a2 # a2=C11I_bc
|
|
+ vextf c03, 2, a3 # a3=C21R_ac
|
|
+ vextf c03, 3, a4 # a4=C21I_bc
|
|
+
|
|
+ vextf c04, 0, b1 # b1=C11I_ad
|
|
+ vextf c04, 1, b2 # b2=C11R_bd
|
|
+ vextf c04, 2, b3 # b3=C21I_ad
|
|
+ vextf c04, 3, b4 # b4=C21R_bd
|
|
+
|
|
+ ADD1 a1, b2, b5 # ac '+' bd
|
|
+ ADD1 a3, b4, a6
|
|
+ ADD2 a2, b1, a7 # bc '+' ad
|
|
+ ADD2 a4, b3, a8
|
|
+
|
|
+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1
|
|
+ FMAD5 a6, alpha_r, $f31, a3
|
|
+ FMAD7 a7, alpha_r, $f31, a2
|
|
+ FMAD7 a8, alpha_r, $f31, a4
|
|
+
|
|
+ FMAD8 a7, alpha_i, a1, c01
|
|
+ FMAD8 a8, alpha_i, a3, c05
|
|
+ FMAD6 b5, alpha_i, a2, c02
|
|
+ FMAD6 a6, alpha_i, a4, c06
|
|
+
|
|
+ ST c01, 0 * SIZE(C2)
|
|
+ ST c02, 1 * SIZE(C2)
|
|
+ ST c05, 2 * SIZE(C2)
|
|
+ ST c06, 3 * SIZE(C2)
|
|
+
|
|
+ vextf c07, 0, a1 # a1=C11R_ac
|
|
+ vextf c07, 1, a2 # a2=C11I_bc
|
|
+ vextf c07, 2, a3 # a3=C21R_ac
|
|
+ vextf c07, 3, a4 # a4=C21I_bc
|
|
+
|
|
+ vextf c08, 0, b1 # b1=C11I_ad
|
|
+ vextf c08, 1, b2 # b2=C11R_bd
|
|
+ vextf c08, 2, b3 # b3=C21I_ad
|
|
+ vextf c08, 3, b4 # b4=C21R_bd
|
|
+
|
|
+ ADD1 a1, b2, b5 # ac '+' bd
|
|
+ ADD1 a3, b4, a6
|
|
+ ADD2 a2, b1, a7 # bc '+' ad
|
|
+ ADD2 a4, b3, a8
|
|
+
|
|
+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1
|
|
+ FMAD5 a6, alpha_r, $f31, a3
|
|
+ FMAD7 a7, alpha_r, $f31, a2
|
|
+ FMAD7 a8, alpha_r, $f31, a4
|
|
+
|
|
+ FMAD8 a7, alpha_i, a1, c01
|
|
+ FMAD8 a8, alpha_i, a3, c05
|
|
+ FMAD6 b5, alpha_i, a2, c02
|
|
+ FMAD6 a6, alpha_i, a4, c06
|
|
+
|
|
+ ST c01, 4 * SIZE(C2)
|
|
+ ST c02, 5 * SIZE(C2)
|
|
+ ST c05, 6 * SIZE(C2)
|
|
+ ST c06, 7 * SIZE(C2)
|
|
+
|
|
+ vextf c11, 0, a1 # a1=C11R_ac
|
|
+ vextf c11, 1, a2 # a2=C11I_bc
|
|
+ vextf c11, 2, a3 # a3=C21R_ac
|
|
+ vextf c11, 3, a4 # a4=C21I_bc
|
|
+
|
|
+ vextf c12, 0, b1 # b1=C11I_ad
|
|
+ vextf c12, 1, b2 # b2=C11R_bd
|
|
+ vextf c12, 2, b3 # b3=C21I_ad
|
|
+ vextf c12, 3, b4 # b4=C21R_bd
|
|
+
|
|
+ ADD1 a1, b2, b5 # ac '+' bd
|
|
+ ADD1 a3, b4, a6
|
|
+ ADD2 a2, b1, a7 # bc '+' ad
|
|
+ ADD2 a4, b3, a8
|
|
+
|
|
+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1
|
|
+ FMAD5 a6, alpha_r, $f31, a3
|
|
+ FMAD7 a7, alpha_r, $f31, a2
|
|
+ FMAD7 a8, alpha_r, $f31, a4
|
|
+
|
|
+ FMAD8 a7, alpha_i, a1, c01
|
|
+ FMAD8 a8, alpha_i, a3, c05
|
|
+ FMAD6 b5, alpha_i, a2, c02
|
|
+ FMAD6 a6, alpha_i, a4, c06
|
|
+
|
|
+ ST c01, 8 * SIZE(C2)
|
|
+ ST c02, 9 * SIZE(C2)
|
|
+ ST c05, 10 * SIZE(C2)
|
|
+ ST c06, 11 * SIZE(C2)
|
|
+
|
|
+ vextf c15, 0, a1 # a1=C11R_ac
|
|
+ vextf c15, 1, a2 # a2=C11I_bc
|
|
+ vextf c15, 2, a3 # a3=C21R_ac
|
|
+ vextf c15, 3, a4 # a4=C21I_bc
|
|
+
|
|
+ vextf c16, 0, b1 # b1=C11I_ad
|
|
+ vextf c16, 1, b2 # b2=C11R_bd
|
|
+ vextf c16, 2, b3 # b3=C21I_ad
|
|
+ vextf c16, 3, b4 # b4=C21R_bd
|
|
+
|
|
+ ADD1 a1, b2, b5 # ac '+' bd
|
|
+ ADD1 a3, b4, a6
|
|
+ ADD2 a2, b1, a7 # bc '+' ad
|
|
+ ADD2 a4, b3, a8
|
|
+
|
|
+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1
|
|
+ FMAD5 a6, alpha_r, $f31, a3
|
|
+ FMAD7 a7, alpha_r, $f31, a2
|
|
+ FMAD7 a8, alpha_r, $f31, a4
|
|
+
|
|
+ FMAD8 a7, alpha_i, a1, c01
|
|
+ FMAD8 a8, alpha_i, a3, c05
|
|
+ FMAD6 b5, alpha_i, a2, c02
|
|
+ FMAD6 a6, alpha_i, a4, c06
|
|
+
|
|
+ ST c01, 12 * SIZE(C2)
|
|
+ ST c02, 13 * SIZE(C2)
|
|
+ ST c05, 14 * SIZE(C2)
|
|
+ ST c06, 15 * SIZE(C2)
|
|
+
|
|
+#if ( defined(LEFT) && defined(TRANSA)) || \
|
|
+ (!defined(LEFT) && !defined(TRANSA))
|
|
+ subl K, KK, TEMP
|
|
+#ifdef LEFT
|
|
+ subl TEMP, 8, TEMP
|
|
+#else
|
|
+ subl TEMP, 2, TEMP
|
|
+#endif
|
|
+
|
|
+ sll TEMP, 3 + ZBASE_SHIFT,L # mr=8
|
|
+ sll TEMP, 1 + ZBASE_SHIFT,TEMP # nr=2
|
|
+
|
|
+ addl AO, L, AO
|
|
+ addl BO, TEMP, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LEFT
|
|
+ addl KK,8,KK
|
|
+#endif
|
|
+#endif
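+
+/*
+   A restatement of the TRMM offset update above (an interpretation of this
+   code, assuming ZBASE_SHIFT is log2 of the byte size of one complex value):
+   TEMP = K - KK, minus 8 (mr, when LEFT) or 2 (nr) otherwise, i.e. the
+   k-steps this 8x2 tile leaves unprocessed.  AO then advances by TEMP*8
+   complex elements and BO by TEMP*2 complex elements, past the unused tails
+   of the packed A and B panels.  With LEFT defined, KK grows by the tile
+   height 8 for the next row block.
+*/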
|
|
+
|
|
+ jmp $L09
|
|
+
|
|
+
|
|
+ .align 4
|
|
+
|
|
+$L20: # N=2, M=4
|
|
+ and M, 4, I # I=M&4
|
|
+ ble I, $L30
|
|
+
|
|
+#if defined(TRMMKERNEL)
|
|
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
|
+ mov B, BO
|
|
+ nop
|
|
+#else
|
|
+ sll KK, 2 + ZBASE_SHIFT, L # mr=4
|
|
+ sll KK, 1 + ZBASE_SHIFT,TEMP # nr=2
|
|
+
|
|
+ addl AO, L, AO
|
|
+ addl B, TEMP, BO
|
|
+#endif
|
|
+ fillcs 0(C1)
|
|
+ fillcs 4*SIZE(C1)
|
|
+ fillcs 8*SIZE(C1)
|
|
+
|
|
+ vcpys $f31,$f31,c01 # Clear result regs
|
|
+ vcpys $f31,$f31,c02
|
|
+ vcpys $f31,$f31,c03
|
|
+ vcpys $f31,$f31,c04
|
|
+
|
|
+ fillcs 0(C2)
|
|
+ fillcs 4*SIZE(C2)
|
|
+ fillcs 8*SIZE(C2)
|
|
+
|
|
+ vcpys $f31,$f31,c05
|
|
+ vcpys $f31,$f31,c06
|
|
+ vcpys $f31,$f31,c07
|
|
+ vcpys $f31,$f31,c08
|
|
+
|
|
+ LDDE b1, 0 * SIZE(BO) # B1R
|
|
+ LDDE b2, 1 * SIZE(BO) # B1I
|
|
+ LDDE b3, 2 * SIZE(BO) # B2R
|
|
+ LDDE b4, 3 * SIZE(BO) # B2I
|
|
+
|
|
+ VLD a1, 0 * SIZE(AO) # A1, A2
|
|
+ VLD a2, 4 * SIZE(AO) # A3, A4
|
|
+
|
|
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
|
+ subl K, KK, TEMP
|
|
+#elif defined(LEFT)
|
|
+ addl KK, 4, TEMP # mr=4
|
|
+#else
|
|
+ addl KK, 2,TEMP # nr=2
|
|
+#endif
|
|
+ sra TEMP, 1, L
|
|
+ ble L, $L25
|
|
+
|
|
+#else
|
|
+ mov B, BO # Set B, (block A x panel Bj)
|
|
+ sra K, 1, L # Unroll K as 2
|
|
+
|
|
+ fillcs 0(C1)
|
|
+ fillcs 4*SIZE(C1)
|
|
+ fillcs 8*SIZE(C1)
|
|
+
|
|
+ vcpys $f31,$f31,c01 # Clear result regs
|
|
+ vcpys $f31,$f31,c02
|
|
+ vcpys $f31,$f31,c03
|
|
+ vcpys $f31,$f31,c04
|
|
+
|
|
+ fillcs 0(C2)
|
|
+ fillcs 4*SIZE(C2)
|
|
+ fillcs 8*SIZE(C2)
|
|
+
|
|
+ vcpys $f31,$f31,c05
|
|
+ vcpys $f31,$f31,c06
|
|
+ vcpys $f31,$f31,c07
|
|
+ vcpys $f31,$f31,c08
|
|
+
|
|
+ LDDE b1, 0 * SIZE(BO) # B1R
|
|
+ LDDE b2, 1 * SIZE(BO) # B1I
|
|
+ LDDE b3, 2 * SIZE(BO) # B2R
|
|
+ LDDE b4, 3 * SIZE(BO) # B2I
|
|
+
|
|
+ VLD a1, 0 * SIZE(AO) # A1, A2
|
|
+ VLD a2, 4 * SIZE(AO) # A3, A4
|
|
+
|
|
+ ble L, $L25
|
|
+#endif
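+
+/*
+   Both setup paths above do the same preparation: touch the C1/C2 rows with
+   fillcs (prefetch), zero the eight vector accumulators c01..c08, and preload
+   b1..b4 and a1,a2.  Only the trip count differs: with TRMMKERNEL it is
+   derived from KK according to the LEFT/TRANSA case (TEMP, halved into L),
+   otherwise it is simply K unrolled by two (L = K >> 1).
+*/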
|
|
+
|
|
+ .align 4
|
|
+$L22:
|
|
+ VMAD a1,b1,c01,c01 # C11(ac,bc), C21(ac,bc)
|
|
+ VMAD a1,b2,c02,c02 # C11(ad,bd), C21(ad,bd)
|
|
+ VMAD a1,b3,c03,c03 # C12(ac,bc), C22(ac,bc)
|
|
+ VMAD a1,b4,c04,c04 # C12(ad,bd), C22(ad,bd)
|
|
+
|
|
+ LDDE b5, 4 * SIZE(BO) # next B1R
|
|
+ LDDE b6, 5 * SIZE(BO) # next B1I
|
|
+ LDDE b7, 6 * SIZE(BO) # next B2R
|
|
+ LDDE b8, 7 * SIZE(BO) # next B2I
|
|
+
|
|
+ fillcs 0(PREB)
|
|
+ addl BO, 8*SIZE, BO # BO+=2nr*2kr*2cpx*SIZE
|
|
+ VMAD a2,b1,c05,c05 # C31, C41
|
|
+ VMAD a2,b2,c06,c06 # C31, C41
|
|
+
|
|
+ fillcs 0(PREA)
|
|
+ VMAD a2,b3,c07,c07 # C32, C42
|
|
+ VMAD a2,b4,c08,c08 # C32, C42
|
|
+
|
|
+ VLD a5, 8 * SIZE(AO) # next A1, A2, a5==a0
|
|
+ VLD a6, 12 * SIZE(AO) # next A3, A4
|
|
+
|
|
+ subl L, 1, L #
|
|
+
|
|
+ addl AO, 16*SIZE, AO # AO+=4mr*2kr*2px*SIZE
|
|
+ VMAD a5,b5,c01,c01
|
|
+ VMAD a5,b6,c02,c02
|
|
+
|
|
+ addl PREA, 16*SIZE, PREA
|
|
+ VMAD a5,b7,c03,c03
|
|
+ VMAD a5,b8,c04,c04
|
|
+
|
|
+ LDDE b1, 0 * SIZE(BO)
|
|
+ LDDE b2, 1 * SIZE(BO)
|
|
+ LDDE b3, 2 * SIZE(BO)
|
|
+ LDDE b4, 3 * SIZE(BO)
|
|
+
|
|
+ fillcs 4*SIZE(PREB)
|
|
+ VMAD a6,b5,c05,c05
|
|
+ VMAD a6,b6,c06,c06
|
|
+
|
|
+ fillcs 0(PREA)
|
|
+ VMAD a6,b7,c07,c07
|
|
+ VMAD a6,b8,c08,c08
|
|
+
|
|
+ VLD a1, 0 * SIZE(AO)
|
|
+ VLD a2, 4 * SIZE(AO)
|
|
+
|
|
+ addl PREB, 8*SIZE, PREB
|
|
+ addl PREA, 16*SIZE, PREA
|
|
+ bne L, $L22 # continue K
|
|
+
|
|
+$L25:
|
|
+ LD alpha_r, ALPHA_R # $f30==b8
|
|
+#ifndef TRMMKERNEL
|
|
+ blbc K, $L28 # if(K&1)
|
|
+#else
|
|
+ blbc TEMP, $L28
|
|
+#endif
|
|
+
|
|
+$L26:
|
|
+ addl AO, 8*SIZE, AO # AO+=4mr*1kr*2px*SIZE
|
|
+ VMAD a1,b1,c01,c01 # C11R C21R
|
|
+ VMAD a1,b2,c02,c02 # C11I C21I
|
|
+ VMAD a1,b3,c03,c03 # C12R c22R
|
|
+ VMAD a1,b4,c04,c04 # C12I C22I
|
|
+
|
|
+ addl BO, 4*SIZE, BO
|
|
+ VMAD a2,b1,c05,c05 # C31R C41R
|
|
+ VMAD a2,b2,c06,c06 # C31I C41I
|
|
+ VMAD a2,b3,c07,c07 # C32R C42R
|
|
+ VMAD a2,b4,c08,c08 # C32I C42I
|
|
+
|
|
+$L28: # Write back
|
|
+ LD alpha_i, ALPHA_I # $f29==b7
|
|
+#ifndef TRMMKERNEL
|
|
+ vextf c01, 0, a1 # a1=C11R_ac
|
|
+ vextf c01, 1, a2 # a2=C11I_bc
|
|
+ vextf c01, 2, a3 # a3=C21R_ac
|
|
+ vextf c01, 3, a4 # a4=C21I_bc
|
|
+
|
|
+ vextf c02, 0, b1 # b1=C11I_ad
|
|
+ vextf c02, 1, b2 # b2=C11R_bd
|
|
+ vextf c02, 2, b3 # b3=C21I_ad
|
|
+ vextf c02, 3, b4 # b4=C21R_bd
|
|
+
|
|
+ ADD1 a1, b2, b5 # ac '+' bd
|
|
+ ADD1 a3, b4, a6
|
|
+ ADD2 a2, b1, a7 # bc '+' ad
|
|
+ ADD2 a4, b3, a8
|
|
+
|
|
+ LD a1, 0 * SIZE(C1)
|
|
+ LD a2, 1 * SIZE(C1)
|
|
+ LD a3, 2 * SIZE(C1)
|
|
+ LD a4, 3 * SIZE(C1)
|
|
+
|
|
+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1
|
|
+ FMAD5 a6, alpha_r, a3, a3
|
|
+ FMAD7 a7, alpha_r, a2, a2
|
|
+ FMAD7 a8, alpha_r, a4, a4
|
|
+
|
|
+ FMAD8 a7, alpha_i, a1, b4
|
|
+ FMAD8 a8, alpha_i, a3, b6
|
|
+ FMAD6 b5, alpha_i, a2, c01
|
|
+ FMAD6 a6, alpha_i, a4, c02
|
|
+
|
|
+ ST b4, 0 * SIZE(C1)
|
|
+ ST c01, 1 * SIZE(C1)
|
|
+ ST b6, 2 * SIZE(C1)
|
|
+ ST c02, 3 * SIZE(C1)
|
|
+
|
|
+ vextf c05, 0, a1 # a1=C11R_ac
|
|
+ vextf c05, 1, a2 # a2=C11I_bc
|
|
+ vextf c05, 2, a3 # a3=C21R_ac
|
|
+ vextf c05, 3, a4 # a4=C21I_bc
|
|
+
|
|
+ vextf c06, 0, b1 # b1=C11I_ad
|
|
+ vextf c06, 1, b2 # b2=C11R_bd
|
|
+ vextf c06, 2, b3 # b3=C21I_ad
|
|
+ vextf c06, 3, b4 # b4=C21R_bd
|
|
+
|
|
+ ADD1 a1, b2, b5 # ac '+' bd
|
|
+ ADD1 a3, b4, a6
|
|
+ ADD2 a2, b1, a7 # bc '+' ad
|
|
+ ADD2 a4, b3, a8
|
|
+
|
|
+ LD a1, 4 * SIZE(C1)
|
|
+ LD a2, 5 * SIZE(C1)
|
|
+ LD a3, 6 * SIZE(C1)
|
|
+ LD a4, 7 * SIZE(C1)
|
|
+
|
|
+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1
|
|
+ FMAD5 a6, alpha_r, a3, a3
|
|
+ FMAD7 a7, alpha_r, a2, a2
|
|
+ FMAD7 a8, alpha_r, a4, a4
|
|
+
|
|
+ FMAD8 a7, alpha_i, a1, b4
|
|
+ FMAD8 a8, alpha_i, a3, b6
|
|
+ FMAD6 b5, alpha_i, a2, c01
|
|
+ FMAD6 a6, alpha_i, a4, c02
|
|
+
|
|
+ ST b4, 4 * SIZE(C1)
|
|
+ ST c01, 5 * SIZE(C1)
|
|
+ ST b6, 6 * SIZE(C1)
|
|
+ ST c02, 7 * SIZE(C1)
|
|
+
|
|
+
|
|
+ vextf c03, 0, a1 # a1=C11R_ac
|
|
+ vextf c03, 1, a2 # a2=C11I_bc
|
|
+ vextf c03, 2, a3 # a3=C21R_ac
|
|
+ vextf c03, 3, a4 # a4=C21I_bc
|
|
+
|
|
+ vextf c04, 0, b1 # b1=C11I_ad
|
|
+ vextf c04, 1, b2 # b2=C11R_bd
|
|
+ vextf c04, 2, b3 # b3=C21I_ad
|
|
+ vextf c04, 3, b4 # b4=C21R_bd
|
|
+
|
|
+ ADD1 a1, b2, b5 # ac '+' bd
|
|
+ ADD1 a3, b4, a6
|
|
+ ADD2 a2, b1, a7 # bc '+' ad
|
|
+ ADD2 a4, b3, a8
|
|
+
|
|
+ LD a1, 0 * SIZE(C2)
|
|
+ LD a2, 1 * SIZE(C2)
|
|
+ LD a3, 2 * SIZE(C2)
|
|
+ LD a4, 3 * SIZE(C2)
|
|
+
|
|
+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1
|
|
+ FMAD5 a6, alpha_r, a3, a3
|
|
+ FMAD7 a7, alpha_r, a2, a2
|
|
+ FMAD7 a8, alpha_r, a4, a4
|
|
+
|
|
+ FMAD8 a7, alpha_i, a1, c01
|
|
+ FMAD8 a8, alpha_i, a3, c05
|
|
+ FMAD6 b5, alpha_i, a2, c02
|
|
+ FMAD6 a6, alpha_i, a4, c06
|
|
+
|
|
+ ST c01, 0 * SIZE(C2)
|
|
+ ST c02, 1 * SIZE(C2)
|
|
+ ST c05, 2 * SIZE(C2)
|
|
+ ST c06, 3 * SIZE(C2)
|
|
+
|
|
+ vextf c07, 0, a1 # a1=C11R_ac
|
|
+ vextf c07, 1, a2 # a2=C11I_bc
|
|
+ vextf c07, 2, a3 # a3=C21R_ac
|
|
+ vextf c07, 3, a4 # a4=C21I_bc
|
|
+
|
|
+ vextf c08, 0, b1 # b1=C11I_ad
|
|
+ vextf c08, 1, b2 # b2=C11R_bd
|
|
+ vextf c08, 2, b3 # b3=C21I_ad
|
|
+ vextf c08, 3, b4 # b4=C21R_bd
|
|
+
|
|
+ ADD1 a1, b2, b5 # ac '+' bd
|
|
+ ADD1 a3, b4, a6
|
|
+ ADD2 a2, b1, a7 # bc '+' ad
|
|
+ ADD2 a4, b3, a8
|
|
+
|
|
+ LD a1, 4 * SIZE(C2)
|
|
+ LD a2, 5 * SIZE(C2)
|
|
+ LD a3, 6 * SIZE(C2)
|
|
+ LD a4, 7 * SIZE(C2)
|
|
+
|
|
+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1
|
|
+ FMAD5 a6, alpha_r, a3, a3
|
|
+ FMAD7 a7, alpha_r, a2, a2
|
|
+ FMAD7 a8, alpha_r, a4, a4
|
|
+
|
|
+ FMAD8 a7, alpha_i, a1, c01
|
|
+ FMAD8 a8, alpha_i, a3, c05
|
|
+ FMAD6 b5, alpha_i, a2, c02
|
|
+ FMAD6 a6, alpha_i, a4, c06
|
|
+
|
|
+ ST c01, 4 * SIZE(C2)
|
|
+ ST c02, 5 * SIZE(C2)
|
|
+ ST c05, 6 * SIZE(C2)
|
|
+ ST c06, 7 * SIZE(C2)
|
|
+
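+/*
+   Write-back sketch for the block above: after the K loop, c01 holds the
+   a*c / b*c partial products and c02 the a*d / b*d partial products of each
+   complex C entry (likewise c05/c06 for the next rows, and c03/c04, c07/c08
+   for the C2 column).  vextf unpacks them to scalars, ADD1/ADD2 combine them
+   into the real part (ac '+' bd) and the imaginary part (bc '+' ad) with the
+   sign chosen by the conjugation variant, and the FMAD5..FMAD8 macros
+   (defined earlier in this file) fold alpha_r/alpha_i into the C values just
+   loaded, i.e. C := alpha * acc + C; the exact add/subtract choices live in
+   those macros.
+*/
+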
|
|
+#else
|
|
+
|
|
+ vextf c01, 0, a1 # a1=C11R_ac
|
|
+ vextf c01, 1, a2 # a2=C11I_bc
|
|
+ vextf c01, 2, a3 # a3=C21R_ac
|
|
+ vextf c01, 3, a4 # a4=C21I_bc
|
|
+
|
|
+ vextf c02, 0, b1 # b1=C11I_ad
|
|
+ vextf c02, 1, b2 # b2=C11R_bd
|
|
+ vextf c02, 2, b3 # b3=C21I_ad
|
|
+ vextf c02, 3, b4 # b4=C21R_bd
|
|
+
|
|
+ ADD1 a1, b2, b5 # ac '+' bd
|
|
+ ADD1 a3, b4, a6
|
|
+ ADD2 a2, b1, a7 # bc '+' ad
|
|
+ ADD2 a4, b3, a8
|
|
+
|
|
+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1
|
|
+ FMAD5 a6, alpha_r, $f31, a3
|
|
+ FMAD7 a7, alpha_r, $f31, a2
|
|
+ FMAD7 a8, alpha_r, $f31, a4
|
|
+
|
|
+ FMAD8 a7, alpha_i, a1, b4
|
|
+ FMAD8 a8, alpha_i, a3, b6
|
|
+ FMAD6 b5, alpha_i, a2, c01
|
|
+ FMAD6 a6, alpha_i, a4, c02
|
|
+
|
|
+ ST b4, 0 * SIZE(C1)
|
|
+ ST c01, 1 * SIZE(C1)
|
|
+ ST b6, 2 * SIZE(C1)
|
|
+ ST c02, 3 * SIZE(C1)
|
|
+
|
|
+ vextf c05, 0, a1 # a1=C11R_ac
|
|
+ vextf c05, 1, a2 # a2=C11I_bc
|
|
+ vextf c05, 2, a3 # a3=C21R_ac
|
|
+ vextf c05, 3, a4 # a4=C21I_bc
|
|
+
|
|
+ vextf c06, 0, b1 # b1=C11I_ad
|
|
+ vextf c06, 1, b2 # b2=C11R_bd
|
|
+ vextf c06, 2, b3 # b3=C21I_ad
|
|
+ vextf c06, 3, b4 # b4=C21R_bd
|
|
+
|
|
+ ADD1 a1, b2, b5 # ac '+' bd
|
|
+ ADD1 a3, b4, a6
|
|
+ ADD2 a2, b1, a7 # bc '+' ad
|
|
+ ADD2 a4, b3, a8
|
|
+
|
|
+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1
|
|
+ FMAD5 a6, alpha_r, $f31, a3
|
|
+ FMAD7 a7, alpha_r, $f31, a2
|
|
+ FMAD7 a8, alpha_r, $f31, a4
|
|
+
|
|
+ FMAD8 a7, alpha_i, a1, b4
|
|
+ FMAD8 a8, alpha_i, a3, b6
|
|
+ FMAD6 b5, alpha_i, a2, c01
|
|
+ FMAD6 a6, alpha_i, a4, c02
|
|
+
|
|
+ ST b4, 4 * SIZE(C1)
|
|
+ ST c01, 5 * SIZE(C1)
|
|
+ ST b6, 6 * SIZE(C1)
|
|
+ ST c02, 7 * SIZE(C1)
|
|
+
|
|
+
|
|
+ vextf c03, 0, a1 # a1=C11R_ac
|
|
+ vextf c03, 1, a2 # a2=C11I_bc
|
|
+ vextf c03, 2, a3 # a3=C21R_ac
|
|
+ vextf c03, 3, a4 # a4=C21I_bc
|
|
+
|
|
+ vextf c04, 0, b1 # b1=C11I_ad
|
|
+ vextf c04, 1, b2 # b2=C11R_bd
|
|
+ vextf c04, 2, b3 # b3=C21I_ad
|
|
+ vextf c04, 3, b4 # b4=C21R_bd
|
|
+
|
|
+ ADD1 a1, b2, b5 # ac '+' bd
|
|
+ ADD1 a3, b4, a6
|
|
+ ADD2 a2, b1, a7 # bc '+' ad
|
|
+ ADD2 a4, b3, a8
|
|
+
|
|
+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1
|
|
+ FMAD5 a6, alpha_r, $f31, a3
|
|
+ FMAD7 a7, alpha_r, $f31, a2
|
|
+ FMAD7 a8, alpha_r, $f31, a4
|
|
+
|
|
+ FMAD8 a7, alpha_i, a1, c01
|
|
+ FMAD8 a8, alpha_i, a3, c05
|
|
+ FMAD6 b5, alpha_i, a2, c02
|
|
+ FMAD6 a6, alpha_i, a4, c06
|
|
+
|
|
+ ST c01, 0 * SIZE(C2)
|
|
+ ST c02, 1 * SIZE(C2)
|
|
+ ST c05, 2 * SIZE(C2)
|
|
+ ST c06, 3 * SIZE(C2)
|
|
+
|
|
+ vextf c07, 0, a1 # a1=C11R_ac
|
|
+ vextf c07, 1, a2 # a2=C11I_bc
|
|
+ vextf c07, 2, a3 # a3=C21R_ac
|
|
+ vextf c07, 3, a4 # a4=C21I_bc
|
|
+
|
|
+ vextf c08, 0, b1 # b1=C11I_ad
|
|
+ vextf c08, 1, b2 # b2=C11R_bd
|
|
+ vextf c08, 2, b3 # b3=C21I_ad
|
|
+ vextf c08, 3, b4 # b4=C21R_bd
|
|
+
|
|
+ ADD1 a1, b2, b5 # ac '+' bd
|
|
+ ADD1 a3, b4, a6
|
|
+ ADD2 a2, b1, a7 # bc '+' ad
|
|
+ ADD2 a4, b3, a8
|
|
+
|
|
+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1
|
|
+ FMAD5 a6, alpha_r, $f31, a3
|
|
+ FMAD7 a7, alpha_r, $f31, a2
|
|
+ FMAD7 a8, alpha_r, $f31, a4
|
|
+
|
|
+ FMAD8 a7, alpha_i, a1, c01
|
|
+ FMAD8 a8, alpha_i, a3, c05
|
|
+ FMAD6 b5, alpha_i, a2, c02
|
|
+ FMAD6 a6, alpha_i, a4, c06
|
|
+
|
|
+ ST c01, 4 * SIZE(C2)
|
|
+ ST c02, 5 * SIZE(C2)
|
|
+ ST c05, 6 * SIZE(C2)
|
|
+ ST c06, 7 * SIZE(C2)
|
|
+
|
|
+
|
|
+#if ( defined(LEFT) && defined(TRANSA)) || \
|
|
+ (!defined(LEFT) && !defined(TRANSA))
|
|
+ subl K, KK, TEMP
|
|
+#ifdef LEFT
|
|
+ subl TEMP, 4, TEMP
|
|
+#else
|
|
+ subl TEMP, 2, TEMP
|
|
+#endif
|
|
+
|
|
+ sll TEMP, 2 + ZBASE_SHIFT, L
|
|
+ sll TEMP, 1 + ZBASE_SHIFT, TEMP
|
|
+
|
|
+ addl AO, L, AO
|
|
+ addl BO, TEMP,BO
|
|
+#endif
|
|
+
|
|
+#ifdef LEFT
|
|
+ addl KK, 4,KK
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+ addl C1, 8*SIZE, C1
|
|
+ addl C2, 8*SIZE, C2
|
|
+
|
|
+
|
|
+ .align 4
|
|
+$L30:
|
|
+ and M, 2, I # I=M&2
|
|
+ ble I, $L40
|
|
+
|
|
+#if defined(TRMMKERNEL)
|
|
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
|
+ mov B, BO
|
|
+ nop
|
|
+#else
|
|
+ sll KK, 1 + ZBASE_SHIFT, L # mr=2
|
|
+ sll KK, 1 + ZBASE_SHIFT,TEMP # nr=2
|
|
+
|
|
+ addl AO, L, AO
|
|
+ addl B, TEMP, BO
|
|
+#endif
|
|
+
|
|
+ fclr c01
|
|
+ fclr c02
|
|
+ fclr c03
|
|
+ fclr c04
|
|
+ fclr c05
|
|
+ fclr c06
|
|
+ fclr c07
|
|
+ fclr c08 # CLEAR 8 register
|
|
+ fclr c09
|
|
+ fclr c10
|
|
+ fclr c11
|
|
+ fclr c12
|
|
+ fclr c13
|
|
+ fclr c14
|
|
+ fclr c15
|
|
+ fclr c16
|
|
+
|
|
+ fillcs 0*SIZE(C1)
|
|
+ fillcs 4*SIZE(C1)
|
|
+
|
|
+ LD b1, 0*SIZE(BO) # b1 real part
|
|
+ LD b2, 1*SIZE(BO) # b1 image part
|
|
+ LD b3, 2*SIZE(BO) # b2 real part
|
|
+ LD b4, 3*SIZE(BO) # b2 image part
|
|
+
|
|
+ fillcs 0*SIZE(C2)
|
|
+ fillcs 4*SIZE(C2)
|
|
+
|
|
+ LD a1, 0*SIZE(AO) # a1 real part
|
|
+ LD a2, 1*SIZE(AO) # a1 image part
|
|
+ LD a3, 2*SIZE(AO) # a2 real part
|
|
+ LD a4, 3*SIZE(AO) # a2 image part
|
|
+
|
|
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
|
+ subl K, KK, TEMP
|
|
+#elif defined(LEFT)
|
|
+ addl KK, 2, TEMP # mr=2
|
|
+#else
|
|
+ addl KK, 2, TEMP # nr=2
|
|
+#endif
|
|
+ sra TEMP, 1, L
|
|
+ ble L, $L35
|
|
+
|
|
+#else
|
|
+
|
|
+ mov B, BO # Set B, (block A x panel Bj)
|
|
+ sra K, 1, L # Unroll K as 2
|
|
+
|
|
+ fclr c01
|
|
+ fclr c02
|
|
+ fclr c03
|
|
+ fclr c04
|
|
+ fclr c05
|
|
+ fclr c06
|
|
+ fclr c07
|
|
+ fclr c08 # CLEAR 8 register
|
|
+ fclr c09
|
|
+ fclr c10
|
|
+ fclr c11
|
|
+ fclr c12
|
|
+ fclr c13
|
|
+ fclr c14
|
|
+ fclr c15
|
|
+ fclr c16
|
|
+
|
|
+ fillcs 0*SIZE(C1)
|
|
+ fillcs 4*SIZE(C1)
|
|
+
|
|
+ LD b1, 0*SIZE(BO) # b1 real part
|
|
+ LD b2, 1*SIZE(BO) # b1 image part
|
|
+ LD b3, 2*SIZE(BO) # b2 real part
|
|
+ LD b4, 3*SIZE(BO) # b2 image part
|
|
+
|
|
+ fillcs 0*SIZE(C2)
|
|
+ fillcs 4*SIZE(C2)
|
|
+
|
|
+ LD a1, 0*SIZE(AO) # a1 real part
|
|
+ LD a2, 1*SIZE(AO) # a1 image part
|
|
+ LD a3, 2*SIZE(AO) # a2 real part
|
|
+ LD a4, 3*SIZE(AO) # a2 image part
|
|
+
|
|
+ ble L, $L35
|
|
+#endif
|
|
+
|
|
+ .align 4
|
|
+$L32:
|
|
+ MAD a1,b1,c01,c01 # a1*c1
|
|
+ MAD a1,b2,c02,c02 # a1*d1
|
|
+ MAD a1,b3,c03,c03 # a1*c2
|
|
+ MAD a1,b4,c04,c04 # a1*d2
|
|
+
|
|
+ LD b5, 4 * SIZE(BO) # next B1R
|
|
+ LD b6, 5 * SIZE(BO) # next B1I
|
|
+ LD b7, 6 * SIZE(BO) # next B2R
|
|
+ LD b8, 7 * SIZE(BO) # next B2I
|
|
+
|
|
+ LD a5, 4 * SIZE(AO) # next A1-A4 real part
|
|
+ LD a6, 5 * SIZE(AO) # next A1-A4 image part
|
|
+ LD a7, 6 * SIZE(AO)
|
|
+ LD a8, 7 * SIZE(AO)
|
|
+
|
|
+ addl BO, 8*SIZE, BO # BO+=2nr*2kr*2cpx*SIZE
|
|
+ MAD a2,b1,c05,c05 # b1*c1
|
|
+ MAD a2,b2,c06,c06 # b1*d1
|
|
+ MAD a2,b3,c07,c07 # b1*c2
|
|
+ MAD a2,b4,c08,c08 # b1*d2
|
|
+
|
|
+ MAD a3,b1,c09,c09 # a2*c1
|
|
+ MAD a3,b2,c10,c10 # a2*d1
|
|
+ MAD a3,b3,c11,c11 # a2*c2
|
|
+ MAD a3,b4,c12,c12 # a2*d2
|
|
+
|
|
+ MAD a4,b1,c13,c13 # b2*c1
|
|
+ MAD a4,b2,c14,c14 # b2*d1
|
|
+ MAD a4,b3,c15,c15 # b2*c2
|
|
+ MAD a4,b4,c16,c16 # b2*d2
|
|
+
|
|
+ subl L, 1, L #
|
|
+
|
|
+	addl	AO, 8*SIZE, AO   # AO+=2mr*2kr*2cpx*SIZE
|
|
+ MAD a5,b5,c01,c01
|
|
+ MAD a5,b6,c02,c02
|
|
+ MAD a5,b7,c03,c03
|
|
+ MAD a5,b8,c04,c04
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ MAD a6,b5,c05,c05
|
|
+ MAD a6,b6,c06,c06
|
|
+ MAD a6,b7,c07,c07
|
|
+ MAD a6,b8,c08,c08
|
|
+
|
|
+ MAD a7,b5,c09,c09
|
|
+ MAD a7,b6,c10,c10
|
|
+ MAD a7,b7,c11,c11
|
|
+ MAD a7,b8,c12,c12
|
|
+
|
|
+ MAD a8,b5,c13,c13
|
|
+ MAD a8,b6,c14,c14
|
|
+ MAD a8,b7,c15,c15
|
|
+ MAD a8,b8,c16,c16
|
|
+
|
|
+ bne L, $L32 # continue K
|
|
+
|
|
+$L35:
|
|
+ LD alpha_r, ALPHA_R # $f30==b8
|
|
+#ifndef TRMMKERNEL
|
|
+ blbc K, $L38 # if(K&1)
|
|
+#else
|
|
+ blbc TEMP, $L38
|
|
+#endif
|
|
+
|
|
+$L36:
|
|
+ addl AO, 4*SIZE, AO # AO+=2mr*1kr*2px*SIZE
|
|
+ addl BO, 4*SIZE, BO
|
|
+
|
|
+ MAD a1,b1,c01,c01 # a1*c1
|
|
+ MAD a1,b2,c02,c02 # a1*d1
|
|
+ MAD a1,b3,c03,c03 # a1*c2
|
|
+ MAD a1,b4,c04,c04 # a1*d2
|
|
+
|
|
+ MAD a2,b1,c05,c05 # b1*c1
|
|
+ MAD a2,b2,c06,c06 # b1*d1
|
|
+ MAD a2,b3,c07,c07 # b1*c2
|
|
+ MAD a2,b4,c08,c08 # b1*d2
|
|
+
|
|
+ MAD a3,b1,c09,c09 # a2*c1
|
|
+ MAD a3,b2,c10,c10 # a2*d1
|
|
+ MAD a3,b3,c11,c11 # a2*c2
|
|
+ MAD a3,b4,c12,c12 # a2*d2
|
|
+
|
|
+ MAD a4,b1,c13,c13 # b2*c1
|
|
+ MAD a4,b2,c14,c14 # b2*d1
|
|
+ MAD a4,b3,c15,c15 # b2*c2
|
|
+ MAD a4,b4,c16,c16 # b2*d2
|
|
+
|
|
+
|
|
+
|
|
+$L38: # Write back
|
|
+ LD alpha_i, ALPHA_I # $f29==b7
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD1 c01, c06, c01 # ac '+' bd
|
|
+ ADD1 c09, c14, c09
|
|
+ ADD1 c03, c08, c03 #
|
|
+ ADD1 c11, c16, c11
|
|
+
|
|
+ ADD2 c05, c02, c02 # bc '+' ad
|
|
+ ADD2 c13, c10, c10
|
|
+ ADD2 c07, c04, c04
|
|
+ ADD2 c15, c12, c12
|
|
+
|
|
+ LD b1, 0 * SIZE(C1)
|
|
+ LD b2, 1 * SIZE(C1)
|
|
+ LD b3, 2 * SIZE(C1)
|
|
+ LD b4, 3 * SIZE(C1)
|
|
+
|
|
+ LD a5, 0 * SIZE(C2)
|
|
+ LD a6, 1 * SIZE(C2)
|
|
+ LD a7, 2 * SIZE(C2)
|
|
+ LD a8, 3 * SIZE(C2)
|
|
+
|
|
+ FMAD5 c01, alpha_r, b1, b1
|
|
+ FMAD5 c09, alpha_r, b3, b3
|
|
+ FMAD5 c03, alpha_r, a5, a5
|
|
+ FMAD5 c11, alpha_r, a7, a7
|
|
+
|
|
+ FMAD7 c02, alpha_r, b2, b2
|
|
+ FMAD7 c10, alpha_r, b4, b4
|
|
+ FMAD7 c04, alpha_r, a6, a6
|
|
+ FMAD7 c12, alpha_r, a8, a8
|
|
+
|
|
+ FMAD8 c02, alpha_i, b1, b1
|
|
+ FMAD8 c10, alpha_i, b3, b3
|
|
+ FMAD8 c04, alpha_i, a5, a5
|
|
+ FMAD8 c12, alpha_i, a7, a7
|
|
+
|
|
+ FMAD6 c01, alpha_i, b2, b2
|
|
+ FMAD6 c09, alpha_i, b4, b4
|
|
+ FMAD6 c03, alpha_i, a6, a6
|
|
+ FMAD6 c11, alpha_i, a8, a8
|
|
+
|
|
+ ST b1, 0 * SIZE(C1)
|
|
+ ST b2, 1 * SIZE(C1)
|
|
+ ST b3, 2 * SIZE(C1)
|
|
+ ST b4, 3 * SIZE(C1)
|
|
+
|
|
+ ST a5, 0 * SIZE(C2)
|
|
+ ST a6, 1 * SIZE(C2)
|
|
+ ST a7, 2 * SIZE(C2)
|
|
+ ST a8, 3 * SIZE(C2)
|
|
+
|
|
+#else
|
|
+
|
|
+ ADD1 c01, c06, c01 # ac '+' bd
|
|
+ ADD1 c09, c14, c09
|
|
+ ADD1 c03, c08, c03 #
|
|
+ ADD1 c11, c16, c11
|
|
+
|
|
+ ADD2 c05, c02, c02 # bc '+' ad
|
|
+ ADD2 c13, c10, c10
|
|
+ ADD2 c07, c04, c04
|
|
+ ADD2 c15, c12, c12
|
|
+
|
|
+ FMAD5 c01, alpha_r, $f31, b1
|
|
+ FMAD5 c09, alpha_r, $f31, b3
|
|
+ FMAD5 c03, alpha_r, $f31, a5
|
|
+ FMAD5 c11, alpha_r, $f31, a7
|
|
+
|
|
+ FMAD7 c02, alpha_r, $f31, b2
|
|
+ FMAD7 c10, alpha_r, $f31, b4
|
|
+ FMAD7 c04, alpha_r, $f31, a6
|
|
+ FMAD7 c12, alpha_r, $f31, a8
|
|
+
|
|
+ FMAD8 c02, alpha_i, b1, b1
|
|
+ FMAD8 c10, alpha_i, b3, b3
|
|
+ FMAD8 c04, alpha_i, a5, a5
|
|
+ FMAD8 c12, alpha_i, a7, a7
|
|
+
|
|
+ FMAD6 c01, alpha_i, b2, b2
|
|
+ FMAD6 c09, alpha_i, b4, b4
|
|
+ FMAD6 c03, alpha_i, a6, a6
|
|
+ FMAD6 c11, alpha_i, a8, a8
|
|
+
|
|
+ ST b1, 0 * SIZE(C1)
|
|
+ ST b2, 1 * SIZE(C1)
|
|
+ ST b3, 2 * SIZE(C1)
|
|
+ ST b4, 3 * SIZE(C1)
|
|
+
|
|
+ ST a5, 0 * SIZE(C2)
|
|
+ ST a6, 1 * SIZE(C2)
|
|
+ ST a7, 2 * SIZE(C2)
|
|
+ ST a8, 3 * SIZE(C2)
|
|
+
|
|
+#if ( defined(LEFT) && defined(TRANSA)) || \
|
|
+ (!defined(LEFT) && !defined(TRANSA))
|
|
+ subl K, KK, TEMP
|
|
+#ifdef LEFT
|
|
+ subl TEMP, 2, TEMP
|
|
+#else
|
|
+ subl TEMP, 2, TEMP
|
|
+#endif
|
|
+
|
|
+ sll TEMP, 1 + ZBASE_SHIFT, L
|
|
+ sll TEMP, 1 + ZBASE_SHIFT, TEMP
|
|
+
|
|
+ addl AO, L, AO
|
|
+ addl BO, TEMP, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LEFT
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+ addl C1, 4*SIZE, C1
|
|
+ addl C2, 4*SIZE, C2
|
|
+
|
|
+
|
|
+ .align 4
|
|
+$L40:
|
|
+ and M, 1, I # I=M&1
|
|
+ ble I, $L09
|
|
+
|
|
+#if defined(TRMMKERNEL)
|
|
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
|
+ mov B, BO
|
|
+ nop
|
|
+#else
|
|
+ sll KK, ZBASE_SHIFT, L # mr=1
|
|
+ sll KK, 1 + ZBASE_SHIFT,TEMP # nr=2
|
|
+
|
|
+ addl AO, L, AO
|
|
+ addl B, TEMP, BO
|
|
+#endif
|
|
+
|
|
+ fillcs 0*SIZE(C1)
|
|
+ fillcs 0*SIZE(C2)
|
|
+
|
|
+ fclr c01
|
|
+ fclr c02
|
|
+ fclr c03
|
|
+ fclr c04
|
|
+ fclr c05
|
|
+ fclr c06
|
|
+ fclr c07
|
|
+ fclr c08
|
|
+
|
|
+ LD b1, 0*SIZE(BO) # b1 real part
|
|
+ LD b2, 1*SIZE(BO) # b1 image part
|
|
+ LD b3, 2*SIZE(BO) # b2 real part
|
|
+ LD b4, 3*SIZE(BO) # b2 image part
|
|
+
|
|
+ LD a1, 0*SIZE(AO) # a1 real part
|
|
+ LD a2, 1*SIZE(AO) # a1 image part
|
|
+
|
|
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
|
+ subl K, KK, TEMP
|
|
+#elif defined(LEFT)
|
|
+ addl KK, 1, TEMP # mr=1
|
|
+#else
|
|
+ addl KK, 2, TEMP # nr=2
|
|
+#endif
|
|
+ sra TEMP, 1, L
|
|
+
|
|
+ ble L, $L45
|
|
+
|
|
+#else
|
|
+ mov B, BO # Set B, (block A x panel Bj)
|
|
+ sra K, 1, L # Unroll K as 2
|
|
+
|
|
+ fillcs 0*SIZE(C1)
|
|
+ fillcs 0*SIZE(C2)
|
|
+
|
|
+ fclr c01
|
|
+ fclr c02
|
|
+ fclr c03
|
|
+ fclr c04
|
|
+ fclr c05
|
|
+ fclr c06
|
|
+ fclr c07
|
|
+ fclr c08
|
|
+
|
|
+ LD b1, 0*SIZE(BO) # b1 real part
|
|
+ LD b2, 1*SIZE(BO) # b1 image part
|
|
+ LD b3, 2*SIZE(BO) # b2 real part
|
|
+ LD b4, 3*SIZE(BO) # b2 image part
|
|
+
|
|
+ LD a1, 0*SIZE(AO) # a1 real part
|
|
+ LD a2, 1*SIZE(AO) # a1 image part
|
|
+
|
|
+ ble L, $L45
|
|
+#endif
|
|
+
|
|
+ .align 4
|
|
+$L42:
|
|
+ MAD a1,b1,c01,c01 # C11 real part
|
|
+ MAD a1,b2,c02,c02 # C11 imag part
|
|
+ MAD a1,b3,c03,c03 # C21 real part
|
|
+ MAD a1,b4,c04,c04 # C21 imag part
|
|
+
|
|
+ LD b5, 4 * SIZE(BO) # next B1R
|
|
+ LD b6, 5 * SIZE(BO) # next B1I
|
|
+ LD b7, 6 * SIZE(BO) # next B2R
|
|
+ LD b8, 7 * SIZE(BO) # next B2I
|
|
+
|
|
+ LD a5, 2 * SIZE(AO) # next A1-A4 real part
|
|
+ LD a6, 3 * SIZE(AO) # next A1-A4 image part
|
|
+
|
|
+ addl BO, 8*SIZE, BO # BO+=2nr*2kr*2cpx*SIZE
|
|
+ MAD a2,b1,c05,c05 # C11 image part
|
|
+ MAD a2,b2,c06,c06 # C11 real part
|
|
+ MAD a2,b3,c07,c07 # C21 image part
|
|
+ MAD a2,b4,c08,c08 # C21 real part
|
|
+
|
|
+ subl L, 1, L #
|
|
+
|
|
+ addl AO, 4*SIZE, AO # AO+=1mr*2kr*2px*SIZE
|
|
+ MAD a5,b5,c01,c01
|
|
+ MAD a5,b6,c02,c02
|
|
+ MAD a5,b7,c03,c03
|
|
+ MAD a5,b8,c04,c04
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ MAD a6,b5,c05,c05
|
|
+ MAD a6,b6,c06,c06
|
|
+ MAD a6,b7,c07,c07
|
|
+ MAD a6,b8,c08,c08
|
|
+
|
|
+ bne L, $L42 # continue K
|
|
+
|
|
+$L45:
|
|
+ LD alpha_r, ALPHA_R # $f30==b8
|
|
+#ifndef TRMMKERNEL
|
|
+ blbc K, $L48 # if(K&1)
|
|
+#else
|
|
+ blbc TEMP, $L48
|
|
+#endif
|
|
+
|
|
+$L46:
|
|
+	addl	AO, 2*SIZE, AO    # AO+=1mr*1kr*2cpx*SIZE
|
|
+ MAD a1,b1,c01,c01 # C11 real part
|
|
+ MAD a1,b2,c02,c02 # C11 imag part
|
|
+ MAD a1,b3,c03,c03 # C21 real part
|
|
+ MAD a1,b4,c04,c04 # C21 imag part
|
|
+
|
|
+ addl BO, 4*SIZE, BO
|
|
+ MAD a2,b1,c05,c05 # C11 image part
|
|
+ MAD a2,b2,c06,c06 # C11 real part
|
|
+ MAD a2,b3,c07,c07 # C21 image part
|
|
+ MAD a2,b4,c08,c08 # C21 real part
|
|
+
|
|
+
|
|
+$L48: # Write back
|
|
+ LD alpha_i, ALPHA_I # $f29==b7
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD1 c01, c06, c01
|
|
+ ADD1 c03, c08, c03
|
|
+ ADD2 c05, c02, c02
|
|
+ ADD2 c07, c04, c04
|
|
+
|
|
+ LD b1, 0 * SIZE(C1)
|
|
+ LD b2, 1 * SIZE(C1)
|
|
+
|
|
+ LD a5, 0 * SIZE(C2)
|
|
+ LD a6, 1 * SIZE(C2)
|
|
+
|
|
+ FMAD5 c01, alpha_r, b1, b1
|
|
+ FMAD5 c03, alpha_r, a5, a5
|
|
+
|
|
+ FMAD7 c02, alpha_r, b2, b2
|
|
+ FMAD7 c04, alpha_r, a6, a6
|
|
+
|
|
+ FMAD8 c02, alpha_i, b1, b1
|
|
+ FMAD8 c04, alpha_i, a5, a5
|
|
+
|
|
+ FMAD6 c01, alpha_i, b2, b2
|
|
+ FMAD6 c03, alpha_i, a6, a6
|
|
+
|
|
+ ST b1, 0 * SIZE(C1)
|
|
+ ST b2, 1 * SIZE(C1)
|
|
+
|
|
+ ST a5, 0 * SIZE(C2)
|
|
+ ST a6, 1 * SIZE(C2)
|
|
+
|
|
+#else
|
|
+
|
|
+ ADD1 c01, c06, c01
|
|
+ ADD1 c03, c08, c03
|
|
+ ADD2 c05, c02, c02
|
|
+ ADD2 c07, c04, c04
|
|
+
|
|
+ FMAD5 c01, alpha_r, $f31, b1
|
|
+ FMAD5 c03, alpha_r, $f31, a5
|
|
+
|
|
+ FMAD7 c02, alpha_r, $f31, b2
|
|
+ FMAD7 c04, alpha_r, $f31, a6
|
|
+
|
|
+ FMAD8 c02, alpha_i, b1, b1
|
|
+ FMAD8 c04, alpha_i, a5, a5
|
|
+
|
|
+ FMAD6 c01, alpha_i, b2, b2
|
|
+ FMAD6 c03, alpha_i, a6, a6
|
|
+
|
|
+ ST b1, 0 * SIZE(C1)
|
|
+ ST b2, 1 * SIZE(C1)
|
|
+
|
|
+ ST a5, 0 * SIZE(C2)
|
|
+ ST a6, 1 * SIZE(C2)
|
|
+
|
|
+#if ( defined(LEFT) && defined(TRANSA)) || \
|
|
+ (!defined(LEFT) && !defined(TRANSA))
|
|
+ subl K, KK, TEMP
|
|
+#ifdef LEFT
|
|
+ subl TEMP, 1, TEMP
|
|
+#else
|
|
+ subl TEMP, 2, TEMP
|
|
+#endif
|
|
+
|
|
+ sll TEMP, ZBASE_SHIFT, L
|
|
+ sll TEMP, 1 + ZBASE_SHIFT, TEMP
|
|
+
|
|
+ addl AO, L, AO
|
|
+ addl BO, TEMP,BO
|
|
+#endif
|
|
+
|
|
+#ifdef LEFT
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+ addl C1, 2*SIZE, C1
|
|
+ addl C2, 2*SIZE, C2
|
|
+
|
|
+
|
|
+ .align 4
|
|
+
|
|
+$L09:
|
|
+#if defined(TRMMKERNEL) && !defined(LEFT)
|
|
+ addl KK, 2, KK # nr=2
|
|
+ nop
|
|
+#endif
|
|
+ mov BO, B # Change B to next panel
|
|
+ subl J, 1, J # J--
|
|
+ bgt J, $L01
|
|
+
|
|
+
|
|
+ .align 4
|
|
+$L50:
|
|
+ and N, 1, J
|
|
+ ble J, $L999 # Finish!
|
|
+
|
|
+#if defined(TRMMKERNEL) && defined(LEFT)
|
|
+ mov OFFSET, KK # reset KK
|
|
+#endif
|
|
+
|
|
+ sra M, 3, I # I=M/8
|
|
+ sll K, 1 + ZBASE_SHIFT, PREA
|
|
+
|
|
+ mov C, C1
|
|
+ mov A, AO # Reset A
|
|
+
|
|
+ addl A, PREA, PREA
|
|
+ beq I, $L60 # GEMM_MR=8
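+
+/*
+   N&1 leftover column: the same M blocking as above is reused (8 at $L51,
+   4 at $L60, 2 at $L70, 1 at $L80), but each k step now consumes a single
+   complex value of B (b1,b2) instead of two.
+*/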
|
|
+
|
|
+
|
|
+$L51:
|
|
+#if defined(TRMMKERNEL)
|
|
+#if (defined(LEFT) && defined(TRANSA))\
|
|
+ || (!defined(LEFT) && !defined(TRANSA))
|
|
+ mov B, BO
|
|
+#else
|
|
+ sll KK, 3 + ZBASE_SHIFT,L # mr=8
|
|
+ sll KK, ZBASE_SHIFT,TEMP # nr=1
|
|
+
|
|
+ addl AO, L, AO
|
|
+ addl B, TEMP, BO
|
|
+#endif
|
|
+
|
|
+ fillcs 0(C1)
|
|
+ fillcs 4*SIZE(C1)
|
|
+ fillcs 8*SIZE(C1)
|
|
+ fillcs 12*SIZE(C1)
|
|
+ fillcs 16*SIZE(C1)
|
|
+
|
|
+ vcpys $f31,$f31,c01 # Clear result regs
|
|
+ vcpys $f31,$f31,c02
|
|
+
|
|
+ vcpys $f31,$f31,c05
|
|
+ vcpys $f31,$f31,c06
|
|
+
|
|
+ vcpys $f31,$f31,c09
|
|
+ vcpys $f31,$f31,c10
|
|
+
|
|
+ vcpys $f31,$f31,c13
|
|
+ vcpys $f31,$f31,c14
|
|
+
|
|
+ LDDE b1, 0 * SIZE(BO) # B1R
|
|
+ LDDE b2, 1 * SIZE(BO) # B1I
|
|
+
|
|
+ VLD a1, 0 * SIZE(AO) # A1, A2
|
|
+ VLD a2, 4 * SIZE(AO) # A3, A4
|
|
+ VLD a3, 8 * SIZE(AO) # A5, A6
|
|
+ VLD a4,12 * SIZE(AO) # A7, A8
|
|
+
|
|
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
|
+ subl K, KK, TEMP
|
|
+#elif defined(LEFT)
|
|
+ addl KK, 8, TEMP # mr=8
|
|
+#else
|
|
+ addl KK, 1, TEMP # nr=1
|
|
+#endif
|
|
+ sra TEMP, 1, L
|
|
+ ble L, $L55
|
|
+
|
|
+#else
|
|
+ mov B, BO # Set B, (block A x panel Bj)
|
|
+ sra K, 1, L # Unroll K as 2
|
|
+
|
|
+ fillcs 0(C1)
|
|
+ fillcs 4*SIZE(C1)
|
|
+ fillcs 8*SIZE(C1)
|
|
+ fillcs 12*SIZE(C1)
|
|
+ fillcs 16*SIZE(C1)
|
|
+
|
|
+ vcpys $f31,$f31,c01 # Clear result regs
|
|
+ vcpys $f31,$f31,c02
|
|
+
|
|
+ vcpys $f31,$f31,c05
|
|
+ vcpys $f31,$f31,c06
|
|
+
|
|
+ vcpys $f31,$f31,c09
|
|
+ vcpys $f31,$f31,c10
|
|
+
|
|
+ vcpys $f31,$f31,c13
|
|
+ vcpys $f31,$f31,c14
|
|
+
|
|
+ LDDE b1, 0 * SIZE(BO) # B1R
|
|
+ LDDE b2, 1 * SIZE(BO) # B1I
|
|
+
|
|
+ VLD a1, 0 * SIZE(AO) # A1, A2
|
|
+ VLD a2, 4 * SIZE(AO) # A3, A4
|
|
+ VLD a3, 8 * SIZE(AO) # A5, A6
|
|
+ VLD a4,12 * SIZE(AO) # A7, A8
|
|
+
|
|
+ ble L, $L55
|
|
+#endif
|
|
+
|
|
+ .align 4
|
|
+$L52:
|
|
+ addl AO, 16*SIZE, AO # AO+=8mr*1kr*2px*SIZE
|
|
+ VMAD a1,b1,c01,c01 # C11(ac,bc), C21(ac,bc)
|
|
+ VMAD a1,b2,c02,c02 # C11(ad,bd), C21(ad,bd)
|
|
+
|
|
+ LDDE b5, 2 * SIZE(BO) # next B1R
|
|
+ LDDE b6, 3 * SIZE(BO) # next B1I
|
|
+
|
|
+ addl BO, 4*SIZE, BO # BO+=1nr*2kr*2cpx*SIZE
|
|
+ VMAD a2,b1,c05,c05 # C31, C41
|
|
+ VMAD a2,b2,c06,c06 # C31, C41
|
|
+
|
|
+ VLD a5, 0 * SIZE(AO) # next A1, A2, a5==a0
|
|
+ VLD a6, 4 * SIZE(AO) # next A3, A4
|
|
+ VLD a7, 8 * SIZE(AO) # next A5, A6
|
|
+ VLD a8,12 * SIZE(AO) # next A7, A8
|
|
+
|
|
+ VMAD a3,b1,c09,c09 # C51, C61
|
|
+ VMAD a3,b2,c10,c10 # C51, C61
|
|
+
|
|
+ fillcs 0(PREA)
|
|
+ VMAD a4,b1,c13,c13 # C71, C81
|
|
+ VMAD a4,b2,c14,c14 # C71, C81
|
|
+
|
|
+ subl L, 1, L #
|
|
+
|
|
+ addl AO, 16*SIZE, AO # AO+=8mr*1kr*2px*SIZE
|
|
+ VMAD a5,b5,c01,c01
|
|
+ VMAD a5,b6,c02,c02
|
|
+
|
|
+ addl PREA, 16*SIZE, PREA
|
|
+ LDDE b1, 0 * SIZE(BO)
|
|
+ LDDE b2, 1 * SIZE(BO)
|
|
+
|
|
+ VMAD a6,b5,c05,c05
|
|
+ VMAD a6,b6,c06,c06
|
|
+
|
|
+ VLD a1, 0 * SIZE(AO)
|
|
+ VLD a2, 4 * SIZE(AO)
|
|
+ VLD a3, 8 * SIZE(AO)
|
|
+ VLD a4,12 * SIZE(AO)
|
|
+
|
|
+ VMAD a7,b5,c09,c09
|
|
+ VMAD a7,b6,c10,c10
|
|
+
|
|
+ fillcs 0(PREA)
|
|
+ VMAD a8,b5,c13,c13
|
|
+ VMAD a8,b6,c14,c14
|
|
+
|
|
+ addl PREA, 16*SIZE, PREA
|
|
+ bne L, $L52 # continue K
|
|
+
|
|
+$L55:
|
|
+ LD alpha_r, ALPHA_R # $f30==b8
|
|
+#ifndef TRMMKERNEL
|
|
+ blbc K, $L58 # if(K&1)
|
|
+#else
|
|
+ blbc TEMP, $L58
|
|
+#endif
|
|
+
|
|
+$L56:
|
|
+ addl AO, 16*SIZE, AO # AO+=8mr*1kr*2px*SIZE
|
|
+ VMAD a1,b1,c01,c01 # C11R C21R
|
|
+ VMAD a1,b2,c02,c02 # C11I C21I
|
|
+
|
|
+ addl BO, 2*SIZE, BO
|
|
+ VMAD a2,b1,c05,c05 # C31R C41R
|
|
+ VMAD a2,b2,c06,c06 # C31I C41I
|
|
+
|
|
+ VMAD a3,b1,c09,c09 # C51R C61R
|
|
+ VMAD a3,b2,c10,c10 # C51I C61I
|
|
+
|
|
+ VMAD a4,b1,c13,c13 # C71R C81R
|
|
+ VMAD a4,b2,c14,c14 # C71I C81I
|
|
+
|
|
+$L58: # Write back
|
|
+ LD alpha_i, ALPHA_I # $f29==b7
|
|
+#ifndef TRMMKERNEL
|
|
+ vextf c01, 0, a1 # a1=C11R_ac
|
|
+ vextf c01, 1, a2 # a2=C11I_bc
|
|
+ vextf c01, 2, a3 # a3=C21R_ac
|
|
+ vextf c01, 3, a4 # a4=C21I_bc
|
|
+
|
|
+ vextf c02, 0, b1 # b1=C11I_ad
|
|
+ vextf c02, 1, b2 # b2=C11R_bd
|
|
+ vextf c02, 2, b3 # b3=C21I_ad
|
|
+ vextf c02, 3, b4 # b4=C21R_bd
|
|
+
|
|
+ ADD1 a1, b2, b5 # ac '+' bd
|
|
+ ADD1 a3, b4, a6
|
|
+ ADD2 a2, b1, a7 # bc '+' ad
|
|
+ ADD2 a4, b3, a8
|
|
+
|
|
+ LD a1, 0 * SIZE(C1)
|
|
+ LD a2, 1 * SIZE(C1)
|
|
+ LD a3, 2 * SIZE(C1)
|
|
+ LD a4, 3 * SIZE(C1)
|
|
+
|
|
+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1
|
|
+ FMAD5 a6, alpha_r, a3, a3
|
|
+ FMAD7 a7, alpha_r, a2, a2
|
|
+ FMAD7 a8, alpha_r, a4, a4
|
|
+
|
|
+ FMAD8 a7, alpha_i, a1, b4
|
|
+ FMAD8 a8, alpha_i, a3, b6
|
|
+ FMAD6 b5, alpha_i, a2, c01
|
|
+ FMAD6 a6, alpha_i, a4, c02
|
|
+
|
|
+ ST b4, 0 * SIZE(C1)
|
|
+ ST c01, 1 * SIZE(C1)
|
|
+ ST b6, 2 * SIZE(C1)
|
|
+ ST c02, 3 * SIZE(C1)
|
|
+
|
|
+ vextf c05, 0, a1 # a1=C11R_ac
|
|
+ vextf c05, 1, a2 # a2=C11I_bc
|
|
+ vextf c05, 2, a3 # a3=C21R_ac
|
|
+ vextf c05, 3, a4 # a4=C21I_bc
|
|
+
|
|
+ vextf c06, 0, b1 # b1=C11I_ad
|
|
+ vextf c06, 1, b2 # b2=C11R_bd
|
|
+ vextf c06, 2, b3 # b3=C21I_ad
|
|
+ vextf c06, 3, b4 # b4=C21R_bd
|
|
+
|
|
+ ADD1 a1, b2, b5 # ac '+' bd
|
|
+ ADD1 a3, b4, a6
|
|
+ ADD2 a2, b1, a7 # bc '+' ad
|
|
+ ADD2 a4, b3, a8
|
|
+
|
|
+ LD a1, 4 * SIZE(C1)
|
|
+ LD a2, 5 * SIZE(C1)
|
|
+ LD a3, 6 * SIZE(C1)
|
|
+ LD a4, 7 * SIZE(C1)
|
|
+
|
|
+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1
|
|
+ FMAD5 a6, alpha_r, a3, a3
|
|
+ FMAD7 a7, alpha_r, a2, a2
|
|
+ FMAD7 a8, alpha_r, a4, a4
|
|
+
|
|
+ FMAD8 a7, alpha_i, a1, b4
|
|
+ FMAD8 a8, alpha_i, a3, b6
|
|
+ FMAD6 b5, alpha_i, a2, c01
|
|
+ FMAD6 a6, alpha_i, a4, c02
|
|
+
|
|
+ ST b4, 4 * SIZE(C1)
|
|
+ ST c01, 5 * SIZE(C1)
|
|
+ ST b6, 6 * SIZE(C1)
|
|
+ ST c02, 7 * SIZE(C1)
|
|
+
|
|
+ vextf c09, 0, a1 # a1=C11R_ac
|
|
+ vextf c09, 1, a2 # a2=C11I_bc
|
|
+ vextf c09, 2, a3 # a3=C21R_ac
|
|
+ vextf c09, 3, a4 # a4=C21I_bc
|
|
+
|
|
+ vextf c10, 0, b1 # b1=C11I_ad
|
|
+ vextf c10, 1, b2 # b2=C11R_bd
|
|
+ vextf c10, 2, b3 # b3=C21I_ad
|
|
+ vextf c10, 3, b4 # b4=C21R_bd
|
|
+
|
|
+ ADD1 a1, b2, b5 # ac '+' bd
|
|
+ ADD1 a3, b4, a6
|
|
+ ADD2 a2, b1, a7 # bc '+' ad
|
|
+ ADD2 a4, b3, a8
|
|
+
|
|
+ LD a1, 8 * SIZE(C1)
|
|
+ LD a2, 9 * SIZE(C1)
|
|
+ LD a3, 10 * SIZE(C1)
|
|
+ LD a4, 11 * SIZE(C1)
|
|
+
|
|
+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1
|
|
+ FMAD5 a6, alpha_r, a3, a3
|
|
+ FMAD7 a7, alpha_r, a2, a2
|
|
+ FMAD7 a8, alpha_r, a4, a4
|
|
+
|
|
+ FMAD8 a7, alpha_i, a1, b4
|
|
+ FMAD8 a8, alpha_i, a3, b6
|
|
+ FMAD6 b5, alpha_i, a2, c01
|
|
+ FMAD6 a6, alpha_i, a4, c02
|
|
+
|
|
+ ST b4, 8 * SIZE(C1)
|
|
+ ST c01, 9 * SIZE(C1)
|
|
+ ST b6, 10 * SIZE(C1)
|
|
+ ST c02, 11 * SIZE(C1)
|
|
+
|
|
+ vextf c13, 0, a1 # a1=C11R_ac
|
|
+ vextf c13, 1, a2 # a2=C11I_bc
|
|
+ vextf c13, 2, a3 # a3=C21R_ac
|
|
+ vextf c13, 3, a4 # a4=C21I_bc
|
|
+
|
|
+ vextf c14, 0, b1 # b1=C11I_ad
|
|
+ vextf c14, 1, b2 # b2=C11R_bd
|
|
+ vextf c14, 2, b3 # b3=C21I_ad
|
|
+ vextf c14, 3, b4 # b4=C21R_bd
|
|
+
|
|
+ ADD1 a1, b2, b5 # ac '+' bd
|
|
+ ADD1 a3, b4, a6
|
|
+ ADD2 a2, b1, a7 # bc '+' ad
|
|
+ ADD2 a4, b3, a8
|
|
+
|
|
+ LD a1, 12 * SIZE(C1)
|
|
+ LD a2, 13 * SIZE(C1)
|
|
+ LD a3, 14 * SIZE(C1)
|
|
+ LD a4, 15 * SIZE(C1)
|
|
+
|
|
+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1
|
|
+ FMAD5 a6, alpha_r, a3, a3
|
|
+ FMAD7 a7, alpha_r, a2, a2
|
|
+ FMAD7 a8, alpha_r, a4, a4
|
|
+
|
|
+ FMAD8 a7, alpha_i, a1, b4
|
|
+ FMAD8 a8, alpha_i, a3, b6
|
|
+ FMAD6 b5, alpha_i, a2, c01
|
|
+ FMAD6 a6, alpha_i, a4, c02
|
|
+
|
|
+ ST b4, 12 * SIZE(C1)
|
|
+ ST c01, 13 * SIZE(C1)
|
|
+ ST b6, 14 * SIZE(C1)
|
|
+ ST c02, 15 * SIZE(C1)
|
|
+
|
|
+#else
|
|
+
|
|
+ vextf c01, 0, a1 # a1=C11R_ac
|
|
+ vextf c01, 1, a2 # a2=C11I_bc
|
|
+ vextf c01, 2, a3 # a3=C21R_ac
|
|
+ vextf c01, 3, a4 # a4=C21I_bc
|
|
+
|
|
+ vextf c02, 0, b1 # b1=C11I_ad
|
|
+ vextf c02, 1, b2 # b2=C11R_bd
|
|
+ vextf c02, 2, b3 # b3=C21I_ad
|
|
+ vextf c02, 3, b4 # b4=C21R_bd
|
|
+
|
|
+ ADD1 a1, b2, b5 # ac '+' bd
|
|
+ ADD1 a3, b4, a6
|
|
+ ADD2 a2, b1, a7 # bc '+' ad
|
|
+ ADD2 a4, b3, a8
|
|
+
|
|
+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1
|
|
+ FMAD5 a6, alpha_r, $f31, a3
|
|
+ FMAD7 a7, alpha_r, $f31, a2
|
|
+ FMAD7 a8, alpha_r, $f31, a4
|
|
+
|
|
+ FMAD8 a7, alpha_i, a1, b4
|
|
+ FMAD8 a8, alpha_i, a3, b6
|
|
+ FMAD6 b5, alpha_i, a2, c01
|
|
+ FMAD6 a6, alpha_i, a4, c02
|
|
+
|
|
+ ST b4, 0 * SIZE(C1)
|
|
+ ST c01, 1 * SIZE(C1)
|
|
+ ST b6, 2 * SIZE(C1)
|
|
+ ST c02, 3 * SIZE(C1)
|
|
+
|
|
+ vextf c05, 0, a1 # a1=C11R_ac
|
|
+ vextf c05, 1, a2 # a2=C11I_bc
|
|
+ vextf c05, 2, a3 # a3=C21R_ac
|
|
+ vextf c05, 3, a4 # a4=C21I_bc
|
|
+
|
|
+ vextf c06, 0, b1 # b1=C11I_ad
|
|
+ vextf c06, 1, b2 # b2=C11R_bd
|
|
+ vextf c06, 2, b3 # b3=C21I_ad
|
|
+ vextf c06, 3, b4 # b4=C21R_bd
|
|
+
|
|
+ ADD1 a1, b2, b5 # ac '+' bd
|
|
+ ADD1 a3, b4, a6
|
|
+ ADD2 a2, b1, a7 # bc '+' ad
|
|
+ ADD2 a4, b3, a8
|
|
+
|
|
+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1
|
|
+ FMAD5 a6, alpha_r, $f31, a3
|
|
+ FMAD7 a7, alpha_r, $f31, a2
|
|
+ FMAD7 a8, alpha_r, $f31, a4
|
|
+
|
|
+ FMAD8 a7, alpha_i, a1, b4
|
|
+ FMAD8 a8, alpha_i, a3, b6
|
|
+ FMAD6 b5, alpha_i, a2, c01
|
|
+ FMAD6 a6, alpha_i, a4, c02
|
|
+
|
|
+ ST b4, 4 * SIZE(C1)
|
|
+ ST c01, 5 * SIZE(C1)
|
|
+ ST b6, 6 * SIZE(C1)
|
|
+ ST c02, 7 * SIZE(C1)
|
|
+
|
|
+ vextf c09, 0, a1 # a1=C11R_ac
|
|
+ vextf c09, 1, a2 # a2=C11I_bc
|
|
+ vextf c09, 2, a3 # a3=C21R_ac
|
|
+ vextf c09, 3, a4 # a4=C21I_bc
|
|
+
|
|
+ vextf c10, 0, b1 # b1=C11I_ad
|
|
+ vextf c10, 1, b2 # b2=C11R_bd
|
|
+ vextf c10, 2, b3 # b3=C21I_ad
|
|
+ vextf c10, 3, b4 # b4=C21R_bd
|
|
+
|
|
+ ADD1 a1, b2, b5 # ac '+' bd
|
|
+ ADD1 a3, b4, a6
|
|
+ ADD2 a2, b1, a7 # bc '+' ad
|
|
+ ADD2 a4, b3, a8
|
|
+
|
|
+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1
|
|
+ FMAD5 a6, alpha_r, $f31, a3
|
|
+ FMAD7 a7, alpha_r, $f31, a2
|
|
+ FMAD7 a8, alpha_r, $f31, a4
|
|
+
|
|
+ FMAD8 a7, alpha_i, a1, b4
|
|
+ FMAD8 a8, alpha_i, a3, b6
|
|
+ FMAD6 b5, alpha_i, a2, c01
|
|
+ FMAD6 a6, alpha_i, a4, c02
|
|
+
|
|
+ ST b4, 8 * SIZE(C1)
|
|
+ ST c01, 9 * SIZE(C1)
|
|
+ ST b6, 10 * SIZE(C1)
|
|
+ ST c02, 11 * SIZE(C1)
|
|
+
|
|
+ vextf c13, 0, a1 # a1=C11R_ac
|
|
+ vextf c13, 1, a2 # a2=C11I_bc
|
|
+ vextf c13, 2, a3 # a3=C21R_ac
|
|
+ vextf c13, 3, a4 # a4=C21I_bc
|
|
+
|
|
+ vextf c14, 0, b1 # b1=C11I_ad
|
|
+ vextf c14, 1, b2 # b2=C11R_bd
|
|
+ vextf c14, 2, b3 # b3=C21I_ad
|
|
+ vextf c14, 3, b4 # b4=C21R_bd
|
|
+
|
|
+ ADD1 a1, b2, b5 # ac '+' bd
|
|
+ ADD1 a3, b4, a6
|
|
+ ADD2 a2, b1, a7 # bc '+' ad
|
|
+ ADD2 a4, b3, a8
|
|
+
|
|
+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1
|
|
+ FMAD5 a6, alpha_r, $f31, a3
|
|
+ FMAD7 a7, alpha_r, $f31, a2
|
|
+ FMAD7 a8, alpha_r, $f31, a4
|
|
+
|
|
+ FMAD8 a7, alpha_i, a1, b4
|
|
+ FMAD8 a8, alpha_i, a3, b6
|
|
+ FMAD6 b5, alpha_i, a2, c01
|
|
+ FMAD6 a6, alpha_i, a4, c02
|
|
+
|
|
+ ST b4, 12 * SIZE(C1)
|
|
+ ST c01, 13 * SIZE(C1)
|
|
+ ST b6, 14 * SIZE(C1)
|
|
+ ST c02, 15 * SIZE(C1)
|
|
+
|
|
+#if ( defined(LEFT) && defined(TRANSA)) || \
|
|
+ (!defined(LEFT) && !defined(TRANSA))
|
|
+ subl K, KK, TEMP
|
|
+#ifdef LEFT
|
|
+ subl TEMP, 8, TEMP
|
|
+#else
|
|
+ subl TEMP, 1, TEMP
|
|
+#endif
|
|
+
|
|
+ sll TEMP, 3 + ZBASE_SHIFT,L
|
|
+ sll TEMP, ZBASE_SHIFT,TEMP
|
|
+
|
|
+ addl AO, L, AO
|
|
+ addl BO, TEMP, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LEFT
|
|
+ addl KK, 8, KK
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+ jmp $L999
|
|
+
|
|
+
|
|
+ .align 4
|
|
+$L60:
|
|
+ and M, 4, I
|
|
+ ble I, $L70
|
|
+
|
|
+#if defined(TRMMKERNEL)
|
|
+#if (defined(LEFT) && defined(TRANSA))\
|
|
+ || (!defined(LEFT) && !defined(TRANSA))
|
|
+ mov B, BO
|
|
+#else
|
|
+ sll KK, 2 + ZBASE_SHIFT,L # mr=4
|
|
+ sll KK, ZBASE_SHIFT,TEMP # nr=1
|
|
+
|
|
+ addl AO, L, AO
|
|
+ addl B, TEMP, BO
|
|
+#endif
|
|
+
|
|
+ fillcs 0(C1)
|
|
+ fillcs 4*SIZE(C1)
|
|
+ fillcs 8*SIZE(C1)
|
|
+
|
|
+ vcpys $f31,$f31,c01 # Clear result regs
|
|
+ vcpys $f31,$f31,c02
|
|
+
|
|
+ vcpys $f31,$f31,c05
|
|
+ vcpys $f31,$f31,c06
|
|
+
|
|
+ LDDE b1, 0 * SIZE(BO) # B1R
|
|
+ LDDE b2, 1 * SIZE(BO) # B1I
|
|
+
|
|
+ VLD a1, 0 * SIZE(AO) # A1, A2
|
|
+ VLD a2, 4 * SIZE(AO) # A3, A4
|
|
+
|
|
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
|
+ subl K, KK, TEMP
|
|
+#elif defined(LEFT)
|
|
+ addl KK, 4, TEMP # mr=4
|
|
+#else
|
|
+ addl KK, 1, TEMP # nr=1
|
|
+#endif
|
|
+ sra TEMP, 1, L
|
|
+ ble L, $L65
|
|
+
|
|
+#else
|
|
+
|
|
+ mov B, BO # Set B, (block A x panel Bj)
|
|
+ sra K, 1, L # Unroll K as 2
|
|
+
|
|
+ fillcs 0(C1)
|
|
+ fillcs 4*SIZE(C1)
|
|
+ fillcs 8*SIZE(C1)
|
|
+
|
|
+ vcpys $f31,$f31,c01 # Clear result regs
|
|
+ vcpys $f31,$f31,c02
|
|
+
|
|
+ vcpys $f31,$f31,c05
|
|
+ vcpys $f31,$f31,c06
|
|
+
|
|
+ LDDE b1, 0 * SIZE(BO) # B1R
|
|
+ LDDE b2, 1 * SIZE(BO) # B1I
|
|
+
|
|
+ VLD a1, 0 * SIZE(AO) # A1, A2
|
|
+ VLD a2, 4 * SIZE(AO) # A3, A4
|
|
+
|
|
+ ble L, $L65
|
|
+#endif
|
|
+
|
|
+ .align 4
|
|
+$L62:
|
|
+ VMAD a1,b1,c01,c01 # C11(ac,bc), C21(ac,bc)
|
|
+ VMAD a1,b2,c02,c02 # C11(ad,bd), C21(ad,bd)
|
|
+
|
|
+ LDDE b5, 2 * SIZE(BO) # next B1R
|
|
+ LDDE b6, 3 * SIZE(BO) # next B1I
|
|
+
|
|
+	addl	BO, 4*SIZE, BO		# BO+=1nr*2kr*2cpx*SIZE
|
|
+ VMAD a2,b1,c05,c05 # C31, C41
|
|
+ VMAD a2,b2,c06,c06 # C31, C41
|
|
+
|
|
+ fillcs 0(PREA)
|
|
+ VLD a5, 8 * SIZE(AO) # next A1, A2, a5==a0
|
|
+ VLD a6, 12 * SIZE(AO) # next A3, A4
|
|
+
|
|
+ subl L, 1, L #
|
|
+
|
|
+ addl AO, 16*SIZE, AO # AO+=4mr*2kr*2px*SIZE
|
|
+ VMAD a5,b5,c01,c01
|
|
+ VMAD a5,b6,c02,c02
|
|
+
|
|
+ addl PREA, 16*SIZE, PREA
|
|
+ LDDE b1, 0 * SIZE(BO)
|
|
+ LDDE b2, 1 * SIZE(BO)
|
|
+
|
|
+ fillcs 0(PREA)
|
|
+ VMAD a6,b5,c05,c05
|
|
+ VMAD a6,b6,c06,c06
|
|
+
|
|
+ VLD a1, 0 * SIZE(AO)
|
|
+ VLD a2, 4 * SIZE(AO)
|
|
+
|
|
+ addl PREA, 16*SIZE, PREA
|
|
+ bne L, $L62 # continue K
|
|
+
|
|
+$L65:
|
|
+ LD alpha_r, ALPHA_R # $f30==b8
|
|
+#ifndef TRMMKERNEL
|
|
+ blbc K, $L68 # if(K&1)
|
|
+#else
|
|
+ blbc TEMP, $L68
|
|
+#endif
|
|
+
|
|
+$L66:
|
|
+ addl AO, 8*SIZE, AO # AO+=4mr*1kr*2px*SIZE
|
|
+ VMAD a1,b1,c01,c01 # C11R C21R
|
|
+ VMAD a1,b2,c02,c02 # C11I C21I
|
|
+
|
|
+ addl BO, 2*SIZE, BO
|
|
+ VMAD a2,b1,c05,c05 # C31R C41R
|
|
+ VMAD a2,b2,c06,c06 # C31I C41I
|
|
+
|
|
+$L68: # Write back
|
|
+ LD alpha_i, ALPHA_I # $f29==b7
|
|
+#ifndef TRMMKERNEL
|
|
+ vextf c01, 0, a1 # a1=C11R_ac
|
|
+ vextf c01, 1, a2 # a2=C11I_bc
|
|
+ vextf c01, 2, a3 # a3=C21R_ac
|
|
+ vextf c01, 3, a4 # a4=C21I_bc
|
|
+
|
|
+ vextf c02, 0, b1 # b1=C11I_ad
|
|
+ vextf c02, 1, b2 # b2=C11R_bd
|
|
+ vextf c02, 2, b3 # b3=C21I_ad
|
|
+ vextf c02, 3, b4 # b4=C21R_bd
|
|
+
|
|
+ ADD1 a1, b2, b5 # ac '+' bd
|
|
+ ADD1 a3, b4, a6
|
|
+ ADD2 a2, b1, a7 # bc '+' ad
|
|
+ ADD2 a4, b3, a8
|
|
+
|
|
+ LD a1, 0 * SIZE(C1)
|
|
+ LD a2, 1 * SIZE(C1)
|
|
+ LD a3, 2 * SIZE(C1)
|
|
+ LD a4, 3 * SIZE(C1)
|
|
+
|
|
+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1
|
|
+ FMAD5 a6, alpha_r, a3, a3
|
|
+ FMAD7 a7, alpha_r, a2, a2
|
|
+ FMAD7 a8, alpha_r, a4, a4
|
|
+
|
|
+ FMAD8 a7, alpha_i, a1, b4
|
|
+ FMAD8 a8, alpha_i, a3, b6
|
|
+ FMAD6 b5, alpha_i, a2, c01
|
|
+ FMAD6 a6, alpha_i, a4, c02
|
|
+
|
|
+ ST b4, 0 * SIZE(C1)
|
|
+ ST c01, 1 * SIZE(C1)
|
|
+ ST b6, 2 * SIZE(C1)
|
|
+ ST c02, 3 * SIZE(C1)
|
|
+
|
|
+ vextf c05, 0, a1 # a1=C11R_ac
|
|
+ vextf c05, 1, a2 # a2=C11I_bc
|
|
+ vextf c05, 2, a3 # a3=C21R_ac
|
|
+ vextf c05, 3, a4 # a4=C21I_bc
|
|
+
|
|
+ vextf c06, 0, b1 # b1=C11I_ad
|
|
+ vextf c06, 1, b2 # b2=C11R_bd
|
|
+ vextf c06, 2, b3 # b3=C21I_ad
|
|
+ vextf c06, 3, b4 # b4=C21R_bd
|
|
+
|
|
+ ADD1 a1, b2, b5 # ac '+' bd
|
|
+ ADD1 a3, b4, a6
|
|
+ ADD2 a2, b1, a7 # bc '+' ad
|
|
+ ADD2 a4, b3, a8
|
|
+
|
|
+ LD a1, 4 * SIZE(C1)
|
|
+ LD a2, 5 * SIZE(C1)
|
|
+ LD a3, 6 * SIZE(C1)
|
|
+ LD a4, 7 * SIZE(C1)
|
|
+
|
|
+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1
|
|
+ FMAD5 a6, alpha_r, a3, a3
|
|
+ FMAD7 a7, alpha_r, a2, a2
|
|
+ FMAD7 a8, alpha_r, a4, a4
|
|
+
|
|
+ FMAD8 a7, alpha_i, a1, b4
|
|
+ FMAD8 a8, alpha_i, a3, b6
|
|
+ FMAD6 b5, alpha_i, a2, c01
|
|
+ FMAD6 a6, alpha_i, a4, c02
|
|
+
|
|
+ ST b4, 4 * SIZE(C1)
|
|
+ ST c01, 5 * SIZE(C1)
|
|
+ ST b6, 6 * SIZE(C1)
|
|
+ ST c02, 7 * SIZE(C1)
|
|
+
|
|
+#else
|
|
+
|
|
+ vextf c01, 0, a1 # a1=C11R_ac
|
|
+ vextf c01, 1, a2 # a2=C11I_bc
|
|
+ vextf c01, 2, a3 # a3=C21R_ac
|
|
+ vextf c01, 3, a4 # a4=C21I_bc
|
|
+
|
|
+ vextf c02, 0, b1 # b1=C11I_ad
|
|
+ vextf c02, 1, b2 # b2=C11R_bd
|
|
+ vextf c02, 2, b3 # b3=C21I_ad
|
|
+ vextf c02, 3, b4 # b4=C21R_bd
|
|
+
|
|
+ ADD1 a1, b2, b5 # ac '+' bd
|
|
+ ADD1 a3, b4, a6
|
|
+ ADD2 a2, b1, a7 # bc '+' ad
|
|
+ ADD2 a4, b3, a8
|
|
+
|
|
+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1
|
|
+ FMAD5 a6, alpha_r, $f31, a3
|
|
+ FMAD7 a7, alpha_r, $f31, a2
|
|
+ FMAD7 a8, alpha_r, $f31, a4
|
|
+
|
|
+ FMAD8 a7, alpha_i, a1, b4
|
|
+ FMAD8 a8, alpha_i, a3, b6
|
|
+ FMAD6 b5, alpha_i, a2, c01
|
|
+ FMAD6 a6, alpha_i, a4, c02
|
|
+
|
|
+ ST b4, 0 * SIZE(C1)
|
|
+ ST c01, 1 * SIZE(C1)
|
|
+ ST b6, 2 * SIZE(C1)
|
|
+ ST c02, 3 * SIZE(C1)
|
|
+
|
|
+ vextf c05, 0, a1 # a1=C11R_ac
|
|
+ vextf c05, 1, a2 # a2=C11I_bc
|
|
+ vextf c05, 2, a3 # a3=C21R_ac
|
|
+ vextf c05, 3, a4 # a4=C21I_bc
|
|
+
|
|
+ vextf c06, 0, b1 # b1=C11I_ad
|
|
+ vextf c06, 1, b2 # b2=C11R_bd
|
|
+ vextf c06, 2, b3 # b3=C21I_ad
|
|
+ vextf c06, 3, b4 # b4=C21R_bd
|
|
+
|
|
+ ADD1 a1, b2, b5 # ac '+' bd
|
|
+ ADD1 a3, b4, a6
|
|
+ ADD2 a2, b1, a7 # bc '+' ad
|
|
+ ADD2 a4, b3, a8
|
|
+
|
|
+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1
|
|
+ FMAD5 a6, alpha_r, $f31, a3
|
|
+ FMAD7 a7, alpha_r, $f31, a2
|
|
+ FMAD7 a8, alpha_r, $f31, a4
|
|
+
|
|
+ FMAD8 a7, alpha_i, a1, b4
|
|
+ FMAD8 a8, alpha_i, a3, b6
|
|
+ FMAD6 b5, alpha_i, a2, c01
|
|
+ FMAD6 a6, alpha_i, a4, c02
|
|
+
|
|
+ ST b4, 4 * SIZE(C1)
|
|
+ ST c01, 5 * SIZE(C1)
|
|
+ ST b6, 6 * SIZE(C1)
|
|
+ ST c02, 7 * SIZE(C1)
|
|
+
|
|
+
|
|
+#if ( defined(LEFT) && defined(TRANSA)) || \
|
|
+ (!defined(LEFT) && !defined(TRANSA))
|
|
+ subl K, KK,TEMP
|
|
+#ifdef LEFT
|
|
+ subl TEMP, 4, TEMP # mr=4
|
|
+#else
|
|
+ subl TEMP, 1, TEMP # nr=1
|
|
+#endif
|
|
+
|
|
+ sll TEMP, 2 + ZBASE_SHIFT, L
|
|
+ sll TEMP, ZBASE_SHIFT,TEMP
|
|
+
|
|
+ addl AO, L, AO
|
|
+ addl BO,TEMP, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LEFT
|
|
+ addl KK,4,KK
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+ addl C1, 8*SIZE, C1
|
|
+
|
|
+
|
|
+ .align 4
|
|
+$L70:
|
|
+ and M, 2, I # I=M&2
|
|
+ ble I, $L80
|
|
+
|
|
+#if defined(TRMMKERNEL)
|
|
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
|
+ mov B, BO
|
|
+ nop
|
|
+#else
|
|
+ sll KK, 1 + ZBASE_SHIFT, L # mr=2
|
|
+ sll KK, ZBASE_SHIFT,TEMP # nr=1
|
|
+
|
|
+ addl AO, L, AO
|
|
+ addl B, TEMP, BO
|
|
+#endif
|
|
+
|
|
+ fillcs 0*SIZE(C1)
|
|
+ fillcs 4*SIZE(C1)
|
|
+
|
|
+ fclr c01
|
|
+ fclr c02 # CLEAR 8 register
|
|
+ fclr c03
|
|
+ fclr c04
|
|
+ fclr c05
|
|
+ fclr c06
|
|
+ fclr c07
|
|
+ fclr c08
|
|
+
|
|
+ LD b1, 0*SIZE(BO) # b1 real part
|
|
+ LD b2, 1*SIZE(BO) # b1 image part
|
|
+
|
|
+ LD a1, 0*SIZE(AO) # a1 real part
|
|
+ LD a2, 1*SIZE(AO) # a1 image part
|
|
+ LD a3, 2*SIZE(AO) # a2 real part
|
|
+ LD a4, 3*SIZE(AO) # a2 image part
|
|
+
|
|
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
|
+ subl K, KK, TEMP
|
|
+#elif defined(LEFT)
|
|
+ addl KK, 2, TEMP # mr=2
|
|
+#else
|
|
+ addl KK, 1, TEMP # nr=1
|
|
+#endif
|
|
+ sra TEMP, 1, L
|
|
+ ble L, $L75
|
|
+
|
|
+#else
|
|
+ mov B, BO # Set B, (block A x panel Bj)
|
|
+ sra K, 1, L # Unroll K as 2
|
|
+
|
|
+ fillcs 0*SIZE(C1)
|
|
+ fillcs 4*SIZE(C1)
|
|
+
|
|
+ fclr c01
|
|
+ fclr c02 # CLEAR 8 register
|
|
+ fclr c03
|
|
+ fclr c04
|
|
+ fclr c05
|
|
+ fclr c06
|
|
+ fclr c07
|
|
+ fclr c08
|
|
+
|
|
+ LD b1, 0*SIZE(BO) # b1 real part
|
|
+ LD b2, 1*SIZE(BO) # b1 image part
|
|
+
|
|
+ LD a1, 0*SIZE(AO) # a1 real part
|
|
+ LD a2, 1*SIZE(AO) # a1 image part
|
|
+ LD a3, 2*SIZE(AO) # a2 real part
|
|
+ LD a4, 3*SIZE(AO) # a2 image part
|
|
+
|
|
+ ble L, $L75
|
|
+#endif
|
|
+
|
|
+ .align 4
|
|
+$L72:
|
|
+ MAD a1,b1,c01,c01 # C11 real part
|
|
+ MAD a1,b2,c02,c02 # C11 imag part
|
|
+
|
|
+ LD b5, 2 * SIZE(BO) # next B1R
|
|
+ LD b6, 3 * SIZE(BO) # next B1I
|
|
+
|
|
+ LD a5, 4 * SIZE(AO) # next A1-A4 real part
|
|
+ LD a6, 5 * SIZE(AO) # next A1-A4 image part
|
|
+ LD a7, 6 * SIZE(AO)
|
|
+ LD a8, 7 * SIZE(AO)
|
|
+
|
|
+ addl BO, 4*SIZE, BO # BO+=1nr*2kr*2cpx*SIZE
|
|
+ MAD a2,b1,c03,c03 # C11 image part
|
|
+ MAD a2,b2,c04,c04 # C11 real part
|
|
+
|
|
+ MAD a3,b1,c05,c05 # C12 real part
|
|
+ MAD a3,b2,c06,c06 # C12 imag part
|
|
+
|
|
+ MAD a4,b1,c07,c07 # C12 image part
|
|
+ MAD a4,b2,c08,c08 # C12 real part
|
|
+
|
|
+ subl L, 1, L #
|
|
+
|
|
+	addl	AO, 8*SIZE, AO  # AO+=2mr*2kr*2cpx*SIZE
|
|
+ MAD a5,b5,c01,c01
|
|
+ MAD a5,b6,c02,c02
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ MAD a6,b5,c03,c03
|
|
+ MAD a6,b6,c04,c04
|
|
+
|
|
+ MAD a7,b5,c05,c05
|
|
+ MAD a7,b6,c06,c06
|
|
+
|
|
+ MAD a8,b5,c07,c07
|
|
+ MAD a8,b6,c08,c08
|
|
+
|
|
+ bne L, $L72 # continue K
|
|
+
|
|
+$L75:
|
|
+ LD alpha_r, ALPHA_R # $f30==b8
|
|
+#ifndef TRMMKERNEL
|
|
+ blbc K, $L78 # if(K&1)
|
|
+#else
|
|
+ blbc TEMP, $L78
|
|
+#endif
|
|
+
|
|
+$L76:
|
|
+ addl AO, 4*SIZE, AO # AO+=2mr*1kr*2px*SIZE
|
|
+ MAD a1,b1,c01,c01 # C11 real part
|
|
+ MAD a1,b2,c02,c02 # C11 imag part
|
|
+
|
|
+ addl BO, 4*SIZE, BO
|
|
+ MAD a2,b1,c03,c03 # C11 image part
|
|
+ MAD a2,b2,c04,c04 # C11 real part
|
|
+
|
|
+ MAD a3,b1,c05,c05 # C12 real part
|
|
+ MAD a3,b2,c06,c06 # C12 imag part
|
|
+
|
|
+ MAD a4,b1,c07,c07 # C12 image part
|
|
+ MAD a4,b2,c08,c08 # C12 real part
|
|
+
|
|
+
|
|
+
|
|
+$L78: # Write back
|
|
+ LD alpha_i, ALPHA_I # $f29==b7
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD1 c01, c04, c01
|
|
+ ADD1 c05, c08, c05
|
|
+ ADD2 c03, c02, c02
|
|
+ ADD2 c07, c06, c06
|
|
+
|
|
+ LD b1, 0 * SIZE(C1)
|
|
+ LD b2, 1 * SIZE(C1)
|
|
+ LD b3, 2 * SIZE(C1)
|
|
+ LD b4, 3 * SIZE(C1)
|
|
+
|
|
+ FMAD5 c01, alpha_r, b1, b1
|
|
+ FMAD5 c05, alpha_r, b3, b3
|
|
+ FMAD7 c02, alpha_r, b2, b2
|
|
+ FMAD7 c06, alpha_r, b4, b4
|
|
+
|
|
+ FMAD8 c02, alpha_i, b1, b1
|
|
+ FMAD8 c06, alpha_i, b3, b3
|
|
+ FMAD6 c01, alpha_i, b2, b2
|
|
+ FMAD6 c05, alpha_i, b4, b4
|
|
+
|
|
+ ST b1, 0 * SIZE(C1)
|
|
+ ST b2, 1 * SIZE(C1)
|
|
+ ST b3, 2 * SIZE(C1)
|
|
+ ST b4, 3 * SIZE(C1)
|
|
+
|
|
+#else
|
|
+
|
|
+ ADD1 c01, c04, c01
|
|
+ ADD1 c05, c08, c05
|
|
+ ADD2 c03, c02, c02
|
|
+ ADD2 c07, c06, c06
|
|
+
|
|
+ FMAD5 c01, alpha_r, $f31, b1
|
|
+ FMAD5 c05, alpha_r, $f31, b3
|
|
+ FMAD7 c02, alpha_r, $f31, b2
|
|
+ FMAD7 c06, alpha_r, $f31, b4
|
|
+
|
|
+ FMAD8 c02, alpha_i, b1, b1
|
|
+ FMAD8 c06, alpha_i, b3, b3
|
|
+ FMAD6 c01, alpha_i, b2, b2
|
|
+ FMAD6 c05, alpha_i, b4, b4
|
|
+
|
|
+ ST b1, 0 * SIZE(C1)
|
|
+ ST b2, 1 * SIZE(C1)
|
|
+ ST b3, 2 * SIZE(C1)
|
|
+ ST b4, 3 * SIZE(C1)
|
|
+
|
|
+#if ( defined(LEFT) && defined(TRANSA)) || \
|
|
+ (!defined(LEFT) && !defined(TRANSA))
|
|
+ subl K, KK, TEMP
|
|
+#ifdef LEFT
|
|
+ subl TEMP, 2, TEMP
|
|
+#else
|
|
+ subl TEMP, 1, TEMP
|
|
+#endif
|
|
+
|
|
+ sll TEMP, 1 + ZBASE_SHIFT, L
|
|
+ sll TEMP, ZBASE_SHIFT, TEMP
|
|
+
|
|
+ addl AO, L, AO
|
|
+ addl BO, TEMP, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LEFT
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+ addl C1, 4*SIZE, C1
|
|
+
|
|
+
|
|
+ .align 4
|
|
+$L80:
|
|
+ and M, 1, I # I=M&1
|
|
+ ble I, $L999
|
|
+
|
|
+#if defined(TRMMKERNEL)
|
|
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
|
+ mov B, BO
|
|
+ nop
|
|
+#else
|
|
+ sll KK, ZBASE_SHIFT, L # mr=1
|
|
+ sll KK, ZBASE_SHIFT,TEMP # nr=1
|
|
+
|
|
+ addl AO, L, AO
|
|
+ addl B, TEMP, BO
|
|
+#endif
|
|
+
|
|
+ fillcs 0*SIZE(C1)
|
|
+
|
|
+	fclr	c01			# CLEAR 4 registers
|
|
+ fclr c02
|
|
+ fclr c03
|
|
+ fclr c04
|
|
+
|
|
+ LD b1, 0*SIZE(BO) # b1 real part
|
|
+ LD b2, 1*SIZE(BO) # b1 image part
|
|
+
|
|
+ LD a1, 0*SIZE(AO) # a1 real part
|
|
+ LD a2, 1*SIZE(AO) # a1 image part
|
|
+
|
|
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
|
+ subl K, KK, TEMP
|
|
+#elif defined(LEFT)
|
|
+ addl KK, 1, TEMP # mr=1
|
|
+#else
|
|
+ addl KK, 1, TEMP # nr=1
|
|
+#endif
|
|
+ sra TEMP, 1, L
|
|
+ ble L, $L85
|
|
+
|
|
+#else
|
|
+ mov B, BO # Set B, (block A x panel Bj)
|
|
+ sra K, 1, L # Unroll K as 2
|
|
+
|
|
+ fillcs 0*SIZE(C1)
|
|
+
|
|
+	fclr	c01			# CLEAR 4 registers
|
|
+ fclr c02
|
|
+ fclr c03
|
|
+ fclr c04
|
|
+
|
|
+ LD b1, 0*SIZE(BO) # b1 real part
|
|
+ LD b2, 1*SIZE(BO) # b1 image part
|
|
+
|
|
+ LD a1, 0*SIZE(AO) # a1 real part
|
|
+ LD a2, 1*SIZE(AO) # a1 image part
|
|
+
|
|
+ ble L, $L85
|
|
+#endif
|
|
+
|
|
+ .align 4
|
|
+$L82:
|
|
+ MAD a1,b1,c01,c01 # C11 real part
|
|
+ MAD a1,b2,c02,c02 # C11 imag part
|
|
+
|
|
+ LD b5, 2 * SIZE(BO) # next B1R
|
|
+ LD b6, 3 * SIZE(BO) # next B1I
|
|
+
|
|
+ LD a5, 2 * SIZE(AO) # next A1-A4 real part
|
|
+ LD a6, 3 * SIZE(AO) # next A1-A4 image part
|
|
+
|
|
+ addl BO, 4*SIZE, BO # BO+=1nr*2kr*2cpx*SIZE
|
|
+ MAD a2,b1,c03,c03 # C11 image part
|
|
+ MAD a2,b2,c04,c04 # C11 real part
|
|
+
|
|
+ subl L, 1, L #
|
|
+
|
|
+ addl AO, 4*SIZE, AO # AO+=1mr*2kr*2px*SIZE
|
|
+ MAD a5,b5,c01,c01
|
|
+ MAD a5,b6,c02,c02
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ MAD a6,b5,c03,c03
|
|
+ MAD a6,b6,c04,c04
|
|
+
|
|
+ bne L, $L82 # continue K
|
|
+
|
|
+$L85:
|
|
+ LD alpha_r, ALPHA_R # $f30==b8
|
|
+#ifndef TRMMKERNEL
|
|
+ blbc K, $L88 # if(K&1)
|
|
+#else
|
|
+ blbc TEMP, $L88
|
|
+#endif
|
|
+
|
|
+$L86:
|
|
+	addl	AO, 2*SIZE, AO # AO+=1mr*1kr*2cpx*SIZE
|
|
+ MAD a1,b1,c01,c01 # C11 real part
|
|
+ MAD a1,b2,c02,c02 # C11 imag part
|
|
+
|
|
+ addl BO, 2*SIZE, BO
|
|
+ MAD a2,b1,c03,c03 # C11 image part
|
|
+ MAD a2,b2,c04,c04 # C11 real part
|
|
+
|
|
+$L88: # Write back
|
|
+ LD alpha_i, ALPHA_I # $f29==b7
|
|
+#ifndef TRMMKERNEL
|
|
+ ADD1 c01, c04, c01
|
|
+ ADD2 c03, c02, c02
|
|
+
|
|
+ LD b1, 0 * SIZE(C1)
|
|
+ LD b2, 1 * SIZE(C1)
|
|
+
|
|
+ FMAD5 c01, alpha_r, b1, b1
|
|
+ FMAD7 c02, alpha_r, b2, b2
|
|
+ FMAD8 c02, alpha_i, b1, b1
|
|
+ FMAD6 c01, alpha_i, b2, b2
|
|
+
|
|
+ ST b1, 0 * SIZE(C1)
|
|
+ ST b2, 1 * SIZE(C1)
|
|
+
|
|
+#else
|
|
+
|
|
+ ADD1 c01, c04, c01
|
|
+ ADD2 c03, c02, c02
|
|
+
|
|
+ FMAD5 c01, alpha_r, $f31, b1
|
|
+ FMAD7 c02, alpha_r, $f31, b2
|
|
+
|
|
+ FMAD8 c02, alpha_i, b1, b1
|
|
+ FMAD6 c01, alpha_i, b2, b2
|
|
+
|
|
+ ST b1, 0 * SIZE(C1)
|
|
+ ST b2, 1 * SIZE(C1)
|
|
+
|
|
+#if ( defined(LEFT) && defined(TRANSA)) || \
|
|
+ (!defined(LEFT) && !defined(TRANSA))
|
|
+ subl K, KK, TEMP
|
|
+#ifdef LEFT
|
|
+ subl TEMP, 1, TEMP
|
|
+#else
|
|
+ subl TEMP, 1, TEMP
|
|
+#endif
|
|
+
|
|
+ sll TEMP, ZBASE_SHIFT, L
|
|
+ sll TEMP, ZBASE_SHIFT, TEMP
|
|
+
|
|
+ addl AO, L, AO
|
|
+ addl BO, TEMP,BO
|
|
+#endif
|
|
+
|
|
+#ifdef LEFT
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+ addl C1, 2*SIZE, C1
|
|
+
|
|
+
|
|
+$L999:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 8($sp)
|
|
+ fldd $f4, 16($sp)
|
|
+ fldd $f5, 24($sp)
|
|
+ fldd $f6, 32($sp)
|
|
+ fldd $f7, 40($sp)
|
|
+ fldd $f8, 48($sp)
|
|
+ fldd $f9, 56($sp)
|
|
+ ldl $9, 80($sp)
|
|
+ ldl $10,88($sp)
|
|
+ ldl $11,96($sp)
|
|
+ ldl $12,104($sp)
|
|
+ ldl $13,112($sp)
|
|
+ ldl $14,120($sp)
|
|
+
|
|
+ clr $0
|
|
+
|
|
+ ldi $sp, STACKSIZE($sp)
|
|
+ ret $31,($26),1 #
|
|
+
|
|
+ EPILOGUE
|
|
diff --git a/kernel/sw_64/zgemv_n.S b/kernel/sw_64/zgemv_n.S
|
|
new file mode 100644
|
|
index 0000000..03d71ee
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/zgemv_n.S
|
|
@@ -0,0 +1,1040 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define STACKSIZE 64
|
|
+#define PREFETCHSIZE 32
|
|
+
|
|
+#define M $16
|
|
+#define N $17
|
|
+#define A $21
|
|
+#define LDA $18
|
|
+
|
|
+#define X $19
|
|
+#define INCX $20
|
|
+#define Y $22
|
|
+#define INCY $23
|
|
+
|
|
+#define BUFFER $24
|
|
+
|
|
+#define I $25
|
|
+#define J $27
|
|
+
|
|
+#define Y1 $4
|
|
+#define A1 $5
|
|
+#define A2 $6
|
|
+
|
|
+#define alpha_r $f19
|
|
+#define alpha_i $f20
|
|
+
|
|
+#define alpha1 $f0
|
|
+#define alpha2 $f1
|
|
+#define alpha3 $f10
|
|
+#define alpha4 $f11
|
|
+
|
|
+#define y0 $f12
|
|
+#define y1 $f13
|
|
+#define y2 $f14
|
|
+#define y3 $f15
|
|
+
|
|
+#define y4 $f16
|
|
+#define y5 $f17
|
|
+#define y6 $f18
|
|
+#define y7 $f21
|
|
+
|
|
+#define a0 $f22
|
|
+#define a1 $f23
|
|
+#define a2 $f24
|
|
+#define a3 $f25
|
|
+#define a4 $f26
|
|
+#define a5 $f27
|
|
+#define a6 $f28
|
|
+#define a7 $f29
|
|
+
|
|
+#define t0 $f2
|
|
+#define t1 $f3
|
|
+#define t2 $f4
|
|
+#define t3 $f5
|
|
+
|
|
+#if !defined(CONJ) && !defined(XCONJ)
|
|
+#define ADD1 ADD
|
|
+#define ADD2 ADD
|
|
+#define ADD3 SUB
|
|
+#define ADD4 ADD
|
|
+#elif defined(CONJ) && !defined(XCONJ)
|
|
+#define ADD1 ADD
|
|
+#define ADD2 SUB
|
|
+#define ADD3 ADD
|
|
+#define ADD4 ADD
|
|
+#elif !defined(CONJ) && defined(XCONJ)
|
|
+#define ADD1 ADD
|
|
+#define ADD2 ADD
|
|
+#define ADD3 ADD
|
|
+#define ADD4 SUB
|
|
+#else
|
|
+#define ADD1 ADD
|
|
+#define ADD2 SUB
|
|
+#define ADD3 SUB
|
|
+#define ADD4 SUB
|
|
+#endif
|
|
+
|
|
+ PROLOGUE
|
|
+
|
|
+ ldi $sp, -STACKSIZE($sp)
|
|
+ ldl LDA, 0 + STACKSIZE($sp)
|
|
+ ldl X, 8 + STACKSIZE($sp)
|
|
+ ldl INCX, 16 + STACKSIZE($sp)
|
|
+ ldl Y, 24 + STACKSIZE($sp)
|
|
+ ldl INCY, 32 + STACKSIZE($sp)
|
|
+ ldl BUFFER, 40 + STACKSIZE($sp)
|
|
+
|
|
+ fstd $f2, 0($sp)
|
|
+ fstd $f3, 8($sp)
|
|
+ fstd $f4, 16($sp)
|
|
+ fstd $f5, 24($sp)
|
|
+ fstd $f6, 32($sp)
|
|
+ fstd $f7, 40($sp)
|
|
+ fstd $f8, 48($sp)
|
|
+ fstd $f9, 56($sp)
|
|
+
|
|
+ PROFCODE
|
|
+
|
|
+ cmple M, 0, $0
|
|
+ sll INCX, ZBASE_SHIFT, INCX
|
|
+ cmple N, 0, $1
|
|
+ sll INCY, ZBASE_SHIFT, INCY
|
|
+
|
|
+ or $0, $1, $0
|
|
+ bne $0, $L999
|
|
+
|
|
+ cmpeq INCY, 2 * SIZE, $0
|
|
+ sll LDA, ZBASE_SHIFT,LDA
|
|
+ bne $0, $L10
|
|
+
|
|
+ mov BUFFER, Y1
|
|
+
|
|
+ mov Y, BUFFER
|
|
+ mov Y1, Y
|
|
+
|
|
+ sra M, 2, I
|
|
+ ble I, $L05
|
|
+ .align 4
|
|
+
|
|
+$L02:
|
|
+ ST $f31, 0 * SIZE(Y1)
|
|
+ ST $f31, 1 * SIZE(Y1)
|
|
+ ST $f31, 2 * SIZE(Y1)
|
|
+ ST $f31, 3 * SIZE(Y1)
|
|
+ ST $f31, 4 * SIZE(Y1)
|
|
+ ST $f31, 5 * SIZE(Y1)
|
|
+ ST $f31, 6 * SIZE(Y1)
|
|
+ ST $f31, 7 * SIZE(Y1)
|
|
+
|
|
+ ldi Y1, 8 * SIZE(Y1)
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L02
|
|
+ .align 4
|
|
+
|
|
+$L05:
|
|
+ and M, 3, I
|
|
+ ble I, $L10
|
|
+ .align 4
|
|
+
|
|
+$L06:
|
|
+ ST $f31, 0 * SIZE(Y1)
|
|
+ ST $f31, 1 * SIZE(Y1)
|
|
+ addl Y1, 2 * SIZE, Y1
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L06
|
|
+ .align 4
|
|
+
|
|
+$L10:
|
|
+ sra N, 1, J
|
|
+ ble J, $L20
|
|
+ .align 4
|
|
+
|
|
+$L11:
|
|
+ LD alpha1, 0 * SIZE(X)
|
|
+ LD alpha2, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD alpha3, 0 * SIZE(X)
|
|
+ LD alpha4, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ MUL alpha_r, alpha1, y0
|
|
+ MUL alpha_r, alpha2, y1
|
|
+ MUL alpha_r, alpha3, y2
|
|
+ MUL alpha_r, alpha4, y3
|
|
+
|
|
+ MUL alpha_i, alpha2, t0
|
|
+ mov A, A1
|
|
+ MUL alpha_i, alpha1, t1
|
|
+ addl A, LDA, A2
|
|
+ MUL alpha_i, alpha4, t2
|
|
+ addl A2, LDA, A
|
|
+ MUL alpha_i, alpha3, t3
|
|
+ mov Y, Y1
|
|
+
|
|
+#ifndef XCONJ
|
|
+ SUB y0, t0, alpha1
|
|
+ ADD y1, t1, alpha2
|
|
+ SUB y2, t2, alpha3
|
|
+ ADD y3, t3, alpha4
|
|
+#else
|
|
+ ADD y0, t0, alpha1
|
|
+ SUB y1, t1, alpha2
|
|
+ ADD y2, t2, alpha3
|
|
+ SUB y3, t3, alpha4
|
|
+#endif
|
|
+
|
|
+ fillcs 4 * SIZE(X)
|
|
+
|
|
+ sra M, 2, I
|
|
+ ble I, $L15
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+ LD a2, 2 * SIZE(A1)
|
|
+ LD a3, 3 * SIZE(A1)
|
|
+
|
|
+ LD a4, 0 * SIZE(A2)
|
|
+ LD a5, 1 * SIZE(A2)
|
|
+ LD a6, 2 * SIZE(A2)
|
|
+ LD a7, 3 * SIZE(A2)
|
|
+
|
|
+ MUL alpha1, a0, t0
|
|
+ LD y0, 0 * SIZE(Y1)
|
|
+ MUL alpha1, a1, t1
|
|
+ LD y1, 1 * SIZE(Y1)
|
|
+
|
|
+ MUL alpha1, a2, t2
|
|
+ LD y2, 2 * SIZE(Y1)
|
|
+ MUL alpha1, a3, t3
|
|
+ LD y3, 3 * SIZE(Y1)
|
|
+
|
|
+ ADD1 y0, t0, $f6
|
|
+ unop
|
|
+ MUL alpha3, a4, t0
|
|
+ LD y4, 4 * SIZE(Y1)
|
|
+
|
|
+ ADD2 y1, t1, $f7
|
|
+ unop
|
|
+ MUL alpha3, a5, t1
|
|
+ LD y5, 5 * SIZE(Y1)
|
|
+
|
|
+ ADD1 y2, t2, $f8
|
|
+ unop
|
|
+ MUL alpha3, a6, t2
|
|
+ LD y6, 6 * SIZE(Y1)
|
|
+
|
|
+ ADD2 y3, t3, $f9
|
|
+ unop
|
|
+ MUL alpha3, a7, t3
|
|
+ LD y7, 7 * SIZE(Y1)
|
|
+
|
|
+ ADD1 $f6, t0, y0
|
|
+ unop
|
|
+ MUL alpha2, a1, t0
|
|
+ LD a1, 5 * SIZE(A1)
|
|
+
|
|
+ ADD2 $f7, t1, y1
|
|
+ unop
|
|
+ MUL alpha2, a0, t1
|
|
+ LD a0, 4 * SIZE(A1)
|
|
+
|
|
+ ADD1 $f8, t2, y2
|
|
+ unop
|
|
+ MUL alpha2, a3, t2
|
|
+ LD a3, 7 * SIZE(A1)
|
|
+
|
|
+ ADD2 $f9, t3, y3
|
|
+ unop
|
|
+ MUL alpha2, a2, t3
|
|
+ LD a2, 6 * SIZE(A1)
|
|
+
|
|
+ ADD3 y0, t0, $f6
|
|
+ unop
|
|
+ MUL alpha4, a5, t0
|
|
+ LD a5, 5 * SIZE(A2)
|
|
+
|
|
+ ADD4 y1, t1, $f7
|
|
+ unop
|
|
+ MUL alpha4, a4, t1
|
|
+ LD a4, 4 * SIZE(A2)
|
|
+
|
|
+ ADD3 y2, t2, $f8
|
|
+ unop
|
|
+ MUL alpha4, a7, t2
|
|
+ LD a7, 7 * SIZE(A2)
|
|
+
|
|
+ ADD4 y3, t3, $f9
|
|
+ unop
|
|
+ MUL alpha4, a6, t3
|
|
+ LD a6, 6 * SIZE(A2)
|
|
+
|
|
+ ADD3 $f6, t0, y0
|
|
+ MUL alpha1, a0, t0
|
|
+ ADD4 $f7, t1, y1
|
|
+ MUL alpha1, a1, t1
|
|
+
|
|
+ ADD3 $f8, t2, y2
|
|
+ unop
|
|
+ MUL alpha1, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD4 $f9, t3, y3
|
|
+ ldi I, -1(I)
|
|
+ MUL alpha1, a3, t3
|
|
+ ble I, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ ADD1 y4, t0, $f6
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ MUL alpha3, a4, t0
|
|
+ fillcs (PREFETCHSIZE + 0) * SIZE(A1)
|
|
+
|
|
+ ADD2 y5, t1, $f7
|
|
+ ST y1, 1 * SIZE(Y1)
|
|
+ MUL alpha3, a5, t1
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ADD1 y6, t2, $f8
|
|
+ ST y2, 2 * SIZE(Y1)
|
|
+ MUL alpha3, a6, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 y7, t3, $f9
|
|
+ ST y3, 3 * SIZE(Y1)
|
|
+ MUL alpha3, a7, t3
|
|
+ unop
|
|
+
|
|
+ ADD1 $f6, t0, y4
|
|
+ unop
|
|
+ MUL alpha2, a1, t0
|
|
+ LD a1, 9 * SIZE(A1)
|
|
+
|
|
+ ADD2 $f7, t1, y5
|
|
+ unop
|
|
+ MUL alpha2, a0, t1
|
|
+ LD a0, 8 * SIZE(A1)
|
|
+
|
|
+ ADD1 $f8, t2, y6
|
|
+ unop
|
|
+ MUL alpha2, a3, t2
|
|
+ LD a3, 11 * SIZE(A1)
|
|
+
|
|
+ ADD2 $f9, t3, y7
|
|
+ unop
|
|
+ MUL alpha2, a2, t3
|
|
+ LD a2, 10 * SIZE(A1)
|
|
+
|
|
+ ADD3 y4, t0, $f6
|
|
+ fillcs (PREFETCHSIZE + 0) * SIZE(Y1)
|
|
+ MUL alpha4, a5, t0
|
|
+ LD a5, 9 * SIZE(A2)
|
|
+
|
|
+ ADD4 y5, t1, $f7
|
|
+ unop
|
|
+ MUL alpha4, a4, t1
|
|
+ LD a4, 8 * SIZE(A2)
|
|
+
|
|
+ ADD3 y6, t2, $f8
|
|
+ unop
|
|
+ MUL alpha4, a7, t2
|
|
+ LD a7, 11 * SIZE(A2)
|
|
+
|
|
+ ADD4 y7, t3, $f9
|
|
+ unop
|
|
+ MUL alpha4, a6, t3
|
|
+ LD a6, 10 * SIZE(A2)
|
|
+
|
|
+ ADD3 $f6, t0, y4
|
|
+ unop
|
|
+ MUL alpha1, a0, t0
|
|
+ LD y0, 8 * SIZE(Y1)
|
|
+
|
|
+ ADD4 $f7, t1, y5
|
|
+ unop
|
|
+ MUL alpha1, a1, t1
|
|
+ LD y1, 9 * SIZE(Y1)
|
|
+
|
|
+ ADD3 $f8, t2, y6
|
|
+ unop
|
|
+ MUL alpha1, a2, t2
|
|
+ LD y2, 10 * SIZE(Y1)
|
|
+
|
|
+ ADD4 $f9, t3, y7
|
|
+ unop
|
|
+ MUL alpha1, a3, t3
|
|
+ LD y3, 11 * SIZE(Y1)
|
|
+
|
|
+ ADD1 y0, t0, $f6
|
|
+ ST y4, 4 * SIZE(Y1)
|
|
+ MUL alpha3, a4, t0
|
|
+ fillcs (PREFETCHSIZE + 0) * SIZE(A2)
|
|
+
|
|
+ ADD2 y1, t1, $f7
|
|
+ ST y5, 5 * SIZE(Y1)
|
|
+ MUL alpha3, a5, t1
|
|
+ unop
|
|
+
|
|
+ ADD1 y2, t2, $f8
|
|
+ ST y6, 6 * SIZE(Y1)
|
|
+ MUL alpha3, a6, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 y3, t3, $f9
|
|
+ ST y7, 7 * SIZE(Y1)
|
|
+ MUL alpha3, a7, t3
|
|
+ ldi Y1, 8 * SIZE(Y1)
|
|
+
|
|
+ ADD1 $f6, t0, y0
|
|
+ unop
|
|
+ MUL alpha2, a1, t0
|
|
+ LD a1, 13 * SIZE(A1)
|
|
+
|
|
+ ADD2 $f7, t1, y1
|
|
+ unop
|
|
+ MUL alpha2, a0, t1
|
|
+ LD a0, 12 * SIZE(A1)
|
|
+
|
|
+ ADD1 $f8, t2, y2
|
|
+ unop
|
|
+ MUL alpha2, a3, t2
|
|
+ LD a3, 15 * SIZE(A1)
|
|
+
|
|
+ ADD2 $f9, t3, y3
|
|
+ unop
|
|
+ MUL alpha2, a2, t3
|
|
+ LD a2, 14 * SIZE(A1)
|
|
+
|
|
+ ADD3 y0, t0, $f6
|
|
+ unop
|
|
+ MUL alpha4, a5, t0
|
|
+ LD a5, 13 * SIZE(A2)
|
|
+
|
|
+ ADD4 y1, t1, $f7
|
|
+ unop
|
|
+ MUL alpha4, a4, t1
|
|
+ LD a4, 12 * SIZE(A2)
|
|
+
|
|
+ ADD3 y2, t2, $f8
|
|
+ unop
|
|
+ MUL alpha4, a7, t2
|
|
+ LD a7, 15 * SIZE(A2)
|
|
+
|
|
+ ADD4 y3, t3, $f9
|
|
+ unop
|
|
+ MUL alpha4, a6, t3
|
|
+ LD a6, 14 * SIZE(A2)
|
|
+
|
|
+ ADD3 $f6, t0, y0
|
|
+ unop
|
|
+ MUL alpha1, a0, t0
|
|
+ LD y4, 4 * SIZE(Y1)
|
|
+
|
|
+ ADD4 $f7, t1, y1
|
|
+ ldi A2, 8 * SIZE(A2)
|
|
+ MUL alpha1, a1, t1
|
|
+ LD y5, 5 * SIZE(Y1)
|
|
+
|
|
+ ADD3 $f8, t2, y2
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+ MUL alpha1, a2, t2
|
|
+ LD y6, 6 * SIZE(Y1)
|
|
+
|
|
+ ADD4 $f9, t3, y3
|
|
+ MUL alpha1, a3, t3
|
|
+ LD y7, 7 * SIZE(Y1)
|
|
+ bgt I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ ADD1 y4, t0, $f6
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ MUL alpha3, a4, t0
|
|
+ unop
|
|
+
|
|
+ ADD2 y5, t1, $f7
|
|
+ ST y1, 1 * SIZE(Y1)
|
|
+ MUL alpha3, a5, t1
|
|
+ unop
|
|
+
|
|
+ ADD1 y6, t2, $f8
|
|
+ ST y2, 2 * SIZE(Y1)
|
|
+ MUL alpha3, a6, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 y7, t3, $f9
|
|
+ ST y3, 3 * SIZE(Y1)
|
|
+ MUL alpha3, a7, t3
|
|
+ unop
|
|
+
|
|
+ ADD1 $f6, t0, y4
|
|
+ MUL alpha2, a1, t0
|
|
+ ADD2 $f7, t1, y5
|
|
+ MUL alpha2, a0, t1
|
|
+
|
|
+ ADD1 $f8, t2, y6
|
|
+ MUL alpha2, a3, t2
|
|
+ ADD2 $f9, t3, y7
|
|
+ MUL alpha2, a2, t3
|
|
+
|
|
+ ADD3 y4, t0, $f6
|
|
+ MUL alpha4, a5, t0
|
|
+ ADD4 y5, t1, $f7
|
|
+ MUL alpha4, a4, t1
|
|
+
|
|
+ ADD3 y6, t2, $f8
|
|
+ MUL alpha4, a7, t2
|
|
+ ADD4 y7, t3, $f9
|
|
+ MUL alpha4, a6, t3
|
|
+
|
|
+ ADD3 $f6, t0, y4
|
|
+ ADD4 $f7, t1, y5
|
|
+ ADD3 $f8, t2, y6
|
|
+ ADD4 $f9, t3, y7
|
|
+
|
|
+ ST y4, 4 * SIZE(Y1)
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+ ST y5, 5 * SIZE(Y1)
|
|
+ ldi A2, 8 * SIZE(A2)
|
|
+
|
|
+ ST y6, 6 * SIZE(Y1)
|
|
+ unop
|
|
+ ST y7, 7 * SIZE(Y1)
|
|
+ ldi Y1, 8 * SIZE(Y1)
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and M, 2, I
|
|
+ ble I, $L17
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+ LD a2, 2 * SIZE(A1)
|
|
+ LD a3, 3 * SIZE(A1)
|
|
+
|
|
+ LD a4, 0 * SIZE(A2)
|
|
+ LD a5, 1 * SIZE(A2)
|
|
+ LD a6, 2 * SIZE(A2)
|
|
+ LD a7, 3 * SIZE(A2)
|
|
+
|
|
+ MUL alpha1, a0, t0
|
|
+ LD y0, 0 * SIZE(Y1)
|
|
+ MUL alpha1, a1, t1
|
|
+ LD y1, 1 * SIZE(Y1)
|
|
+ MUL alpha1, a2, t2
|
|
+ LD y2, 2 * SIZE(Y1)
|
|
+ MUL alpha1, a3, t3
|
|
+ LD y3, 3 * SIZE(Y1)
|
|
+
|
|
+ ADD1 y0, t0, $f6
|
|
+ MUL alpha3, a4, t0
|
|
+ ADD2 y1, t1, $f7
|
|
+ MUL alpha3, a5, t1
|
|
+ ADD1 y2, t2, $f8
|
|
+ MUL alpha3, a6, t2
|
|
+ ADD2 y3, t3, $f9
|
|
+ MUL alpha3, a7, t3
|
|
+
|
|
+ ADD1 $f6, t0, y0
|
|
+ MUL alpha2, a1, t0
|
|
+ ADD2 $f7, t1, y1
|
|
+ MUL alpha2, a0, t1
|
|
+
|
|
+ ADD1 $f8, t2, y2
|
|
+ MUL alpha2, a3, t2
|
|
+ ADD2 $f9, t3, y3
|
|
+ MUL alpha2, a2, t3
|
|
+
|
|
+ ADD3 y0, t0, $f6
|
|
+ MUL alpha4, a5, t0
|
|
+ ADD4 y1, t1, $f7
|
|
+ MUL alpha4, a4, t1
|
|
+
|
|
+ ADD3 y2, t2, $f8
|
|
+ MUL alpha4, a7, t2
|
|
+ ADD4 y3, t3, $f9
|
|
+ MUL alpha4, a6, t3
|
|
+
|
|
+ ADD3 $f6, t0, y0
|
|
+ ADD4 $f7, t1, y1
|
|
+ ADD3 $f8, t2, y2
|
|
+ ADD4 $f9, t3, y3
|
|
+
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ ldi A1, 4 * SIZE(A1)
|
|
+ ST y1, 1 * SIZE(Y1)
|
|
+ ldi A2, 4 * SIZE(A2)
|
|
+
|
|
+ ST y2, 2 * SIZE(Y1)
|
|
+ unop
|
|
+ ST y3, 3 * SIZE(Y1)
|
|
+ ldi Y1, 4 * SIZE(Y1)
|
|
+ .align 4
|
|
+
|
|
+$L17:
|
|
+ blbc M, $L18
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+ LD a2, 0 * SIZE(A2)
|
|
+ LD a3, 1 * SIZE(A2)
|
|
+
|
|
+ LD y0, 0 * SIZE(Y1)
|
|
+ LD y1, 1 * SIZE(Y1)
|
|
+
|
|
+ MUL alpha1, a0, t0
|
|
+ MUL alpha1, a1, t1
|
|
+
|
|
+ ADD1 y0, t0, $f6
|
|
+ MUL alpha3, a2, t0
|
|
+ ADD2 y1, t1, $f7
|
|
+ MUL alpha3, a3, t1
|
|
+
|
|
+ ADD1 $f6, t0, y0
|
|
+ MUL alpha2, a1, t0
|
|
+ ADD2 $f7, t1, y1
|
|
+ MUL alpha2, a0, t1
|
|
+
|
|
+ ADD3 y0, t0, $f6
|
|
+ MUL alpha4, a3, t0
|
|
+ ADD4 y1, t1, $f7
|
|
+ MUL alpha4, a2, t1
|
|
+
|
|
+ ADD3 $f6, t0, y0
|
|
+ ADD4 $f7, t1, y1
|
|
+
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ ST y1, 1 * SIZE(Y1)
|
|
+ .align 4
|
|
+
|
|
+$L18:
|
|
+ ldi J, -1(J)
|
|
+ bgt J, $L11
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ blbc N, $L990
|
|
+
|
|
+ LD alpha1, 0 * SIZE(X)
|
|
+ LD alpha2, 1 * SIZE(X)
|
|
+
|
|
+ MUL alpha_r, alpha1, y0
|
|
+ MUL alpha_r, alpha2, y1
|
|
+
|
|
+ MUL alpha_i, alpha2, t0
|
|
+ mov A, A1
|
|
+ MUL alpha_i, alpha1, t1
|
|
+ mov Y, Y1
|
|
+
|
|
+#ifndef XCONJ
|
|
+ SUB y0, t0, alpha1
|
|
+ ADD y1, t1, alpha2
|
|
+#else
|
|
+ ADD y0, t0, alpha1
|
|
+ SUB y1, t1, alpha2
|
|
+#endif
|
|
+
|
|
+ sra M, 2, I
|
|
+ ble I, $L25
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+ LD a2, 2 * SIZE(A1)
|
|
+ LD a3, 3 * SIZE(A1)
|
|
+
|
|
+ LD y0, 0 * SIZE(Y1)
|
|
+ LD y1, 1 * SIZE(Y1)
|
|
+ LD y2, 2 * SIZE(Y1)
|
|
+ LD y3, 3 * SIZE(Y1)
|
|
+
|
|
+ MUL alpha1, a0, t0
|
|
+ LD a4, 4 * SIZE(A1)
|
|
+ MUL alpha1, a1, t1
|
|
+ LD a5, 5 * SIZE(A1)
|
|
+ MUL alpha1, a2, t2
|
|
+ LD a6, 6 * SIZE(A1)
|
|
+ MUL alpha1, a3, t3
|
|
+ LD a7, 7 * SIZE(A1)
|
|
+
|
|
+ ADD1 y0, t0, $f6
|
|
+ unop
|
|
+ MUL alpha2, a1, t0
|
|
+ LD a1, 9 * SIZE(A1)
|
|
+
|
|
+ ADD2 y1, t1, $f7
|
|
+ unop
|
|
+ MUL alpha2, a0, t1
|
|
+ LD a0, 8 * SIZE(A1)
|
|
+
|
|
+ ADD1 y2, t2, $f8
|
|
+ unop
|
|
+ MUL alpha2, a3, t2
|
|
+ LD a3, 11 * SIZE(A1)
|
|
+
|
|
+ ADD2 y3, t3, $f9
|
|
+ unop
|
|
+ MUL alpha2, a2, t3
|
|
+ LD a2, 10 * SIZE(A1)
|
|
+
|
|
+ ADD3 $f6, t0, y0
|
|
+ unop
|
|
+ LD y4, 4 * SIZE(Y1)
|
|
+ MUL alpha1, a4, t0
|
|
+
|
|
+ ADD4 $f7, t1, y1
|
|
+ unop
|
|
+ LD y5, 5 * SIZE(Y1)
|
|
+ MUL alpha1, a5, t1
|
|
+
|
|
+ ADD3 $f8, t2, y2
|
|
+ LD y6, 6 * SIZE(Y1)
|
|
+ MUL alpha1, a6, t2
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ADD4 $f9, t3, y3
|
|
+ LD y7, 7 * SIZE(Y1)
|
|
+ MUL alpha1, a7, t3
|
|
+ ble I, $L23
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ ADD1 y4, t0, $f6
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ MUL alpha2, a5, t0
|
|
+ LD a5, 13 * SIZE(A1)
|
|
+
|
|
+ ADD2 y5, t1, $f7
|
|
+ ST y1, 1 * SIZE(Y1)
|
|
+ MUL alpha2, a4, t1
|
|
+ LD a4, 12 * SIZE(A1)
|
|
+
|
|
+ ADD1 y6, t2, $f8
|
|
+ ST y2, 2 * SIZE(Y1)
|
|
+ MUL alpha2, a7, t2
|
|
+ LD a7, 15 * SIZE(A1)
|
|
+
|
|
+ ADD2 y7, t3, $f9
|
|
+ ST y3, 3 * SIZE(Y1)
|
|
+ MUL alpha2, a6, t3
|
|
+ LD a6, 14 * SIZE(A1)
|
|
+
|
|
+ ADD3 $f6, t0, y4
|
|
+ LD y0, 8 * SIZE(Y1)
|
|
+ MUL alpha1, a0, t0
|
|
+ fillcs (PREFETCHSIZE + 0) * SIZE(A1)
|
|
+
|
|
+ ADD4 $f7, t1, y5
|
|
+ LD y1, 9 * SIZE(Y1)
|
|
+ MUL alpha1, a1, t1
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ADD3 $f8, t2, y6
|
|
+ LD y2, 10 * SIZE(Y1)
|
|
+ MUL alpha1, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD4 $f9, t3, y7
|
|
+ LD y3, 11 * SIZE(Y1)
|
|
+ MUL alpha1, a3, t3
|
|
+ unop
|
|
+
|
|
+ ADD1 y0, t0, $f6
|
|
+ ST y4, 4 * SIZE(Y1)
|
|
+ MUL alpha2, a1, t0
|
|
+ LD a1, 17 * SIZE(A1)
|
|
+
|
|
+ ADD2 y1, t1, $f7
|
|
+ ST y5, 5 * SIZE(Y1)
|
|
+ MUL alpha2, a0, t1
|
|
+ LD a0, 16 * SIZE(A1)
|
|
+
|
|
+ ADD1 y2, t2, $f8
|
|
+ ST y6, 6 * SIZE(Y1)
|
|
+ MUL alpha2, a3, t2
|
|
+ LD a3, 19 * SIZE(A1)
|
|
+
|
|
+ ADD2 y3, t3, $f9
|
|
+ ST y7, 7 * SIZE(Y1)
|
|
+ MUL alpha2, a2, t3
|
|
+ LD a2, 18 * SIZE(A1)
|
|
+
|
|
+ ADD3 $f6, t0, y0
|
|
+ LD y4, 12 * SIZE(Y1)
|
|
+ MUL alpha1, a4, t0
|
|
+ fillcs (PREFETCHSIZE + 0) * SIZE(Y1)
|
|
+
|
|
+ ADD4 $f7, t1, y1
|
|
+ LD y5, 13 * SIZE(Y1)
|
|
+ MUL alpha1, a5, t1
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+
|
|
+ ADD3 $f8, t2, y2
|
|
+ LD y6, 14 * SIZE(Y1)
|
|
+ MUL alpha1, a6, t2
|
|
+ ldi Y1, 8 * SIZE(Y1)
|
|
+
|
|
+ ADD4 $f9, t3, y3
|
|
+ LD y7, 7 * SIZE(Y1)
|
|
+ MUL alpha1, a7, t3
|
|
+ bgt I, $L22
|
|
+ .align 4
|
|
+
|
|
+$L23:
|
|
+ ADD1 y4, t0, $f6
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ MUL alpha2, a5, t0
|
|
+ unop
|
|
+
|
|
+ ADD2 y5, t1, $f7
|
|
+ ST y1, 1 * SIZE(Y1)
|
|
+ MUL alpha2, a4, t1
|
|
+ unop
|
|
+
|
|
+ ADD1 y6, t2, $f8
|
|
+ ST y2, 2 * SIZE(Y1)
|
|
+ MUL alpha2, a7, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 y7, t3, $f9
|
|
+ ST y3, 3 * SIZE(Y1)
|
|
+ MUL alpha2, a6, t3
|
|
+ unop
|
|
+
|
|
+ ADD3 $f6, t0, y4
|
|
+ ADD4 $f7, t1, y5
|
|
+ ADD3 $f8, t2, y6
|
|
+ ADD4 $f9, t3, y7
|
|
+
|
|
+ ST y4, 4 * SIZE(Y1)
|
|
+ unop
|
|
+ ST y5, 5 * SIZE(Y1)
|
|
+ unop
|
|
+
|
|
+ ST y6, 6 * SIZE(Y1)
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+ ST y7, 7 * SIZE(Y1)
|
|
+ ldi Y1, 8 * SIZE(Y1)
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ and M, 2, I
|
|
+ ble I, $L27
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+ LD a2, 2 * SIZE(A1)
|
|
+ LD a3, 3 * SIZE(A1)
|
|
+
|
|
+ MUL alpha1, a0, t0
|
|
+ LD y0, 0 * SIZE(Y1)
|
|
+ MUL alpha1, a1, t1
|
|
+ LD y1, 1 * SIZE(Y1)
|
|
+ MUL alpha1, a2, t2
|
|
+ LD y2, 2 * SIZE(Y1)
|
|
+ MUL alpha1, a3, t3
|
|
+ LD y3, 3 * SIZE(Y1)
|
|
+
|
|
+ ADD1 y0, t0, $f6
|
|
+ MUL alpha2, a1, t0
|
|
+ ADD2 y1, t1, $f7
|
|
+ MUL alpha2, a0, t1
|
|
+ ADD1 y2, t2, $f8
|
|
+ MUL alpha2, a3, t2
|
|
+ ADD2 y3, t3, $f9
|
|
+ MUL alpha2, a2, t3
|
|
+
|
|
+ ADD3 $f6, t0, y0
|
|
+ ADD4 $f7, t1, y1
|
|
+ ADD3 $f8, t2, y2
|
|
+ ADD4 $f9, t3, y3
|
|
+
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ ST y1, 1 * SIZE(Y1)
|
|
+
|
|
+ ST y2, 2 * SIZE(Y1)
|
|
+ ldi A1, 4 * SIZE(A1)
|
|
+ ST y3, 3 * SIZE(Y1)
|
|
+ ldi Y1, 4 * SIZE(Y1)
|
|
+ .align 4
|
|
+
|
|
+$L27:
|
|
+ blbc M, $L990
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+
|
|
+ MUL alpha1, a0, t0
|
|
+ LD y0, 0 * SIZE(Y1)
|
|
+ MUL alpha1, a1, t1
|
|
+ LD y1, 1 * SIZE(Y1)
|
|
+
|
|
+ ADD1 y0, t0, $f6
|
|
+ MUL alpha2, a1, t0
|
|
+ ADD2 y1, t1, $f7
|
|
+ MUL alpha2, a0, t1
|
|
+
|
|
+ ADD3 $f6, t0, y0
|
|
+ ADD4 $f7, t1, y1
|
|
+
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ ST y1, 1 * SIZE(Y1)
|
|
+ .align 4
|
|
+
|
|
+$L990:
|
|
+ cmpeq INCY, 2 * SIZE, $0
|
|
+ bne $0, $L999
|
|
+
|
|
+ mov BUFFER, Y1
|
|
+
|
|
+ sra M, 2, I
|
|
+ ble I, $L995
|
|
+ .align 4
|
|
+
|
|
+$L992:
|
|
+ LD a0, 0 * SIZE(BUFFER)
|
|
+ LD a1, 1 * SIZE(BUFFER)
|
|
+ addl BUFFER, INCY, BUFFER
|
|
+ LD a2, 0 * SIZE(BUFFER)
|
|
+ LD a3, 1 * SIZE(BUFFER)
|
|
+ addl BUFFER, INCY, BUFFER
|
|
+
|
|
+ LD y0, 0 * SIZE(Y)
|
|
+ LD y1, 1 * SIZE(Y)
|
|
+ LD y2, 2 * SIZE(Y)
|
|
+ LD y3, 3 * SIZE(Y)
|
|
+
|
|
+ LD a4, 0 * SIZE(BUFFER)
|
|
+ LD a5, 1 * SIZE(BUFFER)
|
|
+ addl BUFFER, INCY, BUFFER
|
|
+ LD a6, 0 * SIZE(BUFFER)
|
|
+ LD a7, 1 * SIZE(BUFFER)
|
|
+ addl BUFFER, INCY, BUFFER
|
|
+
|
|
+ LD y4, 4 * SIZE(Y)
|
|
+ LD y5, 5 * SIZE(Y)
|
|
+ LD y6, 6 * SIZE(Y)
|
|
+ LD y7, 7 * SIZE(Y)
|
|
+
|
|
+ ADD a0, y0, $f6
|
|
+ ADD a1, y1, $f7
|
|
+ ADD a2, y2, $f8
|
|
+ ADD a3, y3, $f9
|
|
+
|
|
+ fmov $f6, a0
|
|
+ fmov $f7, a1
|
|
+ fmov $f8, a2
|
|
+ fmov $f9, a3
|
|
+
|
|
+ ST a0, 0 * SIZE(Y1)
|
|
+ ADD a4, y4, $f6
|
|
+ ST a1, 1 * SIZE(Y1)
|
|
+ ADD a5, y5, $f7
|
|
+ addl Y1, INCY, Y1
|
|
+
|
|
+ ST a2, 0 * SIZE(Y1)
|
|
+ ADD a6, y6, $f8
|
|
+ ST a3, 1 * SIZE(Y1)
|
|
+ ADD a7, y7, $f9
|
|
+ addl Y1, INCY, Y1
|
|
+
|
|
+ fmov $f6, a4
|
|
+ fmov $f7, a5
|
|
+ fmov $f8, a6
|
|
+ fmov $f9, a7
|
|
+
|
|
+ ST a4, 0 * SIZE(Y1)
|
|
+ ST a5, 1 * SIZE(Y1)
|
|
+ addl Y1, INCY, Y1
|
|
+ ST a6, 0 * SIZE(Y1)
|
|
+ ST a7, 1 * SIZE(Y1)
|
|
+ addl Y1, INCY, Y1
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ldi Y, 8 * SIZE(Y)
|
|
+ bgt I, $L992
|
|
+ .align 4
|
|
+
|
|
+$L995:
|
|
+ and M, 3, I
|
|
+ ble I, $L999
|
|
+ .align 4
|
|
+
|
|
+$L996:
|
|
+ LD a0, 0 * SIZE(BUFFER)
|
|
+ LD a1, 1 * SIZE(BUFFER)
|
|
+ addl BUFFER, INCY, BUFFER
|
|
+
|
|
+ LD y0, 0 * SIZE(Y)
|
|
+ LD y1, 1 * SIZE(Y)
|
|
+ ldi Y, 2 * SIZE(Y)
|
|
+
|
|
+ ADD a0, y0, $f6
|
|
+ ADD a1, y1, $f7
|
|
+
|
|
+ fmov $f6, a0
|
|
+ fmov $f7, a1
|
|
+
|
|
+ ST a0, 0 * SIZE(Y1)
|
|
+ ST a1, 1 * SIZE(Y1)
|
|
+ addl Y1, INCY, Y1
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L996
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 8($sp)
|
|
+ fldd $f4, 16($sp)
|
|
+ fldd $f5, 24($sp)
|
|
+ fldd $f6, 32($sp)
|
|
+ fldd $f7, 40($sp)
|
|
+ fldd $f8, 48($sp)
|
|
+ fldd $f9, 56($sp)
|
|
+
|
|
+ ldi $sp, STACKSIZE($sp)
|
|
+ ret
|
|
+ EPILOGUE
|
|
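For reference, the kernel/sw_64/zgemv_n.S file added above implements the non-transposed complex GEMV update, roughly y += alpha * A * x for a column-major m-by-n complex A, with the ADD1..ADD4 macros switching between ADD and SUB to realize the CONJ/XCONJ conjugation variants. A minimal plain-C sketch of the no-conjugation case (illustrative only, not part of the patch; unit increments assumed, lda counted in complex elements, function and variable names are ours):

    #include <stddef.h>

    /* Reference model: y += alpha * A * x, column-major, interleaved (re, im) storage. */
    void zgemv_n_ref(size_t m, size_t n,
                     double alpha_r, double alpha_i,
                     const double *a, size_t lda,
                     const double *x, double *y)
    {
        for (size_t j = 0; j < n; j++) {
            /* temp = alpha * x[j]  (complex multiply) */
            double xr = x[2 * j], xi = x[2 * j + 1];
            double tr = alpha_r * xr - alpha_i * xi;
            double ti = alpha_r * xi + alpha_i * xr;
            const double *col = a + 2 * j * lda;   /* start of column j */
            for (size_t i = 0; i < m; i++) {
                /* y[i] += temp * A[i][j]  (complex multiply-accumulate) */
                double ar = col[2 * i], ai = col[2 * i + 1];
                y[2 * i]     += tr * ar - ti * ai;
                y[2 * i + 1] += tr * ai + ti * ar;
            }
        }
    }

The assembly unrolls this: the outer loop takes two columns per pass (A1, A2) and the inner loop handles four complex elements of y per iteration, but the per-element arithmetic matches the scalar model above.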
diff --git a/kernel/sw_64/zgemv_n.S.bak b/kernel/sw_64/zgemv_n.S.bak
|
|
new file mode 100644
|
|
index 0000000..3dd482e
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/zgemv_n.S.bak
|
|
@@ -0,0 +1,1027 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define STACKSIZE 64
|
|
+#define PREFETCHSIZE 32
|
|
+
|
|
+#define M $16
|
|
+#define N $17
|
|
+#define A $21
|
|
+#define LDA $18
|
|
+
|
|
+#define X $19
|
|
+#define INCX $20
|
|
+#define Y $22
|
|
+#define INCY $23
|
|
+
|
|
+#define BUFFER $24
|
|
+
|
|
+#define I $25
|
|
+#define J $27
|
|
+
|
|
+#define Y1 $4
|
|
+#define A1 $5
|
|
+#define A2 $6
|
|
+
|
|
+#define alpha_r $f19
|
|
+#define alpha_i $f20
|
|
+
|
|
+#define alpha1 $f0
|
|
+#define alpha2 $f1
|
|
+#define alpha3 $f10
|
|
+#define alpha4 $f11
|
|
+
|
|
+#define y0 $f12
|
|
+#define y1 $f13
|
|
+#define y2 $f14
|
|
+#define y3 $f15
|
|
+
|
|
+#define y4 $f16
|
|
+#define y5 $f17
|
|
+#define y6 $f18
|
|
+#define y7 $f21
|
|
+
|
|
+#define a0 $f22
|
|
+#define a1 $f23
|
|
+#define a2 $f24
|
|
+#define a3 $f25
|
|
+#define a4 $f26
|
|
+#define a5 $f27
|
|
+#define a6 $f28
|
|
+#define a7 $f29
|
|
+
|
|
+#define t0 $f2
|
|
+#define t1 $f3
|
|
+#define t2 $f4
|
|
+#define t3 $f5
|
|
+
|
|
+#if !defined(CONJ) && !defined(XCONJ)
|
|
+#define ADD1 ADD
|
|
+#define ADD2 ADD
|
|
+#define ADD3 SUB
|
|
+#define ADD4 ADD
|
|
+#elif defined(CONJ) && !defined(XCONJ)
|
|
+#define ADD1 ADD
|
|
+#define ADD2 SUB
|
|
+#define ADD3 ADD
|
|
+#define ADD4 ADD
|
|
+#elif !defined(CONJ) && defined(XCONJ)
|
|
+#define ADD1 ADD
|
|
+#define ADD2 ADD
|
|
+#define ADD3 ADD
|
|
+#define ADD4 SUB
|
|
+#else
|
|
+#define ADD1 ADD
|
|
+#define ADD2 SUB
|
|
+#define ADD3 SUB
|
|
+#define ADD4 SUB
|
|
+#endif
|
|
+
|
|
+ PROLOGUE
|
|
+
|
|
+ ldi $sp, -STACKSIZE($sp)
|
|
+ ldl LDA, 0 + STACKSIZE($sp)
|
|
+ ldl X, 8 + STACKSIZE($sp)
|
|
+ ldl INCX, 16 + STACKSIZE($sp)
|
|
+ ldl Y, 24 + STACKSIZE($sp)
|
|
+ ldl INCY, 32 + STACKSIZE($sp)
|
|
+ ldl BUFFER, 40 + STACKSIZE($sp)
|
|
+
|
|
+ fstd $f2, 0($sp)
|
|
+ fstd $f3, 8($sp)
|
|
+ fstd $f4, 16($sp)
|
|
+ fstd $f5, 24($sp)
|
|
+ fstd $f6, 32($sp)
|
|
+ fstd $f7, 40($sp)
|
|
+ fstd $f8, 48($sp)
|
|
+ fstd $f9, 56($sp)
|
|
+
|
|
+ PROFCODE
|
|
+
|
|
+ cmple M, 0, $0
|
|
+ sll INCX, ZBASE_SHIFT, INCX
|
|
+ cmple N, 0, $1
|
|
+ sll INCY, ZBASE_SHIFT, INCY
|
|
+
|
|
+ or $0, $1, $0
|
|
+ bne $0, $L999
|
|
+
|
|
+ cmpeq INCY, 2 * SIZE, $0
|
|
+ sll LDA, ZBASE_SHIFT,LDA
|
|
+ bne $0, $L10
|
|
+
|
|
+ mov BUFFER, Y1
|
|
+
|
|
+ mov Y, BUFFER
|
|
+ mov Y1, Y
|
|
+
|
|
+ sra M, 2, I
|
|
+ ble I, $L05
|
|
+ .align 4
|
|
+
|
|
+$L02:
|
|
+ ST $f31, 0 * SIZE(Y1)
|
|
+ ST $f31, 1 * SIZE(Y1)
|
|
+ ST $f31, 2 * SIZE(Y1)
|
|
+ ST $f31, 3 * SIZE(Y1)
|
|
+ ST $f31, 4 * SIZE(Y1)
|
|
+ ST $f31, 5 * SIZE(Y1)
|
|
+ ST $f31, 6 * SIZE(Y1)
|
|
+ ST $f31, 7 * SIZE(Y1)
|
|
+
|
|
+ ldi Y1, 8 * SIZE(Y1)
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L02
|
|
+ .align 4
|
|
+
|
|
+$L05:
|
|
+ and M, 3, I
|
|
+ ble I, $L10
|
|
+ .align 4
|
|
+
|
|
+$L06:
|
|
+ ST $f31, 0 * SIZE(Y1)
|
|
+ ST $f31, 1 * SIZE(Y1)
|
|
+ addl Y1, 2 * SIZE, Y1
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L06
|
|
+ .align 4
|
|
+
|
|
+$L10:
|
|
+ sra N, 1, J
|
|
+ ble J, $L20
|
|
+ .align 4
|
|
+
|
|
+$L11:
|
|
+ LD alpha1, 0 * SIZE(X)
|
|
+ LD alpha2, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD alpha3, 0 * SIZE(X)
|
|
+ LD alpha4, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ MUL alpha_r, alpha1, y0
|
|
+ MUL alpha_r, alpha2, y1
|
|
+ MUL alpha_r, alpha3, y2
|
|
+ MUL alpha_r, alpha4, y3
|
|
+
|
|
+ MUL alpha_i, alpha2, t0
|
|
+ mov A, A1
|
|
+ MUL alpha_i, alpha1, t1
|
|
+ addl A, LDA, A2
|
|
+ MUL alpha_i, alpha4, t2
|
|
+ addl A2, LDA, A
|
|
+ MUL alpha_i, alpha3, t3
|
|
+ mov Y, Y1
|
|
+
|
|
+#ifndef XCONJ
|
|
+ SUB y0, t0, alpha1
|
|
+ ADD y1, t1, alpha2
|
|
+ SUB y2, t2, alpha3
|
|
+ ADD y3, t3, alpha4
|
|
+#else
|
|
+ ADD y0, t0, alpha1
|
|
+ SUB y1, t1, alpha2
|
|
+ ADD y2, t2, alpha3
|
|
+ SUB y3, t3, alpha4
|
|
+#endif
|
|
+
|
|
+ fillcs 4 * SIZE(X)
|
|
+
|
|
+ sra M, 2, I
|
|
+ ble I, $L15
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+ LD a2, 2 * SIZE(A1)
|
|
+ LD a3, 3 * SIZE(A1)
|
|
+
|
|
+ LD a4, 0 * SIZE(A2)
|
|
+ LD a5, 1 * SIZE(A2)
|
|
+ LD a6, 2 * SIZE(A2)
|
|
+ LD a7, 3 * SIZE(A2)
|
|
+
|
|
+ MUL alpha1, a0, t0
|
|
+ LD y0, 0 * SIZE(Y1)
|
|
+ MUL alpha1, a1, t1
|
|
+ LD y1, 1 * SIZE(Y1)
|
|
+
|
|
+ MUL alpha1, a2, t2
|
|
+ LD y2, 2 * SIZE(Y1)
|
|
+ MUL alpha1, a3, t3
|
|
+ LD y3, 3 * SIZE(Y1)
|
|
+
|
|
+ ADD1 y0, t0, y0
|
|
+ unop
|
|
+ MUL alpha3, a4, t0
|
|
+ LD y4, 4 * SIZE(Y1)
|
|
+
|
|
+ ADD2 y1, t1, y1
|
|
+ unop
|
|
+ MUL alpha3, a5, t1
|
|
+ LD y5, 5 * SIZE(Y1)
|
|
+
|
|
+ ADD1 y2, t2, y2
|
|
+ unop
|
|
+ MUL alpha3, a6, t2
|
|
+ LD y6, 6 * SIZE(Y1)
|
|
+
|
|
+ ADD2 y3, t3, y3
|
|
+ unop
|
|
+ MUL alpha3, a7, t3
|
|
+ LD y7, 7 * SIZE(Y1)
|
|
+
|
|
+ ADD1 y0, t0, y0
|
|
+ unop
|
|
+ MUL alpha2, a1, t0
|
|
+ LD a1, 5 * SIZE(A1)
|
|
+
|
|
+ ADD2 y1, t1, y1
|
|
+ unop
|
|
+ MUL alpha2, a0, t1
|
|
+ LD a0, 4 * SIZE(A1)
|
|
+
|
|
+ ADD1 y2, t2, y2
|
|
+ unop
|
|
+ MUL alpha2, a3, t2
|
|
+ LD a3, 7 * SIZE(A1)
|
|
+
|
|
+ ADD2 y3, t3, y3
|
|
+ unop
|
|
+ MUL alpha2, a2, t3
|
|
+ LD a2, 6 * SIZE(A1)
|
|
+
|
|
+ ADD3 y0, t0, y0
|
|
+ unop
|
|
+ MUL alpha4, a5, t0
|
|
+ LD a5, 5 * SIZE(A2)
|
|
+
|
|
+ ADD4 y1, t1, y1
|
|
+ unop
|
|
+ MUL alpha4, a4, t1
|
|
+ LD a4, 4 * SIZE(A2)
|
|
+
|
|
+ ADD3 y2, t2, y2
|
|
+ unop
|
|
+ MUL alpha4, a7, t2
|
|
+ LD a7, 7 * SIZE(A2)
|
|
+
|
|
+ ADD4 y3, t3, y3
|
|
+ unop
|
|
+ MUL alpha4, a6, t3
|
|
+ LD a6, 6 * SIZE(A2)
|
|
+
|
|
+ ADD3 y0, t0, y0
|
|
+ MUL alpha1, a0, t0
|
|
+ ADD4 y1, t1, y1
|
|
+ MUL alpha1, a1, t1
|
|
+
|
|
+ ADD3 y2, t2, y2
|
|
+ unop
|
|
+ MUL alpha1, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD4 y3, t3, y3
|
|
+ ldi I, -1(I)
|
|
+ MUL alpha1, a3, t3
|
|
+ ble I, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ ADD1 y4, t0, y4
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ MUL alpha3, a4, t0
|
|
+ fillcs (PREFETCHSIZE + 0) * SIZE(A1)
|
|
+
|
|
+ ADD2 y5, t1, y5
|
|
+ ST y1, 1 * SIZE(Y1)
|
|
+ MUL alpha3, a5, t1
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ADD1 y6, t2, y6
|
|
+ ST y2, 2 * SIZE(Y1)
|
|
+ MUL alpha3, a6, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 y7, t3, y7
|
|
+ ST y3, 3 * SIZE(Y1)
|
|
+ MUL alpha3, a7, t3
|
|
+ unop
|
|
+
|
|
+ ADD1 y4, t0, y4
|
|
+ unop
|
|
+ MUL alpha2, a1, t0
|
|
+ LD a1, 9 * SIZE(A1)
|
|
+
|
|
+ ADD2 y5, t1, y5
|
|
+ unop
|
|
+ MUL alpha2, a0, t1
|
|
+ LD a0, 8 * SIZE(A1)
|
|
+
|
|
+ ADD1 y6, t2, y6
|
|
+ unop
|
|
+ MUL alpha2, a3, t2
|
|
+ LD a3, 11 * SIZE(A1)
|
|
+
|
|
+ ADD2 y7, t3, y7
|
|
+ unop
|
|
+ MUL alpha2, a2, t3
|
|
+ LD a2, 10 * SIZE(A1)
|
|
+
|
|
+ ADD3 y4, t0, y4
|
|
+ fillcs (PREFETCHSIZE + 0) * SIZE(Y1)
|
|
+ MUL alpha4, a5, t0
|
|
+ LD a5, 9 * SIZE(A2)
|
|
+
|
|
+ ADD4 y5, t1, y5
|
|
+ unop
|
|
+ MUL alpha4, a4, t1
|
|
+ LD a4, 8 * SIZE(A2)
|
|
+
|
|
+ ADD3 y6, t2, y6
|
|
+ unop
|
|
+ MUL alpha4, a7, t2
|
|
+ LD a7, 11 * SIZE(A2)
|
|
+
|
|
+ ADD4 y7, t3, y7
|
|
+ unop
|
|
+ MUL alpha4, a6, t3
|
|
+ LD a6, 10 * SIZE(A2)
|
|
+
|
|
+ ADD3 y4, t0, y4
|
|
+ unop
|
|
+ MUL alpha1, a0, t0
|
|
+ LD y0, 8 * SIZE(Y1)
|
|
+
|
|
+ ADD4 y5, t1, y5
|
|
+ unop
|
|
+ MUL alpha1, a1, t1
|
|
+ LD y1, 9 * SIZE(Y1)
|
|
+
|
|
+ ADD3 y6, t2, y6
|
|
+ unop
|
|
+ MUL alpha1, a2, t2
|
|
+ LD y2, 10 * SIZE(Y1)
|
|
+
|
|
+ ADD4 y7, t3, y7
|
|
+ unop
|
|
+ MUL alpha1, a3, t3
|
|
+ LD y3, 11 * SIZE(Y1)
|
|
+
|
|
+ ADD1 y0, t0, y0
|
|
+ ST y4, 4 * SIZE(Y1)
|
|
+ MUL alpha3, a4, t0
|
|
+ fillcs (PREFETCHSIZE + 0) * SIZE(A2)
|
|
+
|
|
+ ADD2 y1, t1, y1
|
|
+ ST y5, 5 * SIZE(Y1)
|
|
+ MUL alpha3, a5, t1
|
|
+ unop
|
|
+
|
|
+ ADD1 y2, t2, y2
|
|
+ ST y6, 6 * SIZE(Y1)
|
|
+ MUL alpha3, a6, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 y3, t3, y3
|
|
+ ST y7, 7 * SIZE(Y1)
|
|
+ MUL alpha3, a7, t3
|
|
+ ldi Y1, 8 * SIZE(Y1)
|
|
+
|
|
+ ADD1 y0, t0, y0
|
|
+ unop
|
|
+ MUL alpha2, a1, t0
|
|
+ LD a1, 13 * SIZE(A1)
|
|
+
|
|
+ ADD2 y1, t1, y1
|
|
+ unop
|
|
+ MUL alpha2, a0, t1
|
|
+ LD a0, 12 * SIZE(A1)
|
|
+
|
|
+ ADD1 y2, t2, y2
|
|
+ unop
|
|
+ MUL alpha2, a3, t2
|
|
+ LD a3, 15 * SIZE(A1)
|
|
+
|
|
+ ADD2 y3, t3, y3
|
|
+ unop
|
|
+ MUL alpha2, a2, t3
|
|
+ LD a2, 14 * SIZE(A1)
|
|
+
|
|
+ ADD3 y0, t0, y0
|
|
+ unop
|
|
+ MUL alpha4, a5, t0
|
|
+ LD a5, 13 * SIZE(A2)
|
|
+
|
|
+ ADD4 y1, t1, y1
|
|
+ unop
|
|
+ MUL alpha4, a4, t1
|
|
+ LD a4, 12 * SIZE(A2)
|
|
+
|
|
+ ADD3 y2, t2, y2
|
|
+ unop
|
|
+ MUL alpha4, a7, t2
|
|
+ LD a7, 15 * SIZE(A2)
|
|
+
|
|
+ ADD4 y3, t3, y3
|
|
+ unop
|
|
+ MUL alpha4, a6, t3
|
|
+ LD a6, 14 * SIZE(A2)
|
|
+
|
|
+ ADD3 y0, t0, y0
|
|
+ unop
|
|
+ MUL alpha1, a0, t0
|
|
+ LD y4, 4 * SIZE(Y1)
|
|
+
|
|
+ ADD4 y1, t1, y1
|
|
+ ldi A2, 8 * SIZE(A2)
|
|
+ MUL alpha1, a1, t1
|
|
+ LD y5, 5 * SIZE(Y1)
|
|
+
|
|
+ ADD3 y2, t2, y2
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+ MUL alpha1, a2, t2
|
|
+ LD y6, 6 * SIZE(Y1)
|
|
+
|
|
+ ADD4 y3, t3, y3
|
|
+ MUL alpha1, a3, t3
|
|
+ LD y7, 7 * SIZE(Y1)
|
|
+ bgt I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ ADD1 y4, t0, y4
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ MUL alpha3, a4, t0
|
|
+ unop
|
|
+
|
|
+ ADD2 y5, t1, y5
|
|
+ ST y1, 1 * SIZE(Y1)
|
|
+ MUL alpha3, a5, t1
|
|
+ unop
|
|
+
|
|
+ ADD1 y6, t2, y6
|
|
+ ST y2, 2 * SIZE(Y1)
|
|
+ MUL alpha3, a6, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 y7, t3, y7
|
|
+ ST y3, 3 * SIZE(Y1)
|
|
+ MUL alpha3, a7, t3
|
|
+ unop
|
|
+
|
|
+ ADD1 y4, t0, y4
|
|
+ MUL alpha2, a1, t0
|
|
+ ADD2 y5, t1, y5
|
|
+ MUL alpha2, a0, t1
|
|
+
|
|
+ ADD1 y6, t2, y6
|
|
+ MUL alpha2, a3, t2
|
|
+ ADD2 y7, t3, y7
|
|
+ MUL alpha2, a2, t3
|
|
+
|
|
+ ADD3 y4, t0, y4
|
|
+ MUL alpha4, a5, t0
|
|
+ ADD4 y5, t1, y5
|
|
+ MUL alpha4, a4, t1
|
|
+
|
|
+ ADD3 y6, t2, y6
|
|
+ MUL alpha4, a7, t2
|
|
+ ADD4 y7, t3, y7
|
|
+ MUL alpha4, a6, t3
|
|
+
|
|
+ ADD3 y4, t0, y4
|
|
+ ADD4 y5, t1, y5
|
|
+ ADD3 y6, t2, y6
|
|
+ ADD4 y7, t3, y7
|
|
+
|
|
+ ST y4, 4 * SIZE(Y1)
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+ ST y5, 5 * SIZE(Y1)
|
|
+ ldi A2, 8 * SIZE(A2)
|
|
+
|
|
+ ST y6, 6 * SIZE(Y1)
|
|
+ unop
|
|
+ ST y7, 7 * SIZE(Y1)
|
|
+ ldi Y1, 8 * SIZE(Y1)
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and M, 2, I
|
|
+ ble I, $L17
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+ LD a2, 2 * SIZE(A1)
|
|
+ LD a3, 3 * SIZE(A1)
|
|
+
|
|
+ LD a4, 0 * SIZE(A2)
|
|
+ LD a5, 1 * SIZE(A2)
|
|
+ LD a6, 2 * SIZE(A2)
|
|
+ LD a7, 3 * SIZE(A2)
|
|
+
|
|
+ MUL alpha1, a0, t0
|
|
+ LD y0, 0 * SIZE(Y1)
|
|
+ MUL alpha1, a1, t1
|
|
+ LD y1, 1 * SIZE(Y1)
|
|
+ MUL alpha1, a2, t2
|
|
+ LD y2, 2 * SIZE(Y1)
|
|
+ MUL alpha1, a3, t3
|
|
+ LD y3, 3 * SIZE(Y1)
|
|
+
|
|
+ ADD1 y0, t0, y0
|
|
+ MUL alpha3, a4, t0
|
|
+ ADD2 y1, t1, y1
|
|
+ MUL alpha3, a5, t1
|
|
+ ADD1 y2, t2, y2
|
|
+ MUL alpha3, a6, t2
|
|
+ ADD2 y3, t3, y3
|
|
+ MUL alpha3, a7, t3
|
|
+
|
|
+ ADD1 y0, t0, y0
|
|
+ MUL alpha2, a1, t0
|
|
+ ADD2 y1, t1, y1
|
|
+ MUL alpha2, a0, t1
|
|
+
|
|
+ ADD1 y2, t2, y2
|
|
+ MUL alpha2, a3, t2
|
|
+ ADD2 y3, t3, y3
|
|
+ MUL alpha2, a2, t3
|
|
+
|
|
+ ADD3 y0, t0, y0
|
|
+ MUL alpha4, a5, t0
|
|
+ ADD4 y1, t1, y1
|
|
+ MUL alpha4, a4, t1
|
|
+
|
|
+ ADD3 y2, t2, y2
|
|
+ MUL alpha4, a7, t2
|
|
+ ADD4 y3, t3, y3
|
|
+ MUL alpha4, a6, t3
|
|
+
|
|
+ ADD3 y0, t0, y0
|
|
+ ADD4 y1, t1, y1
|
|
+ ADD3 y2, t2, y2
|
|
+ ADD4 y3, t3, y3
|
|
+
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ ldi A1, 4 * SIZE(A1)
|
|
+ ST y1, 1 * SIZE(Y1)
|
|
+ ldi A2, 4 * SIZE(A2)
|
|
+
|
|
+ ST y2, 2 * SIZE(Y1)
|
|
+ unop
|
|
+ ST y3, 3 * SIZE(Y1)
|
|
+ ldi Y1, 4 * SIZE(Y1)
|
|
+ .align 4
|
|
+
|
|
+$L17:
|
|
+ blbc M, $L18
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+ LD a2, 0 * SIZE(A2)
|
|
+ LD a3, 1 * SIZE(A2)
|
|
+
|
|
+ LD y0, 0 * SIZE(Y1)
|
|
+ LD y1, 1 * SIZE(Y1)
|
|
+
|
|
+ MUL alpha1, a0, t0
|
|
+ MUL alpha1, a1, t1
|
|
+
|
|
+ ADD1 y0, t0, y0
|
|
+ MUL alpha3, a2, t0
|
|
+ ADD2 y1, t1, y1
|
|
+ MUL alpha3, a3, t1
|
|
+
|
|
+ ADD1 y0, t0, y0
|
|
+ MUL alpha2, a1, t0
|
|
+ ADD2 y1, t1, y1
|
|
+ MUL alpha2, a0, t1
|
|
+
|
|
+ ADD3 y0, t0, y0
|
|
+ MUL alpha4, a3, t0
|
|
+ ADD4 y1, t1, y1
|
|
+ MUL alpha4, a2, t1
|
|
+
|
|
+ ADD3 y0, t0, y0
|
|
+ ADD4 y1, t1, y1
|
|
+
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ ST y1, 1 * SIZE(Y1)
|
|
+ .align 4
|
|
+
|
|
+$L18:
|
|
+ ldi J, -1(J)
|
|
+ bgt J, $L11
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ blbc N, $L990
|
|
+
|
|
+ LD alpha1, 0 * SIZE(X)
|
|
+ LD alpha2, 1 * SIZE(X)
|
|
+
|
|
+ MUL alpha_r, alpha1, y0
|
|
+ MUL alpha_r, alpha2, y1
|
|
+
|
|
+ MUL alpha_i, alpha2, t0
|
|
+ mov A, A1
|
|
+ MUL alpha_i, alpha1, t1
|
|
+ mov Y, Y1
|
|
+
|
|
+#ifndef XCONJ
|
|
+ SUB y0, t0, alpha1
|
|
+ ADD y1, t1, alpha2
|
|
+#else
|
|
+ ADD y0, t0, alpha1
|
|
+ SUB y1, t1, alpha2
|
|
+#endif
|
|
+
|
|
+ sra M, 2, I
|
|
+ ble I, $L25
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+ LD a2, 2 * SIZE(A1)
|
|
+ LD a3, 3 * SIZE(A1)
|
|
+
|
|
+ LD y0, 0 * SIZE(Y1)
|
|
+ LD y1, 1 * SIZE(Y1)
|
|
+ LD y2, 2 * SIZE(Y1)
|
|
+ LD y3, 3 * SIZE(Y1)
|
|
+
|
|
+ MUL alpha1, a0, t0
|
|
+ LD a4, 4 * SIZE(A1)
|
|
+ MUL alpha1, a1, t1
|
|
+ LD a5, 5 * SIZE(A1)
|
|
+ MUL alpha1, a2, t2
|
|
+ LD a6, 6 * SIZE(A1)
|
|
+ MUL alpha1, a3, t3
|
|
+ LD a7, 7 * SIZE(A1)
|
|
+
|
|
+ ADD1 y0, t0, y0
|
|
+ unop
|
|
+ MUL alpha2, a1, t0
|
|
+ LD a1, 9 * SIZE(A1)
|
|
+
|
|
+ ADD2 y1, t1, y1
|
|
+ unop
|
|
+ MUL alpha2, a0, t1
|
|
+ LD a0, 8 * SIZE(A1)
|
|
+
|
|
+ ADD1 y2, t2, y2
|
|
+ unop
|
|
+ MUL alpha2, a3, t2
|
|
+ LD a3, 11 * SIZE(A1)
|
|
+
|
|
+ ADD2 y3, t3, y3
|
|
+ unop
|
|
+ MUL alpha2, a2, t3
|
|
+ LD a2, 10 * SIZE(A1)
|
|
+
|
|
+ ADD3 y0, t0, y0
|
|
+ unop
|
|
+ LD y4, 4 * SIZE(Y1)
|
|
+ MUL alpha1, a4, t0
|
|
+
|
|
+ ADD4 y1, t1, y1
|
|
+ unop
|
|
+ LD y5, 5 * SIZE(Y1)
|
|
+ MUL alpha1, a5, t1
|
|
+
|
|
+ ADD3 y2, t2, y2
|
|
+ LD y6, 6 * SIZE(Y1)
|
|
+ MUL alpha1, a6, t2
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ADD4 y3, t3, y3
|
|
+ LD y7, 7 * SIZE(Y1)
|
|
+ MUL alpha1, a7, t3
|
|
+ ble I, $L23
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ ADD1 y4, t0, y4
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ MUL alpha2, a5, t0
|
|
+ LD a5, 13 * SIZE(A1)
|
|
+
|
|
+ ADD2 y5, t1, y5
|
|
+ ST y1, 1 * SIZE(Y1)
|
|
+ MUL alpha2, a4, t1
|
|
+ LD a4, 12 * SIZE(A1)
|
|
+
|
|
+ ADD1 y6, t2, y6
|
|
+ ST y2, 2 * SIZE(Y1)
|
|
+ MUL alpha2, a7, t2
|
|
+ LD a7, 15 * SIZE(A1)
|
|
+
|
|
+ ADD2 y7, t3, y7
|
|
+ ST y3, 3 * SIZE(Y1)
|
|
+ MUL alpha2, a6, t3
|
|
+ LD a6, 14 * SIZE(A1)
|
|
+
|
|
+ ADD3 y4, t0, y4
|
|
+ LD y0, 8 * SIZE(Y1)
|
|
+ MUL alpha1, a0, t0
|
|
+ fillcs (PREFETCHSIZE + 0) * SIZE(A1)
|
|
+
|
|
+ ADD4 y5, t1, y5
|
|
+ LD y1, 9 * SIZE(Y1)
|
|
+ MUL alpha1, a1, t1
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ADD3 y6, t2, y6
|
|
+ LD y2, 10 * SIZE(Y1)
|
|
+ MUL alpha1, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD4 y7, t3, y7
|
|
+ LD y3, 11 * SIZE(Y1)
|
|
+ MUL alpha1, a3, t3
|
|
+ unop
|
|
+
|
|
+ ADD1 y0, t0, y0
|
|
+ ST y4, 4 * SIZE(Y1)
|
|
+ MUL alpha2, a1, t0
|
|
+ LD a1, 17 * SIZE(A1)
|
|
+
|
|
+ ADD2 y1, t1, y1
|
|
+ ST y5, 5 * SIZE(Y1)
|
|
+ MUL alpha2, a0, t1
|
|
+ LD a0, 16 * SIZE(A1)
|
|
+
|
|
+ ADD1 y2, t2, y2
|
|
+ ST y6, 6 * SIZE(Y1)
|
|
+ MUL alpha2, a3, t2
|
|
+ LD a3, 19 * SIZE(A1)
|
|
+
|
|
+ ADD2 y3, t3, y3
|
|
+ ST y7, 7 * SIZE(Y1)
|
|
+ MUL alpha2, a2, t3
|
|
+ LD a2, 18 * SIZE(A1)
|
|
+
|
|
+ ADD3 y0, t0, y0
|
|
+ LD y4, 12 * SIZE(Y1)
|
|
+ MUL alpha1, a4, t0
|
|
+ fillcs (PREFETCHSIZE + 0) * SIZE(Y1)
|
|
+
|
|
+ ADD4 y1, t1, y1
|
|
+ LD y5, 13 * SIZE(Y1)
|
|
+ MUL alpha1, a5, t1
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+
|
|
+ ADD3 y2, t2, y2
|
|
+ LD y6, 14 * SIZE(Y1)
|
|
+ MUL alpha1, a6, t2
|
|
+ ldi Y1, 8 * SIZE(Y1)
|
|
+
|
|
+ ADD4 y3, t3, y3
|
|
+ LD y7, 7 * SIZE(Y1)
|
|
+ MUL alpha1, a7, t3
|
|
+ bgt I, $L22
|
|
+ .align 4
|
|
+
|
|
+$L23:
|
|
+ ADD1 y4, t0, y4
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ MUL alpha2, a5, t0
|
|
+ unop
|
|
+
|
|
+ ADD2 y5, t1, y5
|
|
+ ST y1, 1 * SIZE(Y1)
|
|
+ MUL alpha2, a4, t1
|
|
+ unop
|
|
+
|
|
+ ADD1 y6, t2, y6
|
|
+ ST y2, 2 * SIZE(Y1)
|
|
+ MUL alpha2, a7, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 y7, t3, y7
|
|
+ ST y3, 3 * SIZE(Y1)
|
|
+ MUL alpha2, a6, t3
|
|
+ unop
|
|
+
|
|
+ ADD3 y4, t0, y4
|
|
+ ADD4 y5, t1, y5
|
|
+ ADD3 y6, t2, y6
|
|
+ ADD4 y7, t3, y7
|
|
+
|
|
+ ST y4, 4 * SIZE(Y1)
|
|
+ unop
|
|
+ ST y5, 5 * SIZE(Y1)
|
|
+ unop
|
|
+
|
|
+ ST y6, 6 * SIZE(Y1)
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+ ST y7, 7 * SIZE(Y1)
|
|
+ ldi Y1, 8 * SIZE(Y1)
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ and M, 2, I
|
|
+ ble I, $L27
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+ LD a2, 2 * SIZE(A1)
|
|
+ LD a3, 3 * SIZE(A1)
|
|
+
|
|
+ MUL alpha1, a0, t0
|
|
+ LD y0, 0 * SIZE(Y1)
|
|
+ MUL alpha1, a1, t1
|
|
+ LD y1, 1 * SIZE(Y1)
|
|
+ MUL alpha1, a2, t2
|
|
+ LD y2, 2 * SIZE(Y1)
|
|
+ MUL alpha1, a3, t3
|
|
+ LD y3, 3 * SIZE(Y1)
|
|
+
|
|
+ ADD1 y0, t0, y0
|
|
+ MUL alpha2, a1, t0
|
|
+ ADD2 y1, t1, y1
|
|
+ MUL alpha2, a0, t1
|
|
+ ADD1 y2, t2, y2
|
|
+ MUL alpha2, a3, t2
|
|
+ ADD2 y3, t3, y3
|
|
+ MUL alpha2, a2, t3
|
|
+
|
|
+ ADD3 y0, t0, y0
|
|
+ ADD4 y1, t1, y1
|
|
+ ADD3 y2, t2, y2
|
|
+ ADD4 y3, t3, y3
|
|
+
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ ST y1, 1 * SIZE(Y1)
|
|
+
|
|
+ ST y2, 2 * SIZE(Y1)
|
|
+ ldi A1, 4 * SIZE(A1)
|
|
+ ST y3, 3 * SIZE(Y1)
|
|
+ ldi Y1, 4 * SIZE(Y1)
|
|
+ .align 4
|
|
+
|
|
+$L27:
|
|
+ blbc M, $L990
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+
|
|
+ MUL alpha1, a0, t0
|
|
+ LD y0, 0 * SIZE(Y1)
|
|
+ MUL alpha1, a1, t1
|
|
+ LD y1, 1 * SIZE(Y1)
|
|
+
|
|
+ ADD1 y0, t0, y0
|
|
+ MUL alpha2, a1, t0
|
|
+ ADD2 y1, t1, y1
|
|
+ MUL alpha2, a0, t1
|
|
+
|
|
+ ADD3 y0, t0, y0
|
|
+ ADD4 y1, t1, y1
|
|
+
|
|
+ ST y0, 0 * SIZE(Y1)
|
|
+ ST y1, 1 * SIZE(Y1)
|
|
+ .align 4
|
|
+
|
|
+$L990:
|
|
+ cmpeq INCY, 2 * SIZE, $0
|
|
+ bne $0, $L999
|
|
+
|
|
+ mov BUFFER, Y1
|
|
+
|
|
+ sra M, 2, I
|
|
+ ble I, $L995
|
|
+ .align 4
|
|
+
|
|
+$L992:
|
|
+ LD a0, 0 * SIZE(BUFFER)
|
|
+ LD a1, 1 * SIZE(BUFFER)
|
|
+ addl BUFFER, INCY, BUFFER
|
|
+ LD a2, 0 * SIZE(BUFFER)
|
|
+ LD a3, 1 * SIZE(BUFFER)
|
|
+ addl BUFFER, INCY, BUFFER
|
|
+
|
|
+ LD y0, 0 * SIZE(Y)
|
|
+ LD y1, 1 * SIZE(Y)
|
|
+ LD y2, 2 * SIZE(Y)
|
|
+ LD y3, 3 * SIZE(Y)
|
|
+
|
|
+ LD a4, 0 * SIZE(BUFFER)
|
|
+ LD a5, 1 * SIZE(BUFFER)
|
|
+ addl BUFFER, INCY, BUFFER
|
|
+ LD a6, 0 * SIZE(BUFFER)
|
|
+ LD a7, 1 * SIZE(BUFFER)
|
|
+ addl BUFFER, INCY, BUFFER
|
|
+
|
|
+ LD y4, 4 * SIZE(Y)
|
|
+ LD y5, 5 * SIZE(Y)
|
|
+ LD y6, 6 * SIZE(Y)
|
|
+ LD y7, 7 * SIZE(Y)
|
|
+
|
|
+ ADD a0, y0, a0
|
|
+ ADD a1, y1, a1
|
|
+ ADD a2, y2, a2
|
|
+ ADD a3, y3, a3
|
|
+
|
|
+ ST a0, 0 * SIZE(Y1)
|
|
+ ADD a4, y4, a4
|
|
+ ST a1, 1 * SIZE(Y1)
|
|
+ ADD a5, y5, a5
|
|
+ addl Y1, INCY, Y1
|
|
+
|
|
+ ST a2, 0 * SIZE(Y1)
|
|
+ ADD a6, y6, a6
|
|
+ ST a3, 1 * SIZE(Y1)
|
|
+ ADD a7, y7, a7
|
|
+ addl Y1, INCY, Y1
|
|
+
|
|
+ ST a4, 0 * SIZE(Y1)
|
|
+ ST a5, 1 * SIZE(Y1)
|
|
+ addl Y1, INCY, Y1
|
|
+ ST a6, 0 * SIZE(Y1)
|
|
+ ST a7, 1 * SIZE(Y1)
|
|
+ addl Y1, INCY, Y1
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ldi Y, 8 * SIZE(Y)
|
|
+ bgt I, $L992
|
|
+ .align 4
|
|
+
|
|
+$L995:
|
|
+ and M, 3, I
|
|
+ ble I, $L999
|
|
+ .align 4
|
|
+
|
|
+$L996:
|
|
+ LD a0, 0 * SIZE(BUFFER)
|
|
+ LD a1, 1 * SIZE(BUFFER)
|
|
+ addl BUFFER, INCY, BUFFER
|
|
+
|
|
+ LD y0, 0 * SIZE(Y)
|
|
+ LD y1, 1 * SIZE(Y)
|
|
+ ldi Y, 2 * SIZE(Y)
|
|
+
|
|
+ ADD a0, y0, a0
|
|
+ ADD a1, y1, a1
|
|
+
|
|
+ ST a0, 0 * SIZE(Y1)
|
|
+ ST a1, 1 * SIZE(Y1)
|
|
+ addl Y1, INCY, Y1
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L996
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 8($sp)
|
|
+ fldd $f4, 16($sp)
|
|
+ fldd $f5, 24($sp)
|
|
+ fldd $f6, 32($sp)
|
|
+ fldd $f7, 40($sp)
|
|
+ fldd $f8, 48($sp)
|
|
+ fldd $f9, 56($sp)
|
|
+
|
|
+ ldi $sp, STACKSIZE($sp)
|
|
+ ret
|
|
+ EPILOGUE
|
|
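The next file, kernel/sw_64/zgemv_t.S, is the transposed counterpart: for each column it accumulates a complex dot product s = A(:,j)^T * x (with conjugation again selected by the CONJ/XCONJ macro block), then applies alpha before updating y. A minimal plain-C sketch of the no-conjugation case, under the same assumptions and with the same caveats as the sketch above:

    #include <stddef.h>

    /* Reference model: y += alpha * A^T * x, column-major, interleaved (re, im) storage. */
    void zgemv_t_ref(size_t m, size_t n,
                     double alpha_r, double alpha_i,
                     const double *a, size_t lda,
                     const double *x, double *y)
    {
        for (size_t j = 0; j < n; j++) {
            /* s = dot(A(:,j), x) as a complex dot product */
            double sr = 0.0, si = 0.0;
            const double *col = a + 2 * j * lda;
            for (size_t i = 0; i < m; i++) {
                double ar = col[2 * i], ai = col[2 * i + 1];
                double xr = x[2 * i],  xi = x[2 * i + 1];
                sr += ar * xr - ai * xi;
                si += ar * xi + ai * xr;
            }
            /* y[j] += alpha * s */
            y[2 * j]     += alpha_r * sr - alpha_i * si;
            y[2 * j + 1] += alpha_r * si + alpha_i * sr;
        }
    }

In the assembly, s0..s3 hold the real/imaginary accumulators for the two columns processed per outer iteration, and the multiply by alpha happens at label $L18 just before the results are stored back through Y1.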
diff --git a/kernel/sw_64/zgemv_t.S b/kernel/sw_64/zgemv_t.S
|
|
new file mode 100644
|
|
index 0000000..bf31cb4
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/zgemv_t.S
|
|
@@ -0,0 +1,1047 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define STACKSIZE 64
|
|
+#define PREFETCHSIZE 32
|
|
+
|
|
+#define M $16
|
|
+#define N $17
|
|
+#define A $21
|
|
+#define LDA $18
|
|
+
|
|
+#define X $19
|
|
+#define INCX $20
|
|
+#define Y $22
|
|
+#define INCY $23
|
|
+
|
|
+#define BUFFER $24
|
|
+
|
|
+#define I $25
|
|
+#define J $27
|
|
+
|
|
+#define X1 $3
|
|
+#define Y1 $4
|
|
+#define A1 $5
|
|
+#define A2 $6
|
|
+
|
|
+#define alpha_r $f19
|
|
+#define alpha_i $f20
|
|
+
|
|
+#define s0 $f0
|
|
+#define s1 $f1
|
|
+#define s2 $f10
|
|
+#define s3 $f11
|
|
+
|
|
+#define t0 $f12
|
|
+#define t1 $f13
|
|
+#define t2 $f14
|
|
+#define t3 $f15
|
|
+
|
|
+#define x0 $f16
|
|
+#define x1 $f17
|
|
+#define x2 $f18
|
|
+#define x3 $f21
|
|
+
|
|
+#define a0 $f22
|
|
+#define a1 $f23
|
|
+#define a2 $f24
|
|
+#define a3 $f25
|
|
+#define a4 $f26
|
|
+#define a5 $f27
|
|
+#define a6 $f28
|
|
+#define a7 $f29
|
|
+
|
|
+#define a8 $f2
|
|
+#define a9 $f3
|
|
+#define a10 $f4
|
|
+#define a11 $f5
|
|
+#define a12 $f6
|
|
+#define a13 $f7
|
|
+#define a14 $f8
|
|
+#define a15 $f9
|
|
+
|
|
+#if !defined(CONJ) && !defined(XCONJ)
|
|
+#define ADD1 ADD
|
|
+#define ADD2 ADD
|
|
+#define ADD3 SUB
|
|
+#define ADD4 ADD
|
|
+#elif !defined(CONJ) && defined(XCONJ)
|
|
+#define ADD1 ADD
|
|
+#define ADD2 ADD
|
|
+#define ADD3 ADD
|
|
+#define ADD4 SUB
|
|
+#elif defined(CONJ) && !defined(XCONJ)
|
|
+#define ADD1 ADD
|
|
+#define ADD2 SUB
|
|
+#define ADD3 ADD
|
|
+#define ADD4 ADD
|
|
+#else
|
|
+#define ADD1 ADD
|
|
+#define ADD2 SUB
|
|
+#define ADD3 SUB
|
|
+#define ADD4 SUB
|
|
+#endif
|
|
+
|
|
+ PROLOGUE
|
|
+
|
|
+ ldi $sp, -STACKSIZE($sp)
|
|
+ ldl LDA, 0 + STACKSIZE($sp)
|
|
+ ldl X, 8 + STACKSIZE($sp)
|
|
+ ldl INCX, 16 + STACKSIZE($sp)
|
|
+ ldl Y, 24 + STACKSIZE($sp)
|
|
+ ldl INCY, 32 + STACKSIZE($sp)
|
|
+ ldl BUFFER, 40 + STACKSIZE($sp)
|
|
+
|
|
+ fstd $f2, 0($sp)
|
|
+ fstd $f3, 8($sp)
|
|
+ fstd $f4, 16($sp)
|
|
+ fstd $f5, 24($sp)
|
|
+ fstd $f6, 32($sp)
|
|
+ fstd $f7, 40($sp)
|
|
+ fstd $f8, 48($sp)
|
|
+ fstd $f9, 56($sp)
|
|
+
|
|
+ PROFCODE
|
|
+
|
|
+ cmple M, 0, $0
|
|
+ sll INCX, ZBASE_SHIFT, INCX
|
|
+ cmple N, 0, $1
|
|
+ sll INCY, ZBASE_SHIFT, INCY
|
|
+
|
|
+ or $0, $1, $0
|
|
+ bne $0, $L999
|
|
+
|
|
+ cmpeq INCX, 2 * SIZE, $0
|
|
+ mov X, X1
|
|
+ sll LDA, ZBASE_SHIFT,LDA
|
|
+ bne $0, $L10
|
|
+
|
|
+ sra M, 2, I
|
|
+ mov BUFFER, Y1
|
|
+ mov BUFFER, X
|
|
+ ble I, $L05
|
|
+ .align 4
|
|
+
|
|
+$L02:
|
|
+ fillcs (PREFETCHSIZE + 0) * SIZE(X1)
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ LD a0, 0 * SIZE(X1)
|
|
+ LD a1, 1 * SIZE(X1)
|
|
+ addl X1, INCX, X1
|
|
+ LD a2, 0 * SIZE(X1)
|
|
+ LD a3, 1 * SIZE(X1)
|
|
+ addl X1, INCX, X1
|
|
+
|
|
+ ST a0, 0 * SIZE(Y1)
|
|
+ ST a1, 1 * SIZE(Y1)
|
|
+ ST a2, 2 * SIZE(Y1)
|
|
+ ST a3, 3 * SIZE(Y1)
|
|
+
|
|
+ LD a4, 0 * SIZE(X1)
|
|
+ LD a5, 1 * SIZE(X1)
|
|
+ addl X1, INCX, X1
|
|
+ LD a6, 0 * SIZE(X1)
|
|
+ LD a7, 1 * SIZE(X1)
|
|
+ addl X1, INCX, X1
|
|
+
|
|
+ ST a4, 4 * SIZE(Y1)
|
|
+ ST a5, 5 * SIZE(Y1)
|
|
+ ST a6, 6 * SIZE(Y1)
|
|
+ ST a7, 7 * SIZE(Y1)
|
|
+
|
|
+ ldi Y1, 8 * SIZE(Y1)
|
|
+ bgt I, $L02
|
|
+ .align 4
|
|
+
|
|
+$L05:
|
|
+ and M, 3, I
|
|
+ ble I, $L10
|
|
+ .align 4
|
|
+
|
|
+$L06:
|
|
+ LD a0, 0 * SIZE(X1)
|
|
+ LD a1, 1 * SIZE(X1)
|
|
+ addl X1, INCX, X1
|
|
+
|
|
+ ST a0, 0 * SIZE(Y1)
|
|
+ ST a1, 1 * SIZE(Y1)
|
|
+ ldi Y1, 2 * SIZE(Y1)
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L06
|
|
+ .align 4
|
|
+
|
|
+$L10:
|
|
+ mov Y, Y1
|
|
+ fclr t0
|
|
+ unop
|
|
+ fclr t1
|
|
+
|
|
+ sra N, 1, J
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ ble J, $L20
|
|
+ .align 4
|
|
+
|
|
+$L11:
|
|
+ mov A, A1
|
|
+ fclr s0
|
|
+ addl A, LDA, A2
|
|
+ fclr s1
|
|
+
|
|
+ addl A2, LDA, A
|
|
+ unop
|
|
+ mov X, X1
|
|
+ fillcs 3 * SIZE(Y)
|
|
+
|
|
+ sra M, 2, I
|
|
+ fclr s2
|
|
+ fclr s3
|
|
+ ble I, $L15
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+ LD a2, 0 * SIZE(A2)
|
|
+ LD a3, 1 * SIZE(A2)
|
|
+ LD a4, 2 * SIZE(A1)
|
|
+ LD a5, 3 * SIZE(A1)
|
|
+ LD a6, 2 * SIZE(A2)
|
|
+ LD a7, 3 * SIZE(A2)
|
|
+
|
|
+ LD a8, 4 * SIZE(A1)
|
|
+ LD a9, 5 * SIZE(A1)
|
|
+ LD a10, 4 * SIZE(A2)
|
|
+ LD a11, 5 * SIZE(A2)
|
|
+ LD a12, 6 * SIZE(A1)
|
|
+ LD a13, 7 * SIZE(A1)
|
|
+ LD a14, 6 * SIZE(A2)
|
|
+ LD a15, 7 * SIZE(A2)
|
|
+
|
|
+ LD x0, 0 * SIZE(X1)
|
|
+ LD x1, 1 * SIZE(X1)
|
|
+ LD x2, 2 * SIZE(X1)
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ ADD3 s0, t0, $f30
|
|
+ fmov $f30, s0
|
|
+ unop
|
|
+ MUL x0, a0, t0
|
|
+ LD x3, 3 * SIZE(X1)
|
|
+
|
|
+ ADD4 s1, t1, $f30
|
|
+ fmov $f30, s1
|
|
+ fillcs (PREFETCHSIZE + 0) * SIZE(A1)
|
|
+ MUL x0, a1, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 s2, t2, $f30
|
|
+ fmov $f30, s2
|
|
+ unop
|
|
+ MUL x0, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD4 s3, t3, $f30
|
|
+ fmov $f30, s3
|
|
+ unop
|
|
+ MUL x0, a3, t3
|
|
+ LD x0, 4 * SIZE(X1)
|
|
+
|
|
+ ADD1 s0, t0, $f30
|
|
+ fmov $f30, s0
|
|
+ unop
|
|
+ MUL x1, a1, t0
|
|
+ LD a1, 9 * SIZE(A1)
|
|
+
|
|
+ ADD2 s1, t1, $f30
|
|
+ fmov $f30, s1
|
|
+ unop
|
|
+ MUL x1, a0, t1
|
|
+ LD a0, 8 * SIZE(A1)
|
|
+
|
|
+ ADD1 s2, t2, $f30
|
|
+ fmov $f30, s2
|
|
+ unop
|
|
+ MUL x1, a3, t2
|
|
+ LD a3, 9 * SIZE(A2)
|
|
+
|
|
+ ADD2 s3, t3, $f30
|
|
+ fmov $f30, s3
|
|
+ unop
|
|
+ MUL x1, a2, t3
|
|
+ LD a2, 8 * SIZE(A2)
|
|
+
|
|
+ ADD3 s0, t0, $f30
|
|
+ fmov $f30, s0
|
|
+ unop
|
|
+ MUL x2, a4, t0
|
|
+ LD x1, 5 * SIZE(X1)
|
|
+
|
|
+ ADD4 s1, t1, $f30
|
|
+ fmov $f30, s1
|
|
+ MUL x2, a5, t1
|
|
+ ADD3 s2, t2, $f30
|
|
+ fmov $f30, s2
|
|
+ MUL x2, a6, t2
|
|
+
|
|
+ ADD4 s3, t3, $f30
|
|
+ fmov $f30, s3
|
|
+ unop
|
|
+ MUL x2, a7, t3
|
|
+ LD x2, 6 * SIZE(X1)
|
|
+
|
|
+ ADD1 s0, t0, $f30
|
|
+ fmov $f30, s0
|
|
+ unop
|
|
+ MUL x3, a5, t0
|
|
+ LD a5, 11 * SIZE(A1)
|
|
+
|
|
+ ADD2 s1, t1, $f30
|
|
+ fmov $f30, s1
|
|
+ unop
|
|
+ MUL x3, a4, t1
|
|
+ LD a4, 10 * SIZE(A1)
|
|
+
|
|
+ ADD1 s2, t2, $f30
|
|
+ fmov $f30, s2
|
|
+ unop
|
|
+ MUL x3, a7, t2
|
|
+ LD a7, 11 * SIZE(A2)
|
|
+
|
|
+ ADD2 s3, t3, $f30
|
|
+ fmov $f30, s3
|
|
+ unop
|
|
+ MUL x3, a6, t3
|
|
+ LD a6, 10 * SIZE(A2)
|
|
+
|
|
+ ADD3 s0, t0, $f30
|
|
+ fmov $f30, s0
|
|
+ unop
|
|
+ MUL x0, a8, t0
|
|
+ LD x3, 7 * SIZE(X1)
|
|
+
|
|
+ ADD4 s1, t1, $f30
|
|
+ fmov $f30, s1
|
|
+ fillcs (PREFETCHSIZE + 0) * SIZE(A2)
|
|
+ MUL x0, a9, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 s2, t2, $f30
|
|
+ fmov $f30, s2
|
|
+ ldi I, -1(I)
|
|
+ MUL x0, a10, t2
|
|
+ unop
|
|
+
|
|
+ ADD4 s3, t3, $f30
|
|
+ fmov $f30, s3
|
|
+ unop
|
|
+ MUL x0, a11, t3
|
|
+ LD x0, 8 * SIZE(X1)
|
|
+
|
|
+ ADD1 s0, t0, $f30
|
|
+ fmov $f30, s0
|
|
+ unop
|
|
+ MUL x1, a9, t0
|
|
+ LD a9, 13 * SIZE(A1)
|
|
+
|
|
+ ADD2 s1, t1, $f30
|
|
+ fmov $f30, s1
|
|
+ unop
|
|
+ MUL x1, a8, t1
|
|
+ LD a8, 12 * SIZE(A1)
|
|
+
|
|
+ ADD1 s2, t2, $f30
|
|
+ fmov $f30, s2
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+ MUL x1, a11, t2
|
|
+ LD a11, 13 * SIZE(A2)
|
|
+
|
|
+ ADD2 s3, t3, $f30
|
|
+ fmov $f30, s3
|
|
+ unop
|
|
+ MUL x1, a10, t3
|
|
+ LD a10, 12 * SIZE(A2)
|
|
+
|
|
+ ADD3 s0, t0, $f30
|
|
+ fmov $f30, s0
|
|
+ unop
|
|
+ MUL x2, a12, t0
|
|
+ LD x1, 9 * SIZE(X1)
|
|
+
|
|
+ ADD4 s1, t1, $f30
|
|
+ fmov $f30, s1
|
|
+ fillcs (PREFETCHSIZE + 0) * SIZE(X1)
|
|
+ MUL x2, a13, t1
|
|
+ ldi A2, 8 * SIZE(A2)
|
|
+
|
|
+ ADD3 s2, t2, $f30
|
|
+ fmov $f30, s2
|
|
+ unop
|
|
+ MUL x2, a14, t2
|
|
+ unop
|
|
+
|
|
+ ADD4 s3, t3, $f30
|
|
+ fmov $f30, s3
|
|
+ unop
|
|
+ MUL x2, a15, t3
|
|
+ LD x2, 10 * SIZE(X1)
|
|
+
|
|
+ ADD1 s0, t0, $f30
|
|
+ fmov $f30, s0
|
|
+ unop
|
|
+ MUL x3, a13, t0
|
|
+ LD a13, 7 * SIZE(A1)
|
|
+
|
|
+ ADD2 s1, t1, $f30
|
|
+ fmov $f30, s1
|
|
+ ldi X1, 8 * SIZE(X1)
|
|
+ MUL x3, a12, t1
|
|
+ LD a12, 6 * SIZE(A1)
|
|
+
|
|
+ ADD1 s2, t2, $f30
|
|
+ fmov $f30, s2
|
|
+ unop
|
|
+ MUL x3, a15, t2
|
|
+ LD a15, 7 * SIZE(A2)
|
|
+
|
|
+ ADD2 s3, t3, $f30
|
|
+ fmov $f30, s3
|
|
+ MUL x3, a14, t3
|
|
+ LD a14, 6 * SIZE(A2)
|
|
+ bgt I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ ADD3 s0, t0, $f30
|
|
+ fmov $f30, s0
|
|
+ unop
|
|
+ MUL x0, a0, t0
|
|
+ LD x3, 3 * SIZE(X1)
|
|
+
|
|
+ ADD4 s1, t1, $f30
|
|
+ fmov $f30, s1
|
|
+ MUL x0, a1, t1
|
|
+ ADD3 s2, t2, $f30
|
|
+ fmov $f30, s2
|
|
+ MUL x0, a2, t2
|
|
+
|
|
+ ADD4 s3, t3, $f30
|
|
+ fmov $f30, s3
|
|
+ unop
|
|
+ MUL x0, a3, t3
|
|
+ LD x0, 4 * SIZE(X1)
|
|
+
|
|
+ ADD1 s0, t0, $f30
|
|
+ fmov $f30, s0
|
|
+ MUL x1, a1, t0
|
|
+ ADD2 s1, t1, $f30
|
|
+ fmov $f30, s1
|
|
+ MUL x1, a0, t1
|
|
+
|
|
+ ADD1 s2, t2, $f30
|
|
+ fmov $f30, s2
|
|
+ unop
|
|
+ MUL x1, a3, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 s3, t3, $f30
|
|
+ fmov $f30, s3
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+ MUL x1, a2, t3
|
|
+ LD x1, 5 * SIZE(X1)
|
|
+
|
|
+ ADD3 s0, t0, $f30
|
|
+ fmov $f30, s0
|
|
+ MUL x2, a4, t0
|
|
+ ADD4 s1, t1, $f30
|
|
+ fmov $f30, s1
|
|
+ MUL x2, a5, t1
|
|
+
|
|
+ ADD3 s2, t2, $f30
|
|
+ fmov $f30, s2
|
|
+ unop
|
|
+ MUL x2, a6, t2
|
|
+ unop
|
|
+
|
|
+ ADD4 s3, t3, $f30
|
|
+ fmov $f30, s3
|
|
+ ldi A2, 8 * SIZE(A2)
|
|
+ MUL x2, a7, t3
|
|
+ LD x2, 6 * SIZE(X1)
|
|
+
|
|
+ ADD1 s0, t0, $f30
|
|
+ fmov $f30, s0
|
|
+ MUL x3, a5, t0
|
|
+ ADD2 s1, t1, $f30
|
|
+ fmov $f30, s1
|
|
+ MUL x3, a4, t1
|
|
+
|
|
+ ADD1 s2, t2, $f30
|
|
+ fmov $f30, s2
|
|
+ unop
|
|
+ MUL x3, a7, t2
|
|
+ ldi X1, 8 * SIZE(X1)
|
|
+
|
|
+ ADD2 s3, t3, $f30
|
|
+ fmov $f30, s3
|
|
+ unop
|
|
+ MUL x3, a6, t3
|
|
+ LD x3, -1 * SIZE(X1)
|
|
+
|
|
+ ADD3 s0, t0, $f30
|
|
+ fmov $f30, s0
|
|
+ MUL x0, a8, t0
|
|
+ ADD4 s1, t1, $f30
|
|
+ fmov $f30, s1
|
|
+ MUL x0, a9, t1
|
|
+
|
|
+ ADD3 s2, t2, $f30
|
|
+ fmov $f30, s2
|
|
+ MUL x0, a10, t2
|
|
+ ADD4 s3, t3, $f30
|
|
+ fmov $f30, s3
|
|
+ MUL x0, a11, t3
|
|
+
|
|
+ ADD1 s0, t0, $f30
|
|
+ fmov $f30, s0
|
|
+ MUL x1, a9, t0
|
|
+ ADD2 s1, t1, $f30
|
|
+ fmov $f30, s1
|
|
+ MUL x1, a8, t1
|
|
+
|
|
+ ADD1 s2, t2, $f30
|
|
+ fmov $f30, s2
|
|
+ MUL x1, a11, t2
|
|
+ ADD2 s3, t3, $f30
|
|
+ fmov $f30, s3
|
|
+ MUL x1, a10, t3
|
|
+
|
|
+ ADD3 s0, t0, $f30
|
|
+ fmov $f30, s0
|
|
+ MUL x2, a12, t0
|
|
+ ADD4 s1, t1, $f30
|
|
+ fmov $f30, s1
|
|
+ MUL x2, a13, t1
|
|
+
|
|
+ ADD3 s2, t2, $f30
|
|
+ fmov $f30, s2
|
|
+ MUL x2, a14, t2
|
|
+ ADD4 s3, t3, $f30
|
|
+ fmov $f30, s3
|
|
+ MUL x2, a15, t3
|
|
+
|
|
+ ADD1 s0, t0, $f30
|
|
+ fmov $f30, s0
|
|
+ MUL x3, a13, t0
|
|
+ ADD2 s1, t1, $f30
|
|
+ fmov $f30, s1
|
|
+ MUL x3, a12, t1
|
|
+
|
|
+ ADD1 s2, t2, $f30
|
|
+ fmov $f30, s2
|
|
+ MUL x3, a15, t2
|
|
+ ADD2 s3, t3, $f30
|
|
+ fmov $f30, s3
|
|
+ MUL x3, a14, t3
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and M, 3, I
|
|
+ ble I, $L18
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+ LD a2, 0 * SIZE(A2)
|
|
+ LD a3, 1 * SIZE(A2)
|
|
+
|
|
+ LD x0, 0 * SIZE(X1)
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L17
|
|
+ .align 4
|
|
+
|
|
+$L16:
|
|
+ ADD3 s0, t0, $f30
|
|
+ fmov $f30, s0
|
|
+ ldi I, -1(I)
|
|
+ MUL x0, a0, t0
|
|
+ LD x1, 1 * SIZE(X1)
|
|
+
|
|
+ ADD4 s1, t1, $f30
|
|
+ fmov $f30, s1
|
|
+ MUL x0, a1, t1
|
|
+ ADD3 s2, t2, $f30
|
|
+ fmov $f30, s2
|
|
+ MUL x0, a2, t2
|
|
+
|
|
+ ADD4 s3, t3, $f30
|
|
+ fmov $f30, s3
|
|
+ unop
|
|
+ MUL x0, a3, t3
|
|
+ LD x0, 2 * SIZE(X1)
|
|
+
|
|
+ ADD1 s0, t0, $f30
|
|
+ fmov $f30, s0
|
|
+ ldi A2, 2 * SIZE(A2)
|
|
+ MUL x1, a1, t0
|
|
+ LD a1, 3 * SIZE(A1)
|
|
+
|
|
+ ADD2 s1, t1, $f30
|
|
+ fmov $f30, s1
|
|
+ ldi X1, 2 * SIZE(X1)
|
|
+ MUL x1, a0, t1
|
|
+ LD a0, 2 * SIZE(A1)
|
|
+
|
|
+ ADD1 s2, t2, $f30
|
|
+ fmov $f30, s2
|
|
+ ldi A1, 2 * SIZE(A1)
|
|
+ MUL x1, a3, t2
|
|
+ LD a3, 1 * SIZE(A2)
|
|
+
|
|
+ ADD2 s3, t3, $f30
|
|
+ fmov $f30, s3
|
|
+ MUL x1, a2, t3
|
|
+ LD a2, 0 * SIZE(A2)
|
|
+ bgt I, $L16
|
|
+ .align 4
|
|
+
|
|
+$L17:
|
|
+ ADD3 s0, t0, $f30
|
|
+ fmov $f30, s0
|
|
+ unop
|
|
+ MUL x0, a0, t0
|
|
+ LD x1, 1 * SIZE(X1)
|
|
+
|
|
+ ADD4 s1, t1, $f30
|
|
+ fmov $f30, s1
|
|
+ unop
|
|
+ MUL x0, a1, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 s2, t2, $f30
|
|
+ fmov $f30, s2
|
|
+ MUL x0, a2, t2
|
|
+ ADD4 s3, t3, $f30
|
|
+ fmov $f30, s3
|
|
+ MUL x0, a3, t3
|
|
+
|
|
+ ADD1 s0, t0, $f30
|
|
+ fmov $f30, s0
|
|
+ MUL x1, a1, t0
|
|
+ ADD2 s1, t1, $f30
|
|
+ fmov $f30, s1
|
|
+ MUL x1, a0, t1
|
|
+
|
|
+ ADD1 s2, t2, $f30
|
|
+ fmov $f30, s2
|
|
+ MUL x1, a3, t2
|
|
+ ADD2 s3, t3, $f30
|
|
+ fmov $f30, s3
|
|
+ MUL x1, a2, t3
|
|
+ .align 4
|
|
+
|
|
+$L18:
|
|
+ LD a0, 0 * SIZE(Y)
|
|
+ unop
|
|
+ LD a1, 1 * SIZE(Y)
|
|
+ addl Y, INCY, Y
|
|
+
|
|
+ LD a2, 0 * SIZE(Y)
|
|
+ unop
|
|
+ LD a3, 1 * SIZE(Y)
|
|
+ addl Y, INCY, Y
|
|
+
|
|
+ ADD3 s0, t0, a8
|
|
+ ADD4 s1, t1, a9
|
|
+ ADD3 s2, t2, a10
|
|
+ ADD4 s3, t3, a11
|
|
+
|
|
+ fmov a8, s0
|
|
+ fmov a9, s1
|
|
+ fmov a10, s2
|
|
+ fmov a11, s3
|
|
+
|
|
+ MUL alpha_r, s0, t0
|
|
+ MUL alpha_r, s1, t1
|
|
+ MUL alpha_r, s2, t2
|
|
+ MUL alpha_r, s3, t3
|
|
+
|
|
+ ADD a0, t0, a8
|
|
+ MUL alpha_i, s1, t0
|
|
+ ADD a1, t1, a9
|
|
+ MUL alpha_i, s0, t1
|
|
+ ADD a2, t2, a10
|
|
+ MUL alpha_i, s3, t2
|
|
+ ADD a3, t3, a11
|
|
+ MUL alpha_i, s2, t3
|
|
+
|
|
+ SUB a8, t0, a0
|
|
+ ADD a9, t1, a1
|
|
+ SUB a10, t2, a2
|
|
+ ADD a11, t3, a3
|
|
+
|
|
+ ST a0, 0 * SIZE(Y1)
|
|
+ fclr t0
|
|
+ ST a1, 1 * SIZE(Y1)
|
|
+ addl Y1, INCY, Y1
|
|
+
|
|
+ ST a2, 0 * SIZE(Y1)
|
|
+ fclr t1
|
|
+ ST a3, 1 * SIZE(Y1)
|
|
+ addl Y1, INCY, Y1
|
|
+
|
|
+ fclr t2
|
|
+ ldi J, -1(J)
|
|
+ fclr t3
|
|
+ bgt J, $L11
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ blbc N, $L999
|
|
+
|
|
+ mov A, A1
|
|
+ fclr s0
|
|
+ fclr s1
|
|
+ mov X, X1
|
|
+
|
|
+ sra M, 2, I
|
|
+ fclr s2
|
|
+ fclr s3
|
|
+ ble I, $L25
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+ LD a4, 2 * SIZE(A1)
|
|
+ LD a5, 3 * SIZE(A1)
|
|
+ LD a8, 4 * SIZE(A1)
|
|
+ LD a9, 5 * SIZE(A1)
|
|
+ LD a12, 6 * SIZE(A1)
|
|
+ LD a13, 7 * SIZE(A1)
|
|
+
|
|
+ LD x0, 0 * SIZE(X1)
|
|
+ LD x1, 1 * SIZE(X1)
|
|
+ LD x2, 2 * SIZE(X1)
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L23
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ ADD3 s0, t0, $f30
|
|
+ fmov $f30, s0
|
|
+ fillcs (PREFETCHSIZE + 0) * SIZE(A1)
|
|
+ MUL x0, a0, t0
|
|
+ LD x3, 3 * SIZE(X1)
|
|
+
|
|
+ ADD4 s1, t1, $f30
|
|
+ fmov $f30, s1
|
|
+ unop
|
|
+ MUL x0, a1, t1
|
|
+ LD x0, 4 * SIZE(X1)
|
|
+
|
|
+ ADD1 s2, t0, $f30
|
|
+ fmov $f30, s2
|
|
+ ldi I, -1(I)
|
|
+ MUL x1, a1, t0
|
|
+ LD a1, 9 * SIZE(A1)
|
|
+
|
|
+ ADD2 s3, t1, $f30
|
|
+ fmov $f30, s3
|
|
+ unop
|
|
+ MUL x1, a0, t1
|
|
+ LD a0, 8 * SIZE(A1)
|
|
+
|
|
+ ADD3 s0, t0, $f30
|
|
+ fmov $f30, s0
|
|
+ unop
|
|
+ MUL x2, a4, t0
|
|
+ LD x1, 5 * SIZE(X1)
|
|
+
|
|
+ ADD4 s1, t1, $f30
|
|
+ fmov $f30, s1
|
|
+ unop
|
|
+ MUL x2, a5, t1
|
|
+ LD x2, 6 * SIZE(X1)
|
|
+
|
|
+ ADD1 s2, t0, $f30
|
|
+ fmov $f30, s2
|
|
+ unop
|
|
+ MUL x3, a5, t0
|
|
+ LD a5, 11 * SIZE(A1)
|
|
+
|
|
+ ADD2 s3, t1, $f30
|
|
+ fmov $f30, s3
|
|
+ unop
|
|
+ MUL x3, a4, t1
|
|
+ LD a4, 10 * SIZE(A1)
|
|
+
|
|
+ ADD3 s0, t0, $f30
|
|
+ fmov $f30, s0
|
|
+ unop
|
|
+ MUL x0, a8, t0
|
|
+ LD x3, 7 * SIZE(X1)
|
|
+
|
|
+ ADD4 s1, t1, $f30
|
|
+ fmov $f30, s1
|
|
+ unop
|
|
+ MUL x0, a9, t1
|
|
+ LD x0, 8 * SIZE(X1)
|
|
+
|
|
+ ADD1 s2, t0, $f30
|
|
+ fmov $f30, s2
|
|
+ unop
|
|
+ MUL x1, a9, t0
|
|
+ LD a9, 13 * SIZE(A1)
|
|
+
|
|
+ ADD2 s3, t1, $f30
|
|
+ fmov $f30, s3
|
|
+ unop
|
|
+ MUL x1, a8, t1
|
|
+ LD a8, 12 * SIZE(A1)
|
|
+
|
|
+ ADD3 s0, t0, $f30
|
|
+ fmov $f30, s0
|
|
+ unop
|
|
+ MUL x2, a12, t0
|
|
+ LD x1, 9 * SIZE(X1)
|
|
+
|
|
+ ADD4 s1, t1, $f30
|
|
+ fmov $f30, s1
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+ MUL x2, a13, t1
|
|
+ LD x2, 10 * SIZE(X1)
|
|
+
|
|
+ ADD1 s2, t0, $f30
|
|
+ fmov $f30, s2
|
|
+ ldi X1, 8 * SIZE(X1)
|
|
+ MUL x3, a13, t0
|
|
+ LD a13, 7 * SIZE(A1)
|
|
+
|
|
+ ADD2 s3, t1, $f30
|
|
+ fmov $f30, s3
|
|
+ MUL x3, a12, t1
|
|
+ LD a12, 6 * SIZE(A1)
|
|
+ bgt I, $L22
|
|
+ .align 4
|
|
+
|
|
+$L23:
|
|
+ ADD3 s0, t0, $f30
|
|
+ fmov $f30, s0
|
|
+ unop
|
|
+ MUL x0, a0, t0
|
|
+ LD x3, 3 * SIZE(X1)
|
|
+
|
|
+ ADD4 s1, t1, $f30
|
|
+ fmov $f30, s1
|
|
+ unop
|
|
+ MUL x0, a1, t1
|
|
+ LD x0, 4 * SIZE(X1)
|
|
+
|
|
+ ADD1 s2, t0, $f30
|
|
+ fmov $f30, s2
|
|
+ unop
|
|
+ MUL x1, a1, t0
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+
|
|
+ ADD2 s3, t1, $f30
|
|
+ fmov $f30, s3
|
|
+ unop
|
|
+ MUL x1, a0, t1
|
|
+ LD x1, 5 * SIZE(X1)
|
|
+
|
|
+ ADD3 s0, t0, $f30
|
|
+ fmov $f30, s0
|
|
+ unop
|
|
+ MUL x2, a4, t0
|
|
+ unop
|
|
+
|
|
+ ADD4 s1, t1, $f30
|
|
+ fmov $f30, s1
|
|
+ unop
|
|
+ MUL x2, a5, t1
|
|
+ LD x2, 6 * SIZE(X1)
|
|
+
|
|
+ ADD1 s2, t0, $f30
|
|
+ fmov $f30, s2
|
|
+ unop
|
|
+ MUL x3, a5, t0
|
|
+ ldi X1, 8 * SIZE(X1)
|
|
+
|
|
+ ADD2 s3, t1, $f30
|
|
+ fmov $f30, s3
|
|
+ unop
|
|
+ MUL x3, a4, t1
|
|
+ LD x3, -1 * SIZE(X1)
|
|
+
|
|
+ ADD3 s0, t0, $f30
|
|
+ fmov $f30, s0
|
|
+ MUL x0, a8, t0
|
|
+ ADD4 s1, t1, $f30
|
|
+ fmov $f30, s1
|
|
+ MUL x0, a9, t1
|
|
+
|
|
+ ADD1 s2, t0, $f30
|
|
+ fmov $f30, s2
|
|
+ MUL x1, a9, t0
|
|
+ ADD2 s3, t1, $f30
|
|
+ fmov $f30, s3
|
|
+ MUL x1, a8, t1
|
|
+
|
|
+ ADD3 s0, t0, $f30
|
|
+ fmov $f30, s0
|
|
+ MUL x2, a12, t0
|
|
+ ADD4 s1, t1, $f30
|
|
+ fmov $f30, s1
|
|
+ MUL x2, a13, t1
|
|
+
|
|
+ ADD1 s2, t0, $f30
|
|
+ fmov $f30, s2
|
|
+ MUL x3, a13, t0
|
|
+ ADD2 s3, t1, $f30
|
|
+ fmov $f30, s3
|
|
+ MUL x3, a12, t1
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ and M, 3, I
|
|
+ ble I, $L28
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+
|
|
+ LD x0, 0 * SIZE(X1)
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L27
|
|
+ .align 4
|
|
+
|
|
+$L26:
|
|
+ ADD3 s0, t0, $f30
|
|
+ fmov $f30, s0
|
|
+ ldi A1, 2 * SIZE(A1)
|
|
+ MUL x0, a0, t0
|
|
+ LD x1, 1 * SIZE(X1)
|
|
+
|
|
+ ADD4 s1, t1, $f30
|
|
+ fmov $f30, s1
|
|
+ ldi I, -1(I)
|
|
+ MUL x0, a1, t1
|
|
+ LD x0, 2 * SIZE(X1)
|
|
+
|
|
+ ADD1 s0, t0, $f30
|
|
+ fmov $f30, s0
|
|
+ ldi X1, 2 * SIZE(X1)
|
|
+ MUL x1, a1, t0
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+
|
|
+ ADD2 s1, t1, $f30
|
|
+ fmov $f30, s1
|
|
+ MUL x1, a0, t1
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ bgt I, $L26
|
|
+ .align 4
|
|
+
|
|
+$L27:
|
|
+ ADD3 s0, t0, $f30
|
|
+ fmov $f30, s0
|
|
+ unop
|
|
+ MUL x0, a0, t0
|
|
+ LD x1, 1 * SIZE(X1)
|
|
+
|
|
+ ADD4 s1, t1, $f30
|
|
+ fmov $f30, s1
|
|
+ unop
|
|
+ MUL x0, a1, t1
|
|
+ unop
|
|
+
|
|
+ ADD1 s0, t0, $f30
|
|
+ fmov $f30, s0
|
|
+ MUL x1, a1, t0
|
|
+ ADD2 s1, t1, $f30
|
|
+ fmov $f30, s1
|
|
+ MUL x1, a0, t1
|
|
+ .align 4
|
|
+
|
|
+$L28:
|
|
+ LD a0, 0 * SIZE(Y)
|
|
+ LD a1, 1 * SIZE(Y)
|
|
+
|
|
+ ADD3 s0, t0, a8
|
|
+ ADD4 s1, t1, a9
|
|
+ ADD3 s2, t2, a10
|
|
+ ADD4 s3, t3, a11
|
|
+
|
|
+ ADD a8, a10, s0
|
|
+ ADD a9, a11, s1
|
|
+
|
|
+ MUL alpha_r, s0, t0
|
|
+ MUL alpha_r, s1, t1
|
|
+
|
|
+ ADD a0, t0, a8
|
|
+ MUL alpha_i, s1, t0
|
|
+ ADD a1, t1, a9
|
|
+ MUL alpha_i, s0, t1
|
|
+
|
|
+ SUB a8, t0, a0
|
|
+ ADD a9, t1, a1
|
|
+
|
|
+ ST a0, 0 * SIZE(Y1)
|
|
+ ST a1, 1 * SIZE(Y1)
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 8($sp)
|
|
+ fldd $f4, 16($sp)
|
|
+ fldd $f5, 24($sp)
|
|
+ fldd $f6, 32($sp)
|
|
+ fldd $f7, 40($sp)
|
|
+ fldd $f8, 48($sp)
|
|
+ fldd $f9, 56($sp)
|
|
+
|
|
+ ldi $sp, STACKSIZE($sp)
|
|
+ ret
|
|
+ EPILOGUE
|
|
diff --git a/kernel/sw_64/zgemv_t.S.bak b/kernel/sw_64/zgemv_t.S.bak
new file mode 100644
index 0000000..f857fb7
--- /dev/null
+++ b/kernel/sw_64/zgemv_t.S.bak
@@ -0,0 +1,922 @@
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define STACKSIZE 64
|
|
+#define PREFETCHSIZE 32
|
|
+
|
|
+#define M $16
|
|
+#define N $17
|
|
+#define A $21
|
|
+#define LDA $18
|
|
+
|
|
+#define X $19
|
|
+#define INCX $20
|
|
+#define Y $22
|
|
+#define INCY $23
|
|
+
|
|
+#define BUFFER $24
|
|
+
|
|
+#define I $25
|
|
+#define J $27
|
|
+
|
|
+#define X1 $3
|
|
+#define Y1 $4
|
|
+#define A1 $5
|
|
+#define A2 $6
|
|
+
|
|
+#define alpha_r $f19
|
|
+#define alpha_i $f20
|
|
+
|
|
+#define s0 $f0
|
|
+#define s1 $f1
|
|
+#define s2 $f10
|
|
+#define s3 $f11
|
|
+
|
|
+#define t0 $f12
|
|
+#define t1 $f13
|
|
+#define t2 $f14
|
|
+#define t3 $f15
|
|
+
|
|
+#define x0 $f16
|
|
+#define x1 $f17
|
|
+#define x2 $f18
|
|
+#define x3 $f21
|
|
+
|
|
+#define a0 $f22
|
|
+#define a1 $f23
|
|
+#define a2 $f24
|
|
+#define a3 $f25
|
|
+#define a4 $f26
|
|
+#define a5 $f27
|
|
+#define a6 $f28
|
|
+#define a7 $f29
|
|
+
|
|
+#define a8 $f2
|
|
+#define a9 $f3
|
|
+#define a10 $f4
|
|
+#define a11 $f5
|
|
+#define a12 $f6
|
|
+#define a13 $f7
|
|
+#define a14 $f8
|
|
+#define a15 $f9
|
|
+
|
|
+#if !defined(CONJ) && !defined(XCONJ)
|
|
+#define ADD1 ADD
|
|
+#define ADD2 ADD
|
|
+#define ADD3 SUB
|
|
+#define ADD4 ADD
|
|
+#elif !defined(CONJ) && defined(XCONJ)
|
|
+#define ADD1 ADD
|
|
+#define ADD2 ADD
|
|
+#define ADD3 ADD
|
|
+#define ADD4 SUB
|
|
+#elif defined(CONJ) && !defined(XCONJ)
|
|
+#define ADD1 ADD
|
|
+#define ADD2 SUB
|
|
+#define ADD3 ADD
|
|
+#define ADD4 ADD
|
|
+#else
|
|
+#define ADD1 ADD
|
|
+#define ADD2 SUB
|
|
+#define ADD3 SUB
|
|
+#define ADD4 SUB
|
|
+#endif
|
|
+
|
|
+ PROLOGUE
|
|
+
|
|
+ ldi $sp, -STACKSIZE($sp)
|
|
+ ldl LDA, 0 + STACKSIZE($sp)
|
|
+ ldl X, 8 + STACKSIZE($sp)
|
|
+ ldl INCX, 16 + STACKSIZE($sp)
|
|
+ ldl Y, 24 + STACKSIZE($sp)
|
|
+ ldl INCY, 32 + STACKSIZE($sp)
|
|
+ ldl BUFFER, 40 + STACKSIZE($sp)
|
|
+
|
|
+ fstd $f2, 0($sp)
|
|
+ fstd $f3, 8($sp)
|
|
+ fstd $f4, 16($sp)
|
|
+ fstd $f5, 24($sp)
|
|
+ fstd $f6, 32($sp)
|
|
+ fstd $f7, 40($sp)
|
|
+ fstd $f8, 48($sp)
|
|
+ fstd $f9, 56($sp)
|
|
+
|
|
+ PROFCODE
|
|
+
|
|
+ cmple M, 0, $0
|
|
+ sll INCX, ZBASE_SHIFT, INCX
|
|
+ cmple N, 0, $1
|
|
+ sll INCY, ZBASE_SHIFT, INCY
|
|
+
|
|
+ or $0, $1, $0
|
|
+ bne $0, $L999
|
|
+
|
|
+ cmpeq INCX, 2 * SIZE, $0
|
|
+ mov X, X1
|
|
+ sll LDA, ZBASE_SHIFT,LDA
|
|
+ bne $0, $L10
|
|
+
|
|
+ sra M, 2, I
|
|
+ mov BUFFER, Y1
|
|
+ mov BUFFER, X
|
|
+ ble I, $L05
|
|
+ .align 4
|
|
+
|
|
+$L02:
|
|
+ fillcs (PREFETCHSIZE + 0) * SIZE(X1)
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ LD a0, 0 * SIZE(X1)
|
|
+ LD a1, 1 * SIZE(X1)
|
|
+ addl X1, INCX, X1
|
|
+ LD a2, 0 * SIZE(X1)
|
|
+ LD a3, 1 * SIZE(X1)
|
|
+ addl X1, INCX, X1
|
|
+
|
|
+ ST a0, 0 * SIZE(Y1)
|
|
+ ST a1, 1 * SIZE(Y1)
|
|
+ ST a2, 2 * SIZE(Y1)
|
|
+ ST a3, 3 * SIZE(Y1)
|
|
+
|
|
+ LD a4, 0 * SIZE(X1)
|
|
+ LD a5, 1 * SIZE(X1)
|
|
+ addl X1, INCX, X1
|
|
+ LD a6, 0 * SIZE(X1)
|
|
+ LD a7, 1 * SIZE(X1)
|
|
+ addl X1, INCX, X1
|
|
+
|
|
+ ST a4, 4 * SIZE(Y1)
|
|
+ ST a5, 5 * SIZE(Y1)
|
|
+ ST a6, 6 * SIZE(Y1)
|
|
+ ST a7, 7 * SIZE(Y1)
|
|
+
|
|
+ ldi Y1, 8 * SIZE(Y1)
|
|
+ bgt I, $L02
|
|
+ .align 4
|
|
+
|
|
+$L05:
|
|
+ and M, 3, I
|
|
+ ble I, $L10
|
|
+ .align 4
|
|
+
|
|
+$L06:
|
|
+ LD a0, 0 * SIZE(X1)
|
|
+ LD a1, 1 * SIZE(X1)
|
|
+ addl X1, INCX, X1
|
|
+
|
|
+ ST a0, 0 * SIZE(Y1)
|
|
+ ST a1, 1 * SIZE(Y1)
|
|
+ ldi Y1, 2 * SIZE(Y1)
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L06
|
|
+ .align 4
|
|
+
|
|
+$L10:
|
|
+ mov Y, Y1
|
|
+ fclr t0
|
|
+ unop
|
|
+ fclr t1
|
|
+
|
|
+ sra N, 1, J
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ ble J, $L20
|
|
+ .align 4
|
|
+
|
|
+$L11:
|
|
+ mov A, A1
|
|
+ fclr s0
|
|
+ addl A, LDA, A2
|
|
+ fclr s1
|
|
+
|
|
+ addl A2, LDA, A
|
|
+ unop
|
|
+ mov X, X1
|
|
+ fillcs 3 * SIZE(Y)
|
|
+
|
|
+ sra M, 2, I
|
|
+ fclr s2
|
|
+ fclr s3
|
|
+ ble I, $L15
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+ LD a2, 0 * SIZE(A2)
|
|
+ LD a3, 1 * SIZE(A2)
|
|
+ LD a4, 2 * SIZE(A1)
|
|
+ LD a5, 3 * SIZE(A1)
|
|
+ LD a6, 2 * SIZE(A2)
|
|
+ LD a7, 3 * SIZE(A2)
|
|
+
|
|
+ LD a8, 4 * SIZE(A1)
|
|
+ LD a9, 5 * SIZE(A1)
|
|
+ LD a10, 4 * SIZE(A2)
|
|
+ LD a11, 5 * SIZE(A2)
|
|
+ LD a12, 6 * SIZE(A1)
|
|
+ LD a13, 7 * SIZE(A1)
|
|
+ LD a14, 6 * SIZE(A2)
|
|
+ LD a15, 7 * SIZE(A2)
|
|
+
|
|
+ LD x0, 0 * SIZE(X1)
|
|
+ LD x1, 1 * SIZE(X1)
|
|
+ LD x2, 2 * SIZE(X1)
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ ADD3 s0, t0, s0
|
|
+ unop
|
|
+ MUL x0, a0, t0
|
|
+ LD x3, 3 * SIZE(X1)
|
|
+
|
|
+ ADD4 s1, t1, s1
|
|
+ fillcs (PREFETCHSIZE + 0) * SIZE(A1)
|
|
+ MUL x0, a1, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 s2, t2, s2
|
|
+ unop
|
|
+ MUL x0, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD4 s3, t3, s3
|
|
+ unop
|
|
+ MUL x0, a3, t3
|
|
+ LD x0, 4 * SIZE(X1)
|
|
+
|
|
+ ADD1 s0, t0, s0
|
|
+ unop
|
|
+ MUL x1, a1, t0
|
|
+ LD a1, 9 * SIZE(A1)
|
|
+
|
|
+ ADD2 s1, t1, s1
|
|
+ unop
|
|
+ MUL x1, a0, t1
|
|
+ LD a0, 8 * SIZE(A1)
|
|
+
|
|
+ ADD1 s2, t2, s2
|
|
+ unop
|
|
+ MUL x1, a3, t2
|
|
+ LD a3, 9 * SIZE(A2)
|
|
+
|
|
+ ADD2 s3, t3, s3
|
|
+ unop
|
|
+ MUL x1, a2, t3
|
|
+ LD a2, 8 * SIZE(A2)
|
|
+
|
|
+ ADD3 s0, t0, s0
|
|
+ unop
|
|
+ MUL x2, a4, t0
|
|
+ LD x1, 5 * SIZE(X1)
|
|
+
|
|
+ ADD4 s1, t1, s1
|
|
+ MUL x2, a5, t1
|
|
+ ADD3 s2, t2, s2
|
|
+ MUL x2, a6, t2
|
|
+
|
|
+ ADD4 s3, t3, s3
|
|
+ unop
|
|
+ MUL x2, a7, t3
|
|
+ LD x2, 6 * SIZE(X1)
|
|
+
|
|
+ ADD1 s0, t0, s0
|
|
+ unop
|
|
+ MUL x3, a5, t0
|
|
+ LD a5, 11 * SIZE(A1)
|
|
+
|
|
+ ADD2 s1, t1, s1
|
|
+ unop
|
|
+ MUL x3, a4, t1
|
|
+ LD a4, 10 * SIZE(A1)
|
|
+
|
|
+ ADD1 s2, t2, s2
|
|
+ unop
|
|
+ MUL x3, a7, t2
|
|
+ LD a7, 11 * SIZE(A2)
|
|
+
|
|
+ ADD2 s3, t3, s3
|
|
+ unop
|
|
+ MUL x3, a6, t3
|
|
+ LD a6, 10 * SIZE(A2)
|
|
+
|
|
+ ADD3 s0, t0, s0
|
|
+ unop
|
|
+ MUL x0, a8, t0
|
|
+ LD x3, 7 * SIZE(X1)
|
|
+
|
|
+ ADD4 s1, t1, s1
|
|
+ fillcs (PREFETCHSIZE + 0) * SIZE(A2)
|
|
+ MUL x0, a9, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 s2, t2, s2
|
|
+ ldi I, -1(I)
|
|
+ MUL x0, a10, t2
|
|
+ unop
|
|
+
|
|
+ ADD4 s3, t3, s3
|
|
+ unop
|
|
+ MUL x0, a11, t3
|
|
+ LD x0, 8 * SIZE(X1)
|
|
+
|
|
+ ADD1 s0, t0, s0
|
|
+ unop
|
|
+ MUL x1, a9, t0
|
|
+ LD a9, 13 * SIZE(A1)
|
|
+
|
|
+ ADD2 s1, t1, s1
|
|
+ unop
|
|
+ MUL x1, a8, t1
|
|
+ LD a8, 12 * SIZE(A1)
|
|
+
|
|
+ ADD1 s2, t2, s2
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+ MUL x1, a11, t2
|
|
+ LD a11, 13 * SIZE(A2)
|
|
+
|
|
+ ADD2 s3, t3, s3
|
|
+ unop
|
|
+ MUL x1, a10, t3
|
|
+ LD a10, 12 * SIZE(A2)
|
|
+
|
|
+ ADD3 s0, t0, s0
|
|
+ unop
|
|
+ MUL x2, a12, t0
|
|
+ LD x1, 9 * SIZE(X1)
|
|
+
|
|
+ ADD4 s1, t1, s1
|
|
+ fillcs (PREFETCHSIZE + 0) * SIZE(X1)
|
|
+ MUL x2, a13, t1
|
|
+ ldi A2, 8 * SIZE(A2)
|
|
+
|
|
+ ADD3 s2, t2, s2
|
|
+ unop
|
|
+ MUL x2, a14, t2
|
|
+ unop
|
|
+
|
|
+ ADD4 s3, t3, s3
|
|
+ unop
|
|
+ MUL x2, a15, t3
|
|
+ LD x2, 10 * SIZE(X1)
|
|
+
|
|
+ ADD1 s0, t0, s0
|
|
+ unop
|
|
+ MUL x3, a13, t0
|
|
+ LD a13, 7 * SIZE(A1)
|
|
+
|
|
+ ADD2 s1, t1, s1
|
|
+ ldi X1, 8 * SIZE(X1)
|
|
+ MUL x3, a12, t1
|
|
+ LD a12, 6 * SIZE(A1)
|
|
+
|
|
+ ADD1 s2, t2, s2
|
|
+ unop
|
|
+ MUL x3, a15, t2
|
|
+ LD a15, 7 * SIZE(A2)
|
|
+
|
|
+ ADD2 s3, t3, s3
|
|
+ MUL x3, a14, t3
|
|
+ LD a14, 6 * SIZE(A2)
|
|
+ bgt I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ ADD3 s0, t0, s0
|
|
+ unop
|
|
+ MUL x0, a0, t0
|
|
+ LD x3, 3 * SIZE(X1)
|
|
+
|
|
+ ADD4 s1, t1, s1
|
|
+ MUL x0, a1, t1
|
|
+ ADD3 s2, t2, s2
|
|
+ MUL x0, a2, t2
|
|
+
|
|
+ ADD4 s3, t3, s3
|
|
+ unop
|
|
+ MUL x0, a3, t3
|
|
+ LD x0, 4 * SIZE(X1)
|
|
+
|
|
+ ADD1 s0, t0, s0
|
|
+ MUL x1, a1, t0
|
|
+ ADD2 s1, t1, s1
|
|
+ MUL x1, a0, t1
|
|
+
|
|
+ ADD1 s2, t2, s2
|
|
+ unop
|
|
+ MUL x1, a3, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 s3, t3, s3
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+ MUL x1, a2, t3
|
|
+ LD x1, 5 * SIZE(X1)
|
|
+
|
|
+ ADD3 s0, t0, s0
|
|
+ MUL x2, a4, t0
|
|
+ ADD4 s1, t1, s1
|
|
+ MUL x2, a5, t1
|
|
+
|
|
+ ADD3 s2, t2, s2
|
|
+ unop
|
|
+ MUL x2, a6, t2
|
|
+ unop
|
|
+
|
|
+ ADD4 s3, t3, s3
|
|
+ ldi A2, 8 * SIZE(A2)
|
|
+ MUL x2, a7, t3
|
|
+ LD x2, 6 * SIZE(X1)
|
|
+
|
|
+ ADD1 s0, t0, s0
|
|
+ MUL x3, a5, t0
|
|
+ ADD2 s1, t1, s1
|
|
+ MUL x3, a4, t1
|
|
+
|
|
+ ADD1 s2, t2, s2
|
|
+ unop
|
|
+ MUL x3, a7, t2
|
|
+ ldi X1, 8 * SIZE(X1)
|
|
+
|
|
+ ADD2 s3, t3, s3
|
|
+ unop
|
|
+ MUL x3, a6, t3
|
|
+ LD x3, -1 * SIZE(X1)
|
|
+
|
|
+ ADD3 s0, t0, s0
|
|
+ MUL x0, a8, t0
|
|
+ ADD4 s1, t1, s1
|
|
+ MUL x0, a9, t1
|
|
+
|
|
+ ADD3 s2, t2, s2
|
|
+ MUL x0, a10, t2
|
|
+ ADD4 s3, t3, s3
|
|
+ MUL x0, a11, t3
|
|
+
|
|
+ ADD1 s0, t0, s0
|
|
+ MUL x1, a9, t0
|
|
+ ADD2 s1, t1, s1
|
|
+ MUL x1, a8, t1
|
|
+
|
|
+ ADD1 s2, t2, s2
|
|
+ MUL x1, a11, t2
|
|
+ ADD2 s3, t3, s3
|
|
+ MUL x1, a10, t3
|
|
+
|
|
+ ADD3 s0, t0, s0
|
|
+ MUL x2, a12, t0
|
|
+ ADD4 s1, t1, s1
|
|
+ MUL x2, a13, t1
|
|
+
|
|
+ ADD3 s2, t2, s2
|
|
+ MUL x2, a14, t2
|
|
+ ADD4 s3, t3, s3
|
|
+ MUL x2, a15, t3
|
|
+
|
|
+ ADD1 s0, t0, s0
|
|
+ MUL x3, a13, t0
|
|
+ ADD2 s1, t1, s1
|
|
+ MUL x3, a12, t1
|
|
+
|
|
+ ADD1 s2, t2, s2
|
|
+ MUL x3, a15, t2
|
|
+ ADD2 s3, t3, s3
|
|
+ MUL x3, a14, t3
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and M, 3, I
|
|
+ ble I, $L18
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+ LD a2, 0 * SIZE(A2)
|
|
+ LD a3, 1 * SIZE(A2)
|
|
+
|
|
+ LD x0, 0 * SIZE(X1)
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L17
|
|
+ .align 4
|
|
+
|
|
+$L16:
|
|
+ ADD3 s0, t0, s0
|
|
+ ldi I, -1(I)
|
|
+ MUL x0, a0, t0
|
|
+ LD x1, 1 * SIZE(X1)
|
|
+
|
|
+ ADD4 s1, t1, s1
|
|
+ MUL x0, a1, t1
|
|
+ ADD3 s2, t2, s2
|
|
+ MUL x0, a2, t2
|
|
+
|
|
+ ADD4 s3, t3, s3
|
|
+ unop
|
|
+ MUL x0, a3, t3
|
|
+ LD x0, 2 * SIZE(X1)
|
|
+
|
|
+ ADD1 s0, t0, s0
|
|
+ ldi A2, 2 * SIZE(A2)
|
|
+ MUL x1, a1, t0
|
|
+ LD a1, 3 * SIZE(A1)
|
|
+
|
|
+ ADD2 s1, t1, s1
|
|
+ ldi X1, 2 * SIZE(X1)
|
|
+ MUL x1, a0, t1
|
|
+ LD a0, 2 * SIZE(A1)
|
|
+
|
|
+ ADD1 s2, t2, s2
|
|
+ ldi A1, 2 * SIZE(A1)
|
|
+ MUL x1, a3, t2
|
|
+ LD a3, 1 * SIZE(A2)
|
|
+
|
|
+ ADD2 s3, t3, s3
|
|
+ MUL x1, a2, t3
|
|
+ LD a2, 0 * SIZE(A2)
|
|
+ bgt I, $L16
|
|
+ .align 4
|
|
+
|
|
+$L17:
|
|
+ ADD3 s0, t0, s0
|
|
+ unop
|
|
+ MUL x0, a0, t0
|
|
+ LD x1, 1 * SIZE(X1)
|
|
+
|
|
+ ADD4 s1, t1, s1
|
|
+ unop
|
|
+ MUL x0, a1, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 s2, t2, s2
|
|
+ MUL x0, a2, t2
|
|
+ ADD4 s3, t3, s3
|
|
+ MUL x0, a3, t3
|
|
+
|
|
+ ADD1 s0, t0, s0
|
|
+ MUL x1, a1, t0
|
|
+ ADD2 s1, t1, s1
|
|
+ MUL x1, a0, t1
|
|
+
|
|
+ ADD1 s2, t2, s2
|
|
+ MUL x1, a3, t2
|
|
+ ADD2 s3, t3, s3
|
|
+ MUL x1, a2, t3
|
|
+ .align 4
|
|
+
|
|
+$L18:
|
|
+ LD a0, 0 * SIZE(Y)
|
|
+ unop
|
|
+ LD a1, 1 * SIZE(Y)
|
|
+ addl Y, INCY, Y
|
|
+
|
|
+ LD a2, 0 * SIZE(Y)
|
|
+ unop
|
|
+ LD a3, 1 * SIZE(Y)
|
|
+ addl Y, INCY, Y
|
|
+
|
|
+ ADD3 s0, t0, s0
|
|
+ ADD4 s1, t1, s1
|
|
+ ADD3 s2, t2, s2
|
|
+ ADD4 s3, t3, s3
|
|
+
|
|
+ MUL alpha_r, s0, t0
|
|
+ MUL alpha_r, s1, t1
|
|
+ MUL alpha_r, s2, t2
|
|
+ MUL alpha_r, s3, t3
|
|
+
|
|
+ ADD a0, t0, a0
|
|
+ MUL alpha_i, s1, t0
|
|
+ ADD a1, t1, a1
|
|
+ MUL alpha_i, s0, t1
|
|
+ ADD a2, t2, a2
|
|
+ MUL alpha_i, s3, t2
|
|
+ ADD a3, t3, a3
|
|
+ MUL alpha_i, s2, t3
|
|
+
|
|
+ SUB a0, t0, a0
|
|
+ ADD a1, t1, a1
|
|
+ SUB a2, t2, a2
|
|
+ ADD a3, t3, a3
|
|
+
|
|
+ ST a0, 0 * SIZE(Y1)
|
|
+ fclr t0
|
|
+ ST a1, 1 * SIZE(Y1)
|
|
+ addl Y1, INCY, Y1
|
|
+
|
|
+ ST a2, 0 * SIZE(Y1)
|
|
+ fclr t1
|
|
+ ST a3, 1 * SIZE(Y1)
|
|
+ addl Y1, INCY, Y1
|
|
+
|
|
+ fclr t2
|
|
+ ldi J, -1(J)
|
|
+ fclr t3
|
|
+ bgt J, $L11
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ blbc N, $L999
|
|
+
|
|
+ mov A, A1
|
|
+ fclr s0
|
|
+ fclr s1
|
|
+ mov X, X1
|
|
+
|
|
+ sra M, 2, I
|
|
+ fclr s2
|
|
+ fclr s3
|
|
+ ble I, $L25
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+ LD a4, 2 * SIZE(A1)
|
|
+ LD a5, 3 * SIZE(A1)
|
|
+ LD a8, 4 * SIZE(A1)
|
|
+ LD a9, 5 * SIZE(A1)
|
|
+ LD a12, 6 * SIZE(A1)
|
|
+ LD a13, 7 * SIZE(A1)
|
|
+
|
|
+ LD x0, 0 * SIZE(X1)
|
|
+ LD x1, 1 * SIZE(X1)
|
|
+ LD x2, 2 * SIZE(X1)
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L23
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ ADD3 s0, t0, s0
|
|
+ fillcs (PREFETCHSIZE + 0) * SIZE(A1)
|
|
+ MUL x0, a0, t0
|
|
+ LD x3, 3 * SIZE(X1)
|
|
+
|
|
+ ADD4 s1, t1, s1
|
|
+ unop
|
|
+ MUL x0, a1, t1
|
|
+ LD x0, 4 * SIZE(X1)
|
|
+
|
|
+ ADD1 s2, t0, s2
|
|
+ ldi I, -1(I)
|
|
+ MUL x1, a1, t0
|
|
+ LD a1, 9 * SIZE(A1)
|
|
+
|
|
+ ADD2 s3, t1, s3
|
|
+ unop
|
|
+ MUL x1, a0, t1
|
|
+ LD a0, 8 * SIZE(A1)
|
|
+
|
|
+ ADD3 s0, t0, s0
|
|
+ unop
|
|
+ MUL x2, a4, t0
|
|
+ LD x1, 5 * SIZE(X1)
|
|
+
|
|
+ ADD4 s1, t1, s1
|
|
+ unop
|
|
+ MUL x2, a5, t1
|
|
+ LD x2, 6 * SIZE(X1)
|
|
+
|
|
+ ADD1 s2, t0, s2
|
|
+ unop
|
|
+ MUL x3, a5, t0
|
|
+ LD a5, 11 * SIZE(A1)
|
|
+
|
|
+ ADD2 s3, t1, s3
|
|
+ unop
|
|
+ MUL x3, a4, t1
|
|
+ LD a4, 10 * SIZE(A1)
|
|
+
|
|
+ ADD3 s0, t0, s0
|
|
+ unop
|
|
+ MUL x0, a8, t0
|
|
+ LD x3, 7 * SIZE(X1)
|
|
+
|
|
+ ADD4 s1, t1, s1
|
|
+ unop
|
|
+ MUL x0, a9, t1
|
|
+ LD x0, 8 * SIZE(X1)
|
|
+
|
|
+ ADD1 s2, t0, s2
|
|
+ unop
|
|
+ MUL x1, a9, t0
|
|
+ LD a9, 13 * SIZE(A1)
|
|
+
|
|
+ ADD2 s3, t1, s3
|
|
+ unop
|
|
+ MUL x1, a8, t1
|
|
+ LD a8, 12 * SIZE(A1)
|
|
+
|
|
+ ADD3 s0, t0, s0
|
|
+ unop
|
|
+ MUL x2, a12, t0
|
|
+ LD x1, 9 * SIZE(X1)
|
|
+
|
|
+ ADD4 s1, t1, s1
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+ MUL x2, a13, t1
|
|
+ LD x2, 10 * SIZE(X1)
|
|
+
|
|
+ ADD1 s2, t0, s2
|
|
+ ldi X1, 8 * SIZE(X1)
|
|
+ MUL x3, a13, t0
|
|
+ LD a13, 7 * SIZE(A1)
|
|
+
|
|
+ ADD2 s3, t1, s3
|
|
+ MUL x3, a12, t1
|
|
+ LD a12, 6 * SIZE(A1)
|
|
+ bgt I, $L22
|
|
+ .align 4
|
|
+
|
|
+$L23:
|
|
+ ADD3 s0, t0, s0
|
|
+ unop
|
|
+ MUL x0, a0, t0
|
|
+ LD x3, 3 * SIZE(X1)
|
|
+
|
|
+ ADD4 s1, t1, s1
|
|
+ unop
|
|
+ MUL x0, a1, t1
|
|
+ LD x0, 4 * SIZE(X1)
|
|
+
|
|
+ ADD1 s2, t0, s2
|
|
+ unop
|
|
+ MUL x1, a1, t0
|
|
+ ldi A1, 8 * SIZE(A1)
|
|
+
|
|
+ ADD2 s3, t1, s3
|
|
+ unop
|
|
+ MUL x1, a0, t1
|
|
+ LD x1, 5 * SIZE(X1)
|
|
+
|
|
+ ADD3 s0, t0, s0
|
|
+ unop
|
|
+ MUL x2, a4, t0
|
|
+ unop
|
|
+
|
|
+ ADD4 s1, t1, s1
|
|
+ unop
|
|
+ MUL x2, a5, t1
|
|
+ LD x2, 6 * SIZE(X1)
|
|
+
|
|
+ ADD1 s2, t0, s2
|
|
+ unop
|
|
+ MUL x3, a5, t0
|
|
+ ldi X1, 8 * SIZE(X1)
|
|
+
|
|
+ ADD2 s3, t1, s3
|
|
+ unop
|
|
+ MUL x3, a4, t1
|
|
+ LD x3, -1 * SIZE(X1)
|
|
+
|
|
+ ADD3 s0, t0, s0
|
|
+ MUL x0, a8, t0
|
|
+ ADD4 s1, t1, s1
|
|
+ MUL x0, a9, t1
|
|
+
|
|
+ ADD1 s2, t0, s2
|
|
+ MUL x1, a9, t0
|
|
+ ADD2 s3, t1, s3
|
|
+ MUL x1, a8, t1
|
|
+
|
|
+ ADD3 s0, t0, s0
|
|
+ MUL x2, a12, t0
|
|
+ ADD4 s1, t1, s1
|
|
+ MUL x2, a13, t1
|
|
+
|
|
+ ADD1 s2, t0, s2
|
|
+ MUL x3, a13, t0
|
|
+ ADD2 s3, t1, s3
|
|
+ MUL x3, a12, t1
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ and M, 3, I
|
|
+ ble I, $L28
|
|
+
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+
|
|
+ LD x0, 0 * SIZE(X1)
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L27
|
|
+ .align 4
|
|
+
|
|
+$L26:
|
|
+ ADD3 s0, t0, s0
|
|
+ ldi A1, 2 * SIZE(A1)
|
|
+ MUL x0, a0, t0
|
|
+ LD x1, 1 * SIZE(X1)
|
|
+
|
|
+ ADD4 s1, t1, s1
|
|
+ ldi I, -1(I)
|
|
+ MUL x0, a1, t1
|
|
+ LD x0, 2 * SIZE(X1)
|
|
+
|
|
+ ADD1 s0, t0, s0
|
|
+ ldi X1, 2 * SIZE(X1)
|
|
+ MUL x1, a1, t0
|
|
+ LD a1, 1 * SIZE(A1)
|
|
+
|
|
+ ADD2 s1, t1, s1
|
|
+ MUL x1, a0, t1
|
|
+ LD a0, 0 * SIZE(A1)
|
|
+ bgt I, $L26
|
|
+ .align 4
|
|
+
|
|
+$L27:
|
|
+ ADD3 s0, t0, s0
|
|
+ unop
|
|
+ MUL x0, a0, t0
|
|
+ LD x1, 1 * SIZE(X1)
|
|
+
|
|
+ ADD4 s1, t1, s1
|
|
+ unop
|
|
+ MUL x0, a1, t1
|
|
+ unop
|
|
+
|
|
+ ADD1 s0, t0, s0
|
|
+ MUL x1, a1, t0
|
|
+ ADD2 s1, t1, s1
|
|
+ MUL x1, a0, t1
|
|
+ .align 4
|
|
+
|
|
+$L28:
|
|
+ LD a0, 0 * SIZE(Y)
|
|
+ LD a1, 1 * SIZE(Y)
|
|
+
|
|
+ ADD3 s0, t0, s0
|
|
+ ADD4 s1, t1, s1
|
|
+ ADD3 s2, t2, s2
|
|
+ ADD4 s3, t3, s3
|
|
+
|
|
+ ADD s0, s2, s0
|
|
+ ADD s1, s3, s1
|
|
+
|
|
+ MUL alpha_r, s0, t0
|
|
+ MUL alpha_r, s1, t1
|
|
+
|
|
+ ADD a0, t0, a0
|
|
+ MUL alpha_i, s1, t0
|
|
+ ADD a1, t1, a1
|
|
+ MUL alpha_i, s0, t1
|
|
+
|
|
+ SUB a0, t0, a0
|
|
+ ADD a1, t1, a1
|
|
+
|
|
+ ST a0, 0 * SIZE(Y1)
|
|
+ ST a1, 1 * SIZE(Y1)
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 8($sp)
|
|
+ fldd $f4, 16($sp)
|
|
+ fldd $f5, 24($sp)
|
|
+ fldd $f6, 32($sp)
|
|
+ fldd $f7, 40($sp)
|
|
+ fldd $f8, 48($sp)
|
|
+ fldd $f9, 56($sp)
|
|
+
|
|
+ ldi $sp, STACKSIZE($sp)
|
|
+ ret
|
|
+ EPILOGUE
|
|
diff --git a/kernel/sw_64/znrm2.S b/kernel/sw_64/znrm2.S
new file mode 100644
index 0000000..c1b7375
--- /dev/null
+++ b/kernel/sw_64/znrm2.S
@@ -0,0 +1,441 @@
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCH_SIZE 80
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+#define XX $19
|
|
+
|
|
+#define I $0
|
|
+
|
|
+#define a0 $f0
|
|
+#define a1 $f1
|
|
+#define a2 $f10
|
|
+#define a3 $f11
|
|
+#define t0 $f12
|
|
+#define t1 $f13
|
|
+#define t2 $f14
|
|
+#define t3 $f15
|
|
+
|
|
+#define x0 $f16
|
|
+#define x1 $f17
|
|
+#define x2 $f18
|
|
+#define x3 $f19
|
|
+#define x4 $f20
|
|
+#define x5 $f21
|
|
+#define x6 $f22
|
|
+#define x7 $f23
|
|
+
|
|
+ PROLOGUE
|
|
+
|
|
+#if defined(EV4) || defined(EV5)
|
|
+ .frame $30,16,$26,0
|
|
+ .mask 0x4000000,-16
|
|
+ ldih $29, 0($27) !gpdisp!1
|
|
+ ldi $29, 0($29) !gpdisp!1
|
|
+
|
|
+ ldi $sp, -16($sp)
|
|
+ ldl $27, sqrt($29) !literal!2
|
|
+ stl $26, 0($sp)
|
|
+
|
|
+ PROFCODE
|
|
+ .prologue 1
|
|
+#else
|
|
+ PROFCODE
|
|
+#endif
|
|
+
|
|
+ fclr a0
|
|
+ sll INCX, ZBASE_SHIFT, INCX
|
|
+ fclr a1
|
|
+ ble N, $L999
|
|
+
|
|
+ fclr a2
|
|
+ cmpeq INCX, 2 * SIZE, $0
|
|
+ fclr a3
|
|
+ beq $0, $L20
|
|
+
|
|
+ fclr t0
|
|
+ sra N, 3, I
|
|
+ fclr t1
|
|
+ ble I, $L15
|
|
+
|
|
+ fclr t2
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ fclr t3
|
|
+ LD x1, 1 * SIZE(X)
|
|
+
|
|
+ LD x2, 2 * SIZE(X)
|
|
+ LD x3, 3 * SIZE(X)
|
|
+ LD x4, 4 * SIZE(X)
|
|
+ LD x5, 5 * SIZE(X)
|
|
+ LD x6, 6 * SIZE(X)
|
|
+ LD x7, 7 * SIZE(X)
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L11:
|
|
+ faddd a0, t0, $f25
|
|
+ fillcs (PREFETCH_SIZE) * SIZE(X)
|
|
+ fmuld x0, x0, t0
|
|
+ LD x0, 8 * SIZE(X)
|
|
+
|
|
+ faddd a1, t1, $f26
|
|
+ mov X, XX
|
|
+ fmuld x1, x1, t1
|
|
+ LD x1, 9 * SIZE(X)
|
|
+
|
|
+ faddd a2, t2, $f27
|
|
+ unop
|
|
+ fmuld x2, x2, t2
|
|
+ LD x2, 10 * SIZE(X)
|
|
+
|
|
+ faddd a3, t3, $f28
|
|
+ unop
|
|
+ fmuld x3, x3, t3
|
|
+ LD x3, 11 * SIZE(X)
|
|
+
|
|
+ faddd $f25, t0, a0
|
|
+ unop
|
|
+ fmuld x4, x4, t0
|
|
+ LD x4, 12 * SIZE(X)
|
|
+
|
|
+ faddd $f26, t1, a1
|
|
+ unop
|
|
+ fmuld x5, x5, t1
|
|
+ LD x5, 13 * SIZE(X)
|
|
+
|
|
+ faddd $f27, t2, a2
|
|
+ unop
|
|
+ fmuld x6, x6, t2
|
|
+ LD x6, 14 * SIZE(X)
|
|
+
|
|
+ faddd $f28, t3, a3
|
|
+ unop
|
|
+ fmuld x7, x7, t3
|
|
+ LD x7, 15 * SIZE(X)
|
|
+
|
|
+ faddd a0, t0, $f25
|
|
+ unop
|
|
+ fmuld x0, x0, t0
|
|
+ LD x0, 16 * SIZE(X)
|
|
+
|
|
+ faddd a1, t1, $f26
|
|
+ ldi X, 16 * SIZE(X)
|
|
+ fmuld x1, x1, t1
|
|
+ LD x1, 17 * SIZE(XX)
|
|
+
|
|
+ faddd a2, t2, $f27
|
|
+ unop
|
|
+ fmuld x2, x2, t2
|
|
+ LD x2, 18 * SIZE(XX)
|
|
+
|
|
+ faddd a3, t3, $f28
|
|
+ unop
|
|
+ fmuld x3, x3, t3
|
|
+ LD x3, 19 * SIZE(XX)
|
|
+
|
|
+ faddd $f25, t0, a0
|
|
+ unop
|
|
+ fmuld x4, x4, t0
|
|
+ LD x4, 20 * SIZE(XX)
|
|
+
|
|
+ faddd $f26, t1, a1
|
|
+ ldi I, -1(I)
|
|
+ fmuld x5, x5, t1
|
|
+ LD x5, 21 * SIZE(XX)
|
|
+
|
|
+ faddd $f27, t2, a2
|
|
+ unop
|
|
+ fmuld x6, x6, t2
|
|
+ LD x6, 22 * SIZE(XX)
|
|
+
|
|
+ faddd $f28, t3, a3
|
|
+ fmuld x7, x7, t3
|
|
+ LD x7, 23 * SIZE(XX)
|
|
+ bgt I, $L11
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ faddd a0, t0, $f25
|
|
+ mov X, XX
|
|
+ fmuld x0, x0, t0
|
|
+ LD x0, 8 * SIZE(X)
|
|
+
|
|
+ faddd a1, t1, $f26
|
|
+ unop
|
|
+ fmuld x1, x1, t1
|
|
+ LD x1, 9 * SIZE(X)
|
|
+
|
|
+ faddd a2, t2, $f27
|
|
+ unop
|
|
+ fmuld x2, x2, t2
|
|
+ LD x2, 10 * SIZE(X)
|
|
+
|
|
+ faddd a3, t3, $f28
|
|
+ unop
|
|
+ fmuld x3, x3, t3
|
|
+ LD x3, 11 * SIZE(X)
|
|
+
|
|
+ faddd $f25, t0, a0
|
|
+ unop
|
|
+ fmuld x4, x4, t0
|
|
+ LD x4, 12 * SIZE(XX)
|
|
+
|
|
+ faddd $f26, t1, a1
|
|
+ unop
|
|
+ fmuld x5, x5, t1
|
|
+ LD x5, 13 * SIZE(XX)
|
|
+
|
|
+ faddd $f27, t2, a2
|
|
+ unop
|
|
+ fmuld x6, x6, t2
|
|
+ LD x6, 14 * SIZE(XX)
|
|
+
|
|
+ faddd $f28, t3, a3
|
|
+ ldi X, 16 * SIZE(X)
|
|
+ fmuld x7, x7, t3
|
|
+ LD x7, 15 * SIZE(XX)
|
|
+
|
|
+ faddd a0, t0, $f25
|
|
+ fmuld x0, x0, t0
|
|
+ faddd a1, t1, $f26
|
|
+ fmuld x1, x1, t1
|
|
+
|
|
+ faddd a2, t2, $f27
|
|
+ fmuld x2, x2, t2
|
|
+ faddd a3, t3, $f28
|
|
+ fmuld x3, x3, t3
|
|
+
|
|
+ faddd $f25, t0, a0
|
|
+ fmuld x4, x4, t0
|
|
+ faddd $f26, t1, a1
|
|
+ fmuld x5, x5, t1
|
|
+
|
|
+ faddd $f27, t2, a2
|
|
+ fmuld x6, x6, t2
|
|
+ faddd $f28, t3, a3
|
|
+ fmuld x7, x7, t3
|
|
+
|
|
+ faddd a2, t2, $f27
|
|
+ fmov $f27, a2
|
|
+ faddd a3, t3, $f28
|
|
+ fmov $f28, a3
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and N, 7, I
|
|
+ ble I, $L998
|
|
+ .align 4
|
|
+
|
|
+$L16:
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ LD x1, 1 * SIZE(X)
|
|
+
|
|
+ ldi X, 2 * SIZE(X)
|
|
+
|
|
+ faddd a0, t0, $f25
|
|
+ fmov $f25, a0
|
|
+ fmuld x0, x0, t0
|
|
+ faddd a1, t1, $f26
|
|
+ fmov $f26, a1
|
|
+ fmuld x1, x1, t1
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L16
|
|
+ bsr $31, $L998
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ fclr t0
|
|
+ sra N, 2, I
|
|
+ fclr t1
|
|
+ ble I, $L25
|
|
+
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ fclr t2
|
|
+ LD x1, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD x2, 0 * SIZE(X)
|
|
+ fclr t3
|
|
+ LD x3, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD x4, 0 * SIZE(X)
|
|
+ ldi I, -1(I)
|
|
+ LD x5, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD x6, 0 * SIZE(X)
|
|
+ ble I, $L22
|
|
+ .align 4
|
|
+
|
|
+$L21:
|
|
+ faddd a0, t0, $f25
|
|
+ LD x7, 1 * SIZE(X)
|
|
+ fmuld x0, x0, t0
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a1, t1, $f26
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ fmuld x1, x1, t1
|
|
+ unop
|
|
+
|
|
+ faddd a2, t2, $f27
|
|
+ LD x1, 1 * SIZE(X)
|
|
+ fmuld x2, x2, t2
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a3, t3, $f28
|
|
+ LD x2, 0 * SIZE(X)
|
|
+ fmuld x3, x3, t3
|
|
+ unop
|
|
+
|
|
+ faddd $f25, t0, a0
|
|
+ LD x3, 1 * SIZE(X)
|
|
+ fmuld x4, x4, t0
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd $f26, t1, a1
|
|
+ LD x4, 0 * SIZE(X)
|
|
+ fmuld x5, x5, t1
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ faddd $f27, t2, a2
|
|
+ LD x5, 1 * SIZE(X)
|
|
+ fmuld x6, x6, t2
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd $f28, t3, a3
|
|
+ LD x6, 0 * SIZE(X)
|
|
+ fmuld x7, x7, t3
|
|
+ bgt I, $L21
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ faddd a0, t0, $f25
|
|
+ LD x7, 1 * SIZE(X)
|
|
+ fmuld x0, x0, t0
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a1, t1, $f26
|
|
+ fmuld x1, x1, t1
|
|
+ faddd a2, t2, $f27
|
|
+ fmuld x2, x2, t2
|
|
+
|
|
+ faddd a3, t3, $f28
|
|
+ fmuld x3, x3, t3
|
|
+ faddd $f25, t0, a0
|
|
+ fmuld x4, x4, t0
|
|
+
|
|
+ faddd $f26, t1, a1
|
|
+ fmuld x5, x5, t1
|
|
+ faddd $f27, t2, a2
|
|
+ fmuld x6, x6, t2
|
|
+
|
|
+ faddd $f28, t3, a3
|
|
+ fmuld x7, x7, t3
|
|
+
|
|
+ faddd a2, t2, $f27
|
|
+ fmov $f27, a2
|
|
+ faddd a3, t3, $f28
|
|
+ fmov $f28, a3
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ and N, 3, I
|
|
+ ble I, $L998
|
|
+ .align 4
|
|
+
|
|
+$L26:
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ ldi I, -1(I)
|
|
+ LD x1, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a0, t0, $f25
|
|
+ fmov $f25, a0
|
|
+ fmuld x0, x0, t0
|
|
+ faddd a1, t1, $f26
|
|
+ fmov $f26, a1
|
|
+ fmuld x1, x1, t1
|
|
+
|
|
+ bgt I, $L26
|
|
+ .align 4
|
|
+
|
|
+
|
|
+$L998:
|
|
+ faddd a0, t0, $f25
|
|
+ faddd a1, t1, $f26
|
|
+ fmov $f25, a0
|
|
+ fmov $f26, a1
|
|
+
|
|
+ faddd a0, a1, $f25
|
|
+ fmov $f25, a0
|
|
+ faddd a2, a3, $f26
|
|
+ fmov $f26, a2
|
|
+
|
|
+#if defined(EV4) || defined(EV5)
|
|
+ faddd a0, a2, $f16
|
|
+ jsr $26, ($27), sqrt !lituse_jsr!2
|
|
+
|
|
+ ldih $29, 0($26) !gpdisp!3
|
|
+ ldi $29, 0($29) !gpdisp!3
|
|
+#else
|
|
+ faddd a0, a2, $f25
|
|
+ fmov $f25, a0
|
|
+ fsqrtd a0, $f25
|
|
+ fmov $f25, a0
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+#if defined(EV4) || defined(EV5)
|
|
+ ldl $26, 0($sp)
|
|
+ ldi $sp, 16($sp)
|
|
+#endif
|
|
+ ret
|
|
+ EPILOGUE
|
|
diff --git a/kernel/sw_64/znrm2.S.bak b/kernel/sw_64/znrm2.S.bak
new file mode 100644
index 0000000..b2e80e0
--- /dev/null
+++ b/kernel/sw_64/znrm2.S.bak
@@ -0,0 +1,426 @@
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCH_SIZE 80
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+#define XX $19
|
|
+
|
|
+#define I $0
|
|
+
|
|
+#define a0 $f0
|
|
+#define a1 $f1
|
|
+#define a2 $f10
|
|
+#define a3 $f11
|
|
+#define t0 $f12
|
|
+#define t1 $f13
|
|
+#define t2 $f14
|
|
+#define t3 $f15
|
|
+
|
|
+#define x0 $f16
|
|
+#define x1 $f17
|
|
+#define x2 $f18
|
|
+#define x3 $f19
|
|
+#define x4 $f20
|
|
+#define x5 $f21
|
|
+#define x6 $f22
|
|
+#define x7 $f23
|
|
+
|
|
+ PROLOGUE
|
|
+
|
|
+#if defined(EV4) || defined(EV5)
|
|
+ .frame $30,16,$26,0
|
|
+ .mask 0x4000000,-16
|
|
+ ldih $29, 0($27) !gpdisp!1
|
|
+ ldi $29, 0($29) !gpdisp!1
|
|
+
|
|
+ ldi $sp, -16($sp)
|
|
+ ldl $27, sqrt($29) !literal!2
|
|
+ stq $26, 0($sp)
|
|
+
|
|
+ PROFCODE
|
|
+ .prologue 1
|
|
+#else
|
|
+ PROFCODE
|
|
+#endif
|
|
+
|
|
+ fclr a0
|
|
+ sll INCX, ZBASE_SHIFT, INCX
|
|
+ fclr a1
|
|
+ ble N, $L999
|
|
+
|
|
+ fclr a2
|
|
+ cmpeq INCX, 2 * SIZE, $0
|
|
+ fclr a3
|
|
+ beq $0, $L20
|
|
+
|
|
+ fclr t0
|
|
+ sra N, 3, I
|
|
+ fclr t1
|
|
+ ble I, $L15
|
|
+
|
|
+ fclr t2
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ fclr t3
|
|
+ LD x1, 1 * SIZE(X)
|
|
+
|
|
+ LD x2, 2 * SIZE(X)
|
|
+ LD x3, 3 * SIZE(X)
|
|
+ LD x4, 4 * SIZE(X)
|
|
+ LD x5, 5 * SIZE(X)
|
|
+ LD x6, 6 * SIZE(X)
|
|
+ LD x7, 7 * SIZE(X)
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L11:
|
|
+ faddd a0, t0, a0
|
|
+ fillcs (PREFETCH_SIZE) * SIZE(X)
|
|
+ fmuld x0, x0, t0
|
|
+ LD x0, 8 * SIZE(X)
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ mov X, XX
|
|
+ fmuld x1, x1, t1
|
|
+ LD x1, 9 * SIZE(X)
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ unop
|
|
+ fmuld x2, x2, t2
|
|
+ LD x2, 10 * SIZE(X)
|
|
+
|
|
+ faddd a3, t3, a3
|
|
+ unop
|
|
+ fmuld x3, x3, t3
|
|
+ LD x3, 11 * SIZE(X)
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ unop
|
|
+ fmuld x4, x4, t0
|
|
+ LD x4, 12 * SIZE(X)
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ unop
|
|
+ fmuld x5, x5, t1
|
|
+ LD x5, 13 * SIZE(X)
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ unop
|
|
+ fmuld x6, x6, t2
|
|
+ LD x6, 14 * SIZE(X)
|
|
+
|
|
+ faddd a3, t3, a3
|
|
+ unop
|
|
+ fmuld x7, x7, t3
|
|
+ LD x7, 15 * SIZE(X)
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ unop
|
|
+ fmuld x0, x0, t0
|
|
+ LD x0, 16 * SIZE(X)
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ ldi X, 16 * SIZE(X)
|
|
+ fmuld x1, x1, t1
|
|
+ LD x1, 17 * SIZE(XX)
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ unop
|
|
+ fmuld x2, x2, t2
|
|
+ LD x2, 18 * SIZE(XX)
|
|
+
|
|
+ faddd a3, t3, a3
|
|
+ unop
|
|
+ fmuld x3, x3, t3
|
|
+ LD x3, 19 * SIZE(XX)
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ unop
|
|
+ fmuld x4, x4, t0
|
|
+ LD x4, 20 * SIZE(XX)
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ ldi I, -1(I)
|
|
+ fmuld x5, x5, t1
|
|
+ LD x5, 21 * SIZE(XX)
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ unop
|
|
+ fmuld x6, x6, t2
|
|
+ LD x6, 22 * SIZE(XX)
|
|
+
|
|
+ faddd a3, t3, a3
|
|
+ fmuld x7, x7, t3
|
|
+ LD x7, 23 * SIZE(XX)
|
|
+ bgt I, $L11
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ faddd a0, t0, a0
|
|
+ mov X, XX
|
|
+ fmuld x0, x0, t0
|
|
+ LD x0, 8 * SIZE(X)
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ unop
|
|
+ fmuld x1, x1, t1
|
|
+ LD x1, 9 * SIZE(X)
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ unop
|
|
+ fmuld x2, x2, t2
|
|
+ LD x2, 10 * SIZE(X)
|
|
+
|
|
+ faddd a3, t3, a3
|
|
+ unop
|
|
+ fmuld x3, x3, t3
|
|
+ LD x3, 11 * SIZE(X)
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ unop
|
|
+ fmuld x4, x4, t0
|
|
+ LD x4, 12 * SIZE(XX)
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ unop
|
|
+ fmuld x5, x5, t1
|
|
+ LD x5, 13 * SIZE(XX)
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ unop
|
|
+ fmuld x6, x6, t2
|
|
+ LD x6, 14 * SIZE(XX)
|
|
+
|
|
+ faddd a3, t3, a3
|
|
+ ldi X, 16 * SIZE(X)
|
|
+ fmuld x7, x7, t3
|
|
+ LD x7, 15 * SIZE(XX)
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ fmuld x0, x0, t0
|
|
+ faddd a1, t1, a1
|
|
+ fmuld x1, x1, t1
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ fmuld x2, x2, t2
|
|
+ faddd a3, t3, a3
|
|
+ fmuld x3, x3, t3
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ fmuld x4, x4, t0
|
|
+ faddd a1, t1, a1
|
|
+ fmuld x5, x5, t1
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ fmuld x6, x6, t2
|
|
+ faddd a3, t3, a3
|
|
+ fmuld x7, x7, t3
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ faddd a3, t3, a3
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and N, 7, I
|
|
+ ble I, $L998
|
|
+ .align 4
|
|
+
|
|
+$L16:
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ LD x1, 1 * SIZE(X)
|
|
+
|
|
+ ldi X, 2 * SIZE(X)
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ fmuld x0, x0, t0
|
|
+ faddd a1, t1, a1
|
|
+ fmuld x1, x1, t1
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L16
|
|
+ bsr $31, $L998
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ fclr t0
|
|
+ sra N, 2, I
|
|
+ fclr t1
|
|
+ ble I, $L25
|
|
+
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ fclr t2
|
|
+ LD x1, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD x2, 0 * SIZE(X)
|
|
+ fclr t3
|
|
+ LD x3, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD x4, 0 * SIZE(X)
|
|
+ ldi I, -1(I)
|
|
+ LD x5, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD x6, 0 * SIZE(X)
|
|
+ ble I, $L22
|
|
+ .align 4
|
|
+
|
|
+$L21:
|
|
+ faddd a0, t0, a0
|
|
+ LD x7, 1 * SIZE(X)
|
|
+ fmuld x0, x0, t0
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ fmuld x1, x1, t1
|
|
+ unop
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ LD x1, 1 * SIZE(X)
|
|
+ fmuld x2, x2, t2
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a3, t3, a3
|
|
+ LD x2, 0 * SIZE(X)
|
|
+ fmuld x3, x3, t3
|
|
+ unop
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ LD x3, 1 * SIZE(X)
|
|
+ fmuld x4, x4, t0
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ LD x4, 0 * SIZE(X)
|
|
+ fmuld x5, x5, t1
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ faddd a2, t2, a2
|
|
+ LD x5, 1 * SIZE(X)
|
|
+ fmuld x6, x6, t2
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a3, t3, a3
|
|
+ LD x6, 0 * SIZE(X)
|
|
+ fmuld x7, x7, t3
|
|
+ bgt I, $L21
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ faddd a0, t0, a0
|
|
+ LD x7, 1 * SIZE(X)
|
|
+ fmuld x0, x0, t0
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ fmuld x1, x1, t1
|
|
+ faddd a2, t2, a2
|
|
+ fmuld x2, x2, t2
|
|
+
|
|
+ faddd a3, t3, a3
|
|
+ fmuld x3, x3, t3
|
|
+ faddd a0, t0, a0
|
|
+ fmuld x4, x4, t0
|
|
+
|
|
+ faddd a1, t1, a1
|
|
+ fmuld x5, x5, t1
|
|
+ faddd a2, t2, a2
|
|
+ fmuld x6, x6, t2
|
|
+
|
|
+ faddd a3, t3, a3
|
|
+ fmuld x7, x7, t3
|
|
+ faddd a2, t2, a2
|
|
+ faddd a3, t3, a3
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ and N, 3, I
|
|
+ ble I, $L998
|
|
+ .align 4
|
|
+
|
|
+$L26:
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ ldi I, -1(I)
|
|
+ LD x1, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ faddd a0, t0, a0
|
|
+ fmuld x0, x0, t0
|
|
+ faddd a1, t1, a1
|
|
+ fmuld x1, x1, t1
|
|
+
|
|
+ bgt I, $L26
|
|
+ .align 4
|
|
+
|
|
+
|
|
+$L998:
|
|
+ faddd a0, t0, a0
|
|
+ faddd a1, t1, a1
|
|
+
|
|
+ faddd a0, a1, a0
|
|
+ faddd a2, a3, a2
|
|
+
|
|
+#if defined(EV4) || defined(EV5)
|
|
+ faddd a0, a2, $f16
|
|
+ jsr $26, ($27), sqrt !lituse_jsr!2
|
|
+
|
|
+ ldih $29, 0($26) !gpdisp!3
|
|
+ ldi $29, 0($29) !gpdisp!3
|
|
+#else
|
|
+ faddd a0, a2, a0
|
|
+ fsqrtd a0, a0
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+#if defined(EV4) || defined(EV5)
|
|
+ ldl $26, 0($sp)
|
|
+ ldi $sp, 16($sp)
|
|
+#endif
|
|
+ ret
|
|
+ EPILOGUE
|
|
diff --git a/kernel/sw_64/znrm2_simd.S b/kernel/sw_64/znrm2_simd.S
new file mode 100644
index 0000000..5a509d4
--- /dev/null
+++ b/kernel/sw_64/znrm2_simd.S
@@ -0,0 +1,492 @@
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCHSIZE 80
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+#define XX $19
|
|
+
|
|
+#define I $0
|
|
+
|
|
+#define a0 $f0
|
|
+#define a1 $f1
|
|
+#define a2 $f10
|
|
+#define a3 $f11
|
|
+#define t0 $f12
|
|
+#define t1 $f13
|
|
+#define t2 $f14
|
|
+#define t3 $f15
|
|
+
|
|
+#define x0 $f16
|
|
+#define x1 $f17
|
|
+#define x2 $f18
|
|
+#define x3 $f19
|
|
+#define x4 $f20
|
|
+#define x5 $f21
|
|
+#define x6 $f22
|
|
+#define x7 $f23
|
|
+
|
|
+ PROLOGUE
|
|
+
|
|
+ PROFCODE
|
|
+
|
|
+ fclr a0
|
|
+ sll INCX, ZBASE_SHIFT, INCX
|
|
+ fclr a1
|
|
+ ble N, $L999
|
|
+
|
|
+ fclr a2
|
|
+ cmpeq INCX, 2 * SIZE, $0
|
|
+ fclr a3
|
|
+ beq $0, $L20 # strided-access path
|
|
+
|
|
+
|
|
+/* test whether the address of X is vector-aligned */
|
|
+ and X, (VEC_LEN*SIZE-1), $3
|
|
+ fclr t0
|
|
+ fclr t1
|
|
+ bne $3, $UnAlign_ACCESS
|
|
+/* Aligned access: use SIMD instructions, unrolled over 8 complex elements */
|
|
+ sra N, 3, I
|
|
+ ble I, $Remain
|
|
+
|
|
+ VLD a0, 0*VEC_LEN*SIZE(X)
|
|
+ vcpys $f31, $f31, t0 #clear s0 vector
|
|
+ VLD a1, 1*VEC_LEN*SIZE(X)
|
|
+ vcpys $f31, $f31, t1
|
|
+
|
|
+ VLD a2, 2*VEC_LEN*SIZE(X)
|
|
+ vcpys $f31, $f31, t2
|
|
+ VLD a3, 3*VEC_LEN*SIZE(X)
|
|
+ vcpys $f31, $f31, t3
|
|
+
|
|
+ addl X, 16 * SIZE, X
|
|
+ subl I, 1, I
|
|
+ nop
|
|
+ ble I, $MainLoopEnd
|
|
+$MainLoop:
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ VMAD a0, a0, t0, t0
|
|
+ subl I, 1, I
|
|
+ VMAD a1, a1, t1, t1
|
|
+
|
|
+ addl X, 16 * SIZE, X
|
|
+ VMAD a2, a2, t2, t2
|
|
+ nop
|
|
+ VMAD a3, a3, t3, t3
|
|
+
|
|
+ VLD a0, -4*VEC_LEN*SIZE(X)
|
|
+ VLD a1, -3*VEC_LEN*SIZE(X)
|
|
+ VLD a2, -2*VEC_LEN*SIZE(X)
|
|
+ VLD a3, -1*VEC_LEN*SIZE(X)
|
|
+
|
|
+ bgt I, $MainLoop
|
|
+ .align 4
|
|
+$MainLoopEnd:
|
|
+ VMAD a0, a0, t0, t0
|
|
+ VMAD a1, a1, t1, t1
|
|
+ VMAD a2, a2, t2, t2
|
|
+ VMAD a3, a3, t3, t3
|
|
+
|
|
+ VADD t0, t1, a0
|
|
+ VADD t2, t3, a1
|
|
+ nop
|
|
+ VADD a0, a1, t0
|
|
+
|
|
+ vextf t0, 1, t1
|
|
+ vextf t0, 2, t2
|
|
+ vextf t0, 3, t3
|
|
+ nop
|
|
+
|
|
+ ADD t0, t1, a2
|
|
+ ADD t2, t3, a3
|
|
+ fclr t1
|
|
+ ADD a2, a3, t0
|
|
+
|
|
+ .align 4
|
|
+$Remain:
|
|
+ and N, 7, I
|
|
+ ble I, $End
|
|
+ .align 4
|
|
+$RemainLoop:
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ LD a1, 1 * SIZE(X)
|
|
+
|
|
+ addl X, 2*SIZE, X
|
|
+ MAD a0, a0, t0, t0
|
|
+ subl I, 1, I
|
|
+ MAD a1, a1, t1, t1
|
|
+
|
|
+ bgt I, $RemainLoop
|
|
+ .align 4
|
|
+
|
|
+ ADD t0, t1, t0
|
|
+$End:
|
|
+ SQRT t0, a0
|
|
+ ret
|
|
+ .align 4
|
|
+
|
|
+$UnAlign_ACCESS:
|
|
+
|
|
+ fclr t0
|
|
+ sra N, 3, I
|
|
+ fclr t1
|
|
+ ble I, $L15
|
|
+
|
|
+ fclr t2
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ fclr t3
|
|
+ LD x1, 1 * SIZE(X)
|
|
+
|
|
+ LD x2, 2 * SIZE(X)
|
|
+ LD x3, 3 * SIZE(X)
|
|
+ LD x4, 4 * SIZE(X)
|
|
+ LD x5, 5 * SIZE(X)
|
|
+ LD x6, 6 * SIZE(X)
|
|
+ LD x7, 7 * SIZE(X)
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L11:
|
|
+ ADD a0, t0, a0
|
|
+ fillcs (PREFETCHSIZE) * SIZE(X)
|
|
+ MUL x0, x0, t0
|
|
+ LD x0, 8 * SIZE(X)
|
|
+
|
|
+ ADD a1, t1, a1
|
|
+ mov X, XX
|
|
+ MUL x1, x1, t1
|
|
+ LD x1, 9 * SIZE(X)
|
|
+
|
|
+ ADD a2, t2, a2
|
|
+ unop
|
|
+ MUL x2, x2, t2
|
|
+ LD x2, 10 * SIZE(X)
|
|
+
|
|
+ ADD a3, t3, a3
|
|
+ unop
|
|
+ MUL x3, x3, t3
|
|
+ LD x3, 11 * SIZE(X)
|
|
+
|
|
+ ADD a0, t0, a0
|
|
+ unop
|
|
+ MUL x4, x4, t0
|
|
+ LD x4, 12 * SIZE(X)
|
|
+
|
|
+ ADD a1, t1, a1
|
|
+ unop
|
|
+ MUL x5, x5, t1
|
|
+ LD x5, 13 * SIZE(X)
|
|
+
|
|
+ ADD a2, t2, a2
|
|
+ unop
|
|
+ MUL x6, x6, t2
|
|
+ LD x6, 14 * SIZE(X)
|
|
+
|
|
+ ADD a3, t3, a3
|
|
+ unop
|
|
+ MUL x7, x7, t3
|
|
+ LD x7, 15 * SIZE(X)
|
|
+
|
|
+ ADD a0, t0, a0
|
|
+ unop
|
|
+ MUL x0, x0, t0
|
|
+ LD x0, 16 * SIZE(X)
|
|
+
|
|
+ ADD a1, t1, a1
|
|
+ ldi X, 16 * SIZE(X)
|
|
+ MUL x1, x1, t1
|
|
+ LD x1, 17 * SIZE(XX)
|
|
+
|
|
+ ADD a2, t2, a2
|
|
+ unop
|
|
+ MUL x2, x2, t2
|
|
+ LD x2, 18 * SIZE(XX)
|
|
+
|
|
+ ADD a3, t3, a3
|
|
+ unop
|
|
+ MUL x3, x3, t3
|
|
+ LD x3, 19 * SIZE(XX)
|
|
+
|
|
+ ADD a0, t0, a0
|
|
+ unop
|
|
+ MUL x4, x4, t0
|
|
+ LD x4, 20 * SIZE(XX)
|
|
+
|
|
+ ADD a1, t1, a1
|
|
+ ldi I, -1(I)
|
|
+ MUL x5, x5, t1
|
|
+ LD x5, 21 * SIZE(XX)
|
|
+
|
|
+ ADD a2, t2, a2
|
|
+ unop
|
|
+ MUL x6, x6, t2
|
|
+ LD x6, 22 * SIZE(XX)
|
|
+
|
|
+ ADD a3, t3, a3
|
|
+ MUL x7, x7, t3
|
|
+ LD x7, 23 * SIZE(XX)
|
|
+ bgt I, $L11
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ ADD a0, t0, a0
|
|
+ mov X, XX
|
|
+ MUL x0, x0, t0
|
|
+ LD x0, 8 * SIZE(X)
|
|
+
|
|
+ ADD a1, t1, a1
|
|
+ unop
|
|
+ MUL x1, x1, t1
|
|
+ LD x1, 9 * SIZE(X)
|
|
+
|
|
+ ADD a2, t2, a2
|
|
+ unop
|
|
+ MUL x2, x2, t2
|
|
+ LD x2, 10 * SIZE(X)
|
|
+
|
|
+ ADD a3, t3, a3
|
|
+ unop
|
|
+ MUL x3, x3, t3
|
|
+ LD x3, 11 * SIZE(X)
|
|
+
|
|
+ ADD a0, t0, a0
|
|
+ unop
|
|
+ MUL x4, x4, t0
|
|
+ LD x4, 12 * SIZE(XX)
|
|
+
|
|
+ ADD a1, t1, a1
|
|
+ unop
|
|
+ MUL x5, x5, t1
|
|
+ LD x5, 13 * SIZE(XX)
|
|
+
|
|
+ ADD a2, t2, a2
|
|
+ unop
|
|
+ MUL x6, x6, t2
|
|
+ LD x6, 14 * SIZE(XX)
|
|
+
|
|
+ ADD a3, t3, a3
|
|
+ ldi X, 16 * SIZE(X)
|
|
+ MUL x7, x7, t3
|
|
+ LD x7, 15 * SIZE(XX)
|
|
+
|
|
+ ADD a0, t0, a0
|
|
+ MUL x0, x0, t0
|
|
+ ADD a1, t1, a1
|
|
+ MUL x1, x1, t1
|
|
+
|
|
+ ADD a2, t2, a2
|
|
+ MUL x2, x2, t2
|
|
+ ADD a3, t3, a3
|
|
+ MUL x3, x3, t3
|
|
+
|
|
+ ADD a0, t0, a0
|
|
+ MUL x4, x4, t0
|
|
+ ADD a1, t1, a1
|
|
+ MUL x5, x5, t1
|
|
+
|
|
+ ADD a2, t2, a2
|
|
+ MUL x6, x6, t2
|
|
+ ADD a3, t3, a3
|
|
+ MUL x7, x7, t3
|
|
+
|
|
+ ADD a2, t2, a2
|
|
+ ADD a3, t3, a3
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and N, 7, I
|
|
+ ble I, $L998
|
|
+ .align 4
|
|
+
|
|
+$L16:
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ LD x1, 1 * SIZE(X)
|
|
+
|
|
+ ldi X, 2 * SIZE(X)
|
|
+
|
|
+ ADD a0, t0, a0
|
|
+ MUL x0, x0, t0
|
|
+ ADD a1, t1, a1
|
|
+ MUL x1, x1, t1
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L16
|
|
+ bsr $31, $L998
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ fclr t0
|
|
+ sra N, 2, I
|
|
+ fclr t1
|
|
+ ble I, $L25
|
|
+
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ fclr t2
|
|
+ LD x1, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+ LD x2, 0 * SIZE(X)
|
|
+ fclr t3
|
|
+ LD x3, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD x4, 0 * SIZE(X)
|
|
+ ldi I, -1(I)
|
|
+ LD x5, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ LD x6, 0 * SIZE(X)
|
|
+ ble I, $L22
|
|
+ .align 4
|
|
+
|
|
+$L21:
|
|
+ ADD a0, t0, a0
|
|
+ LD x7, 1 * SIZE(X)
|
|
+ MUL x0, x0, t0
|
|
+ addl X, INCX, X
|
|
+
|
|
+ ADD a1, t1, a1
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ MUL x1, x1, t1
|
|
+ unop
|
|
+
|
|
+ ADD a2, t2, a2
|
|
+ LD x1, 1 * SIZE(X)
|
|
+ MUL x2, x2, t2
|
|
+ addl X, INCX, X
|
|
+
|
|
+ ADD a3, t3, a3
|
|
+ LD x2, 0 * SIZE(X)
|
|
+ MUL x3, x3, t3
|
|
+ unop
|
|
+
|
|
+ ADD a0, t0, a0
|
|
+ LD x3, 1 * SIZE(X)
|
|
+ MUL x4, x4, t0
|
|
+ addl X, INCX, X
|
|
+
|
|
+ ADD a1, t1, a1
|
|
+ LD x4, 0 * SIZE(X)
|
|
+ MUL x5, x5, t1
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ADD a2, t2, a2
|
|
+ LD x5, 1 * SIZE(X)
|
|
+ MUL x6, x6, t2
|
|
+ addl X, INCX, X
|
|
+
|
|
+ ADD a3, t3, a3
|
|
+ LD x6, 0 * SIZE(X)
|
|
+ MUL x7, x7, t3
|
|
+ bgt I, $L21
|
|
+ .align 4
|
|
+
|
|
+$L22:
|
|
+ ADD a0, t0, a0
|
|
+ LD x7, 1 * SIZE(X)
|
|
+ MUL x0, x0, t0
|
|
+ addl X, INCX, X
|
|
+
|
|
+ ADD a1, t1, a1
|
|
+ MUL x1, x1, t1
|
|
+ ADD a2, t2, a2
|
|
+ MUL x2, x2, t2
|
|
+
|
|
+ ADD a3, t3, a3
|
|
+ MUL x3, x3, t3
|
|
+ ADD a0, t0, a0
|
|
+ MUL x4, x4, t0
|
|
+
|
|
+ ADD a1, t1, a1
|
|
+ MUL x5, x5, t1
|
|
+ ADD a2, t2, a2
|
|
+ MUL x6, x6, t2
|
|
+
|
|
+ ADD a3, t3, a3
|
|
+ MUL x7, x7, t3
|
|
+ ADD a2, t2, a2
|
|
+ ADD a3, t3, a3
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ and N, 3, I
|
|
+ ble I, $L998
|
|
+ .align 4
|
|
+
|
|
+$L26:
|
|
+ LD x0, 0 * SIZE(X)
|
|
+ ldi I, -1(I)
|
|
+ LD x1, 1 * SIZE(X)
|
|
+ addl X, INCX, X
|
|
+
|
|
+ ADD a0, t0, a0
|
|
+ MUL x0, x0, t0
|
|
+ ADD a1, t1, a1
|
|
+ MUL x1, x1, t1
|
|
+
|
|
+ bgt I, $L26
|
|
+ .align 4
|
|
+
|
|
+
|
|
+$L998:
|
|
+ ADD a0, t0, a0
|
|
+ ADD a1, t1, a1
|
|
+
|
|
+ ADD a0, a1, a0
|
|
+ ADD a2, a3, a2
|
|
+
|
|
+
|
|
+
|
|
+ ADD a0, a2, a0
|
|
+ SQRT a0, a0
|
|
+
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+
|
|
+ ret
|
|
+ EPILOGUE
|
|
diff --git a/kernel/sw_64/zrot.S b/kernel/sw_64/zrot.S
new file mode 100644
index 0000000..9016a00
--- /dev/null
+++ b/kernel/sw_64/zrot.S
@@ -0,0 +1,689 @@
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+#define Y $19
|
|
+#define INCY $20
|
|
+#define I $21
|
|
+#define XX $23
|
|
+#define YY $24
|
|
+
|
|
+#define b9 $f29
|
|
+
|
|
+#define C $f10
|
|
+#define S $f11
|
|
+
|
|
+#define PREFETCH_SIZE 80
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+ .frame $sp, 0, $26, 0
|
|
+
|
|
+#ifndef PROFILE
|
|
+ .prologue 0
|
|
+#else
|
|
+ .prologue 1
|
|
+#endif
|
|
+
|
|
+ fmov $f21, C
|
|
+ LD S, 0($sp)
|
|
+
|
|
+ addl INCX, INCX, INCX
|
|
+ addl INCY, INCY, INCY
|
|
+
|
|
+ cmpeq INCX, 2, $23
|
|
+ cmpeq INCY, 2, $24
|
|
+ ble N, $L998
|
|
+
|
|
+ and $23, $24, $23
|
|
+ beq $23, $L50
|
|
+
|
|
+ sra N, 2, I
|
|
+ ble I, $L15
|
|
+
|
|
+ LD $f12, 0*SIZE(X)
|
|
+ LD $f13, 0*SIZE(Y)
|
|
+ LD $f14, 1*SIZE(X)
|
|
+ LD $f15, 1*SIZE(Y)
|
|
+
|
|
+ LD $f16, 2*SIZE(X)
|
|
+ LD $f17, 2*SIZE(Y)
|
|
+ LD $f18, 3*SIZE(X)
|
|
+ LD $f19, 3*SIZE(Y)
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ unop
|
|
+ MUL S, $f13, $f22
|
|
+ MUL C, $f13, $f23
|
|
+
|
|
+ LD $f13, 4*SIZE(Y)
|
|
+ MUL S, $f12, $f24
|
|
+ LD $f12, 4*SIZE(X)
|
|
+ MUL C, $f14, $f25
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ MUL S, $f15, $f26
|
|
+ ADD $f21, $f22, b9
|
|
+ fmov b9, $f22
|
|
+ MUL C, $f15, $f27
|
|
+
|
|
+ LD $f15, 5*SIZE(Y)
|
|
+ MUL S, $f14, $f28
|
|
+ SUB $f23, $f24, b9
|
|
+ fmov b9, $f24
|
|
+ ble I, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ MUL C, $f16, $f21
|
|
+ fillcs (PREFETCH_SIZE) * SIZE(X)
|
|
+ unop
|
|
+ LD $f14, 5*SIZE(X)
|
|
+
|
|
+ ST $f22, 0*SIZE(X)
|
|
+ MUL S, $f17, $f22
|
|
+ unop
|
|
+ ADD $f25, $f26, b9
|
|
+ fmov b9, $f26
|
|
+
|
|
+ MUL C, $f17, $f23
|
|
+ fillcs (PREFETCH_SIZE) * SIZE(Y)
|
|
+ unop
|
|
+ LD $f17, 6*SIZE(Y)
|
|
+
|
|
+ ST $f24, 0*SIZE(Y)
|
|
+ MUL S, $f16, $f24
|
|
+ unop
|
|
+ SUB $f27, $f28, b9
|
|
+ fmov b9, $f28
|
|
+
|
|
+ MUL C, $f18, $f25
|
|
+ LD $f16, 6*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f26, 1*SIZE(X)
|
|
+ MUL S, $f19, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, b9
|
|
+ fmov b9, $f22
|
|
+
|
|
+ MUL C, $f19, $f27
|
|
+ unop
|
|
+ unop
|
|
+ LD $f19, 7*SIZE(Y)
|
|
+
|
|
+ ST $f28, 1*SIZE(Y)
|
|
+ MUL S, $f18, $f28
|
|
+ unop
|
|
+ SUB $f23, $f24, b9
|
|
+ fmov b9, $f24
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ LD $f18, 7*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f22, 2*SIZE(X)
|
|
+ unop
|
|
+ MUL S, $f13, $f22
|
|
+ ADD $f25, $f26, b9
|
|
+ fmov b9, $f26
|
|
+
|
|
+ MUL C, $f13, $f23
|
|
+ LD $f13, 8*SIZE(Y)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f24, 2*SIZE(Y)
|
|
+ MUL S, $f12, $f24
|
|
+ unop
|
|
+ SUB $f27, $f28, b9
|
|
+ fmov b9, $f28
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ LD $f12, 8*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f26, 3*SIZE(X)
|
|
+ MUL S, $f15, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, b9
|
|
+ fmov b9, $f22
|
|
+
|
|
+ MUL C, $f15, $f27
|
|
+ LD $f15, 9*SIZE(Y)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f28, 3*SIZE(Y)
|
|
+ MUL S, $f14, $f28
|
|
+ unop
|
|
+ SUB $f23, $f24, b9
|
|
+ fmov b9, $f24
|
|
+
|
|
+ MUL C, $f16, $f21
|
|
+ LD $f14, 9*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f22, 4*SIZE(X)
|
|
+ MUL S, $f17, $f22
|
|
+ unop
|
|
+ ADD $f25, $f26, b9
|
|
+ fmov b9, $f26
|
|
+
|
|
+ MUL C, $f17, $f23
|
|
+ LD $f17, 10*SIZE(Y)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f24, 4*SIZE(Y)
|
|
+ MUL S, $f16, $f24
|
|
+ unop
|
|
+ SUB $f27, $f28, b9
|
|
+ fmov b9, $f28
|
|
+
|
|
+ MUL C, $f18, $f25
|
|
+ LD $f16, 10*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f26, 5*SIZE(X)
|
|
+ MUL S, $f19, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, b9
|
|
+ fmov b9, $f22
|
|
+
|
|
+ MUL C, $f19, $f27
|
|
+ LD $f19, 11*SIZE(Y)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f28, 5*SIZE(Y)
|
|
+ MUL S, $f18, $f28
|
|
+ ldi I, -1(I)
|
|
+ SUB $f23, $f24, b9
|
|
+ fmov b9, $f24
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ LD $f18, 11*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f22, 6*SIZE(X)
|
|
+ MUL S, $f13, $f22
|
|
+ unop
|
|
+ ADD $f25, $f26, b9
|
|
+ fmov b9, $f26
|
|
+
|
|
+ MUL C, $f13, $f23
|
|
+ LD $f13, 12*SIZE(Y)
|
|
+ ldi X, 8*SIZE(X)
|
|
+ unop
|
|
+
|
|
+ ST $f24, 6*SIZE(Y)
|
|
+ MUL S, $f12, $f24
|
|
+ unop
|
|
+ SUB $f27, $f28, b9
|
|
+ fmov b9, $f28
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ LD $f12, 4*SIZE(X)
|
|
+ ldi Y, 8*SIZE(Y)
|
|
+ unop
|
|
+
|
|
+ ST $f26, -1*SIZE(X)
|
|
+ MUL S, $f15, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, b9
|
|
+ fmov b9, $f22
|
|
+
|
|
+ MUL C, $f15, $f27
|
|
+ LD $f15, 5*SIZE(Y)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f28, -1*SIZE(Y)
|
|
+ MUL S, $f14, $f28
|
|
+ SUB $f23, $f24, b9
|
|
+ fmov b9, $f24
|
|
+ bgt I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ MUL C, $f16, $f21
|
|
+ LD $f14, 5*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f22, 0*SIZE(X)
|
|
+ MUL S, $f17, $f22
|
|
+ unop
|
|
+ ADD $f25, $f26, b9
|
|
+ fmov b9, $f26
|
|
+
|
|
+ MUL C, $f17, $f23
|
|
+ unop
|
|
+ unop
|
|
+ LD $f17, 6*SIZE(Y)
|
|
+
|
|
+ ST $f24, 0*SIZE(Y)
|
|
+ MUL S, $f16, $f24
|
|
+ LD $f16, 6*SIZE(X)
|
|
+ SUB $f27, $f28, b9
|
|
+ fmov b9, $f28
|
|
+
|
|
+ MUL C, $f18, $f25
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f26, 1*SIZE(X)
|
|
+ MUL S, $f19, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, b9
|
|
+ fmov b9, $f22
|
|
+
|
|
+ MUL C, $f19, $f27
|
|
+ unop
|
|
+ unop
|
|
+ LD $f19, 7*SIZE(Y)
|
|
+
|
|
+ ST $f28, 1*SIZE(Y)
|
|
+ MUL S, $f18, $f28
|
|
+ LD $f18, 7*SIZE(X)
|
|
+ SUB $f23, $f24, b9
|
|
+ fmov b9, $f24
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f22, 2*SIZE(X)
|
|
+ unop
|
|
+ MUL S, $f13, $f22
|
|
+ ADD $f25, $f26, b9
|
|
+ fmov b9, $f26
|
|
+
|
|
+ MUL C, $f13, $f23
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f24, 2*SIZE(Y)
|
|
+ MUL S, $f12, $f24
|
|
+ unop
|
|
+ SUB $f27, $f28, b9
|
|
+ fmov b9, $f28
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f26, 3*SIZE(X)
|
|
+ MUL S, $f15, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, b9
|
|
+ fmov b9, $f22
|
|
+
|
|
+ MUL C, $f15, $f27
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f28, 3*SIZE(Y)
|
|
+ MUL S, $f14, $f28
|
|
+ unop
|
|
+ SUB $f23, $f24, b9
|
|
+ fmov b9, $f24
|
|
+
|
|
+ MUL C, $f16, $f21
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f22, 4*SIZE(X)
|
|
+ MUL S, $f17, $f22
|
|
+ unop
|
|
+ ADD $f25, $f26, b9
|
|
+ fmov b9, $f26
|
|
+
|
|
+ MUL C, $f17, $f23
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f24, 4*SIZE(Y)
|
|
+ MUL S, $f16, $f24
|
|
+ unop
|
|
+ SUB $f27, $f28, b9
|
|
+ fmov b9, $f28
|
|
+
|
|
+ MUL C, $f18, $f25
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f26, 5*SIZE(X)
|
|
+ MUL S, $f19, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, b9
|
|
+ fmov b9, $f22
|
|
+
|
|
+ MUL C, $f19, $f27
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f28, 5*SIZE(Y)
|
|
+ MUL S, $f18, $f28
|
|
+ unop
|
|
+ SUB $f23, $f24, b9
|
|
+ fmov b9, $f24
|
|
+
|
|
+ ST $f22, 6*SIZE(X)
|
|
+ ADD $f25, $f26, b9
|
|
+ fmov b9, $f26
|
|
+ ST $f24, 6*SIZE(Y)
|
|
+ SUB $f27, $f28, b9
|
|
+ fmov b9, $f28
|
|
+
|
|
+ ST $f26, 7*SIZE(X)
|
|
+ ldi X, 8*SIZE(X)
|
|
+ ST $f28, 7*SIZE(Y)
|
|
+ ldi Y, 8*SIZE(Y)
|
|
+ .align 4
|
|
+
|
|
+
|
|
+$L15:
|
|
+ and N, 3, I
|
|
+ ble I, $L998
|
|
+ .align 4
|
|
+
|
|
+$L16:
|
|
+ LD $f12, 0*SIZE(X)
|
|
+ LD $f13, 0*SIZE(Y)
|
|
+ LD $f14, 1*SIZE(X)
|
|
+ LD $f15, 1*SIZE(Y)
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ MUL S, $f13, $f22
|
|
+ MUL C, $f13, $f23
|
|
+ MUL S, $f12, $f24
|
|
+
|
|
+ ADD $f21, $f22, b9
|
|
+ fmov b9, $f22
|
|
+ SUB $f23, $f24, b9
|
|
+ fmov b9, $f24
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ MUL S, $f15, $f26
|
|
+ MUL C, $f15, $f27
|
|
+ MUL S, $f14, $f28
|
|
+
|
|
+ ADD $f25, $f26, b9
|
|
+ fmov b9, $f26
|
|
+ SUB $f27, $f28, b9
|
|
+ fmov b9, $f28
|
|
+
|
|
+ ST $f22, 0*SIZE(X)
|
|
+ ST $f24, 0*SIZE(Y)
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ST $f26, 1*SIZE(X)
|
|
+ ldi X, 2 * SIZE(X)
|
|
+ ST $f28, 1*SIZE(Y)
|
|
+ ldi Y, 2 * SIZE(Y)
|
|
+
|
|
+ bgt I, $L16
|
|
+ .align 4
|
|
+
|
|
+$L998:
|
|
+ clr $0
|
|
+ ret
|
|
+ .align 4
|
|
+
|
|
+$L50:
|
|
+ mov X, XX
|
|
+ mov Y, YY
|
|
+
|
|
+ sra N, 2, I
|
|
+ ble I, $L55
|
|
+ .align 4
|
|
+
|
|
+$L51:
|
|
+ LD $f12, 0*SIZE(X)
|
|
+ LD $f13, 0*SIZE(Y)
|
|
+ LD $f14, 1*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f15, 1*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ MUL S, $f13, $f22
|
|
+ MUL C, $f13, $f23
|
|
+ MUL S, $f12, $f24
|
|
+
|
|
+ ADD $f21, $f22, b9
|
|
+ fmov b9, $f22
|
|
+ SUB $f23, $f24, b9
|
|
+ fmov b9, $f24
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ MUL S, $f15, $f26
|
|
+ MUL C, $f15, $f27
|
|
+ MUL S, $f14, $f28
|
|
+
|
|
+ ADD $f25, $f26, b9
|
|
+ fmov b9, $f26
|
|
+ SUB $f27, $f28, b9
|
|
+ fmov b9, $f28
|
|
+
|
|
+ ST $f22, 0*SIZE(XX)
|
|
+ ST $f24, 0*SIZE(YY)
|
|
+ ST $f26, 1*SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST $f28, 1*SIZE(YY)
|
|
+ SXADDQ INCY, YY, YY
|
|
+
|
|
+
|
|
+ LD $f12, 0*SIZE(X)
|
|
+ LD $f13, 0*SIZE(Y)
|
|
+ LD $f14, 1*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f15, 1*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ MUL S, $f13, $f22
|
|
+ MUL C, $f13, $f23
|
|
+ MUL S, $f12, $f24
|
|
+
|
|
+ ADD $f21, $f22, b9
|
|
+ fmov b9, $f22
|
|
+ SUB $f23, $f24, b9
|
|
+ fmov b9, $f24
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ MUL S, $f15, $f26
|
|
+ MUL C, $f15, $f27
|
|
+ MUL S, $f14, $f28
|
|
+
|
|
+ ADD $f25, $f26, b9
|
|
+ fmov b9, $f26
|
|
+ SUB $f27, $f28, b9
|
|
+ fmov b9, $f28
|
|
+
|
|
+ ST $f22, 0*SIZE(XX)
|
|
+ ST $f24, 0*SIZE(YY)
|
|
+ ST $f26, 1*SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST $f28, 1*SIZE(YY)
|
|
+ SXADDQ INCY, YY, YY
|
|
+
|
|
+
|
|
+ LD $f12, 0*SIZE(X)
|
|
+ LD $f13, 0*SIZE(Y)
|
|
+ LD $f14, 1*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f15, 1*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ MUL S, $f13, $f22
|
|
+ MUL C, $f13, $f23
|
|
+ MUL S, $f12, $f24
|
|
+
|
|
+ ADD $f21, $f22, b9
|
|
+ fmov b9, $f22
|
|
+ SUB $f23, $f24, b9
|
|
+ fmov b9, $f24
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ MUL S, $f15, $f26
|
|
+ MUL C, $f15, $f27
|
|
+ MUL S, $f14, $f28
|
|
+
|
|
+ ADD $f25, $f26, b9
|
|
+ fmov b9, $f26
|
|
+ SUB $f27, $f28, b9
|
|
+ fmov b9, $f28
|
|
+
|
|
+ ST $f22, 0*SIZE(XX)
|
|
+ ST $f24, 0*SIZE(YY)
|
|
+ ST $f26, 1*SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST $f28, 1*SIZE(YY)
|
|
+ SXADDQ INCY, YY, YY
|
|
+
|
|
+
|
|
+ LD $f12, 0*SIZE(X)
|
|
+ LD $f13, 0*SIZE(Y)
|
|
+ LD $f14, 1*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f15, 1*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ MUL S, $f13, $f22
|
|
+ MUL C, $f13, $f23
|
|
+ MUL S, $f12, $f24
|
|
+
|
|
+ ADD $f21, $f22, b9
|
|
+ fmov b9, $f22
|
|
+ SUB $f23, $f24, b9
|
|
+ fmov b9, $f24
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ MUL S, $f15, $f26
|
|
+ MUL C, $f15, $f27
|
|
+ MUL S, $f14, $f28
|
|
+
|
|
+ ADD $f25, $f26, b9
|
|
+ fmov b9, $f26
|
|
+ SUB $f27, $f28, b9
|
|
+ fmov b9, $f28
|
|
+
|
|
+ ST $f22, 0*SIZE(XX)
|
|
+ ST $f24, 0*SIZE(YY)
|
|
+ ST $f26, 1*SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST $f28, 1*SIZE(YY)
|
|
+ SXADDQ INCY, YY, YY
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L51
|
|
+ .align 4
|
|
+
|
|
+$L55:
|
|
+ and N, 3, I
|
|
+ ble I, $L999
|
|
+ .align 4
|
|
+
|
|
+$L56:
|
|
+ LD $f12, 0*SIZE(X)
|
|
+ LD $f13, 0*SIZE(Y)
|
|
+ LD $f14, 1*SIZE(X)
|
|
+ LD $f15, 1*SIZE(Y)
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ MUL S, $f13, $f22
|
|
+ MUL C, $f13, $f23
|
|
+ MUL S, $f12, $f24
|
|
+
|
|
+ ADD $f21, $f22, b9
|
|
+ fmov b9, $f22
|
|
+ SUB $f23, $f24, b9
|
|
+ fmov b9, $f24
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ MUL S, $f15, $f26
|
|
+ MUL C, $f15, $f27
|
|
+ MUL S, $f14, $f28
|
|
+
|
|
+ ADD $f25, $f26, b9
|
|
+ fmov b9, $f26
|
|
+ SUB $f27, $f28, b9
|
|
+ fmov b9, $f28
|
|
+
|
|
+ ST $f22, 0*SIZE(X)
|
|
+ ST $f24, 0*SIZE(Y)
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ST $f26, 1*SIZE(X)
|
|
+ ST $f28, 1*SIZE(Y)
|
|
+ SXADDQ INCX, X, X
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ bgt I, $L56
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ clr $0
|
|
+ ret
|
|
+ EPILOGUE
|
|
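For reference, kernel/sw_64/zrot.S above applies a plane rotation with scalar c and s to a pair of complex vectors: every real and imaginary component is updated as x' = c*x + s*y and y' = c*y - s*x. A minimal C sketch of the unit-stride path follows; it assumes the double-precision build, and the function name is illustrative only, not part of the patch:

    /* Reference semantics of the zrot kernel above (illustrative sketch). */
    static void zrot_ref(long n, double *x, double *y, double c, double s)
    {
        for (long i = 0; i < 2 * n; i++) {   /* n complex elements = 2*n scalars */
            double xi = x[i], yi = y[i];
            x[i] = c * xi + s * yi;          /* ADD of the two MULs by C and S */
            y[i] = c * yi - s * xi;          /* SUB of the two MULs            */
        }
    }

The $L12 loop unrolls this by four complex elements and interleaves loads, multiplies and stores; unlike the .bak copy that follows, it routes each ADD/SUB result through the b9 ($f29) scratch register and then copies it into the destination with fmov.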
diff --git a/kernel/sw_64/zrot.S.bak b/kernel/sw_64/zrot.S.bak
|
|
new file mode 100644
|
|
index 0000000..83dd2b1
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/zrot.S.bak
|
|
@@ -0,0 +1,631 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+#define Y $19
|
|
+#define INCY $20
|
|
+#define I $21
|
|
+#define XX $23
|
|
+#define YY $24
|
|
+
|
|
+#define C $f10
|
|
+#define S $f11
|
|
+
|
|
+#define PREFETCH_SIZE 80
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+ .frame $sp, 0, $26, 0
|
|
+
|
|
+#ifndef PROFILE
|
|
+ .prologue 0
|
|
+#else
|
|
+ .prologue 1
|
|
+#endif
|
|
+
|
|
+ fmov $f21, C
|
|
+ LD S, 0($sp)
|
|
+
|
|
+ addl INCX, INCX, INCX
|
|
+ addl INCY, INCY, INCY
|
|
+
|
|
+ cmpeq INCX, 2, $23
|
|
+ cmpeq INCY, 2, $24
|
|
+ ble N, $L998
|
|
+
|
|
+ and $23, $24, $23
|
|
+ beq $23, $L50
|
|
+
|
|
+ sra N, 2, I
|
|
+ ble I, $L15
|
|
+
|
|
+ LD $f12, 0*SIZE(X)
|
|
+ LD $f13, 0*SIZE(Y)
|
|
+ LD $f14, 1*SIZE(X)
|
|
+ LD $f15, 1*SIZE(Y)
|
|
+
|
|
+ LD $f16, 2*SIZE(X)
|
|
+ LD $f17, 2*SIZE(Y)
|
|
+ LD $f18, 3*SIZE(X)
|
|
+ LD $f19, 3*SIZE(Y)
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ unop
|
|
+ MUL S, $f13, $f22
|
|
+ MUL C, $f13, $f23
|
|
+
|
|
+ LD $f13, 4*SIZE(Y)
|
|
+ MUL S, $f12, $f24
|
|
+ LD $f12, 4*SIZE(X)
|
|
+ MUL C, $f14, $f25
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ MUL S, $f15, $f26
|
|
+ ADD $f21, $f22, $f22
|
|
+ MUL C, $f15, $f27
|
|
+
|
|
+ LD $f15, 5*SIZE(Y)
|
|
+ MUL S, $f14, $f28
|
|
+ SUB $f23, $f24, $f24
|
|
+ ble I, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ MUL C, $f16, $f21
|
|
+ fillcs (PREFETCH_SIZE) * SIZE(X)
|
|
+ unop
|
|
+ LD $f14, 5*SIZE(X)
|
|
+
|
|
+ ST $f22, 0*SIZE(X)
|
|
+ MUL S, $f17, $f22
|
|
+ unop
|
|
+ ADD $f25, $f26, $f26
|
|
+
|
|
+ MUL C, $f17, $f23
|
|
+ fillcs (PREFETCH_SIZE) * SIZE(Y)
|
|
+ unop
|
|
+ LD $f17, 6*SIZE(Y)
|
|
+
|
|
+ ST $f24, 0*SIZE(Y)
|
|
+ MUL S, $f16, $f24
|
|
+ unop
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ MUL C, $f18, $f25
|
|
+ LD $f16, 6*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f26, 1*SIZE(X)
|
|
+ MUL S, $f19, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, $f22
|
|
+
|
|
+ MUL C, $f19, $f27
|
|
+ unop
|
|
+ unop
|
|
+ LD $f19, 7*SIZE(Y)
|
|
+
|
|
+ ST $f28, 1*SIZE(Y)
|
|
+ MUL S, $f18, $f28
|
|
+ unop
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ LD $f18, 7*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f22, 2*SIZE(X)
|
|
+ unop
|
|
+ MUL S, $f13, $f22
|
|
+ ADD $f25, $f26, $f26
|
|
+
|
|
+ MUL C, $f13, $f23
|
|
+ LD $f13, 8*SIZE(Y)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f24, 2*SIZE(Y)
|
|
+ MUL S, $f12, $f24
|
|
+ unop
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ LD $f12, 8*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f26, 3*SIZE(X)
|
|
+ MUL S, $f15, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, $f22
|
|
+
|
|
+ MUL C, $f15, $f27
|
|
+ LD $f15, 9*SIZE(Y)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f28, 3*SIZE(Y)
|
|
+ MUL S, $f14, $f28
|
|
+ unop
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ MUL C, $f16, $f21
|
|
+ LD $f14, 9*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f22, 4*SIZE(X)
|
|
+ MUL S, $f17, $f22
|
|
+ unop
|
|
+ ADD $f25, $f26, $f26
|
|
+
|
|
+ MUL C, $f17, $f23
|
|
+ LD $f17, 10*SIZE(Y)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f24, 4*SIZE(Y)
|
|
+ MUL S, $f16, $f24
|
|
+ unop
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ MUL C, $f18, $f25
|
|
+ LD $f16, 10*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f26, 5*SIZE(X)
|
|
+ MUL S, $f19, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, $f22
|
|
+
|
|
+ MUL C, $f19, $f27
|
|
+ LD $f19, 11*SIZE(Y)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f28, 5*SIZE(Y)
|
|
+ MUL S, $f18, $f28
|
|
+ ldi I, -1(I)
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ LD $f18, 11*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f22, 6*SIZE(X)
|
|
+ MUL S, $f13, $f22
|
|
+ unop
|
|
+ ADD $f25, $f26, $f26
|
|
+
|
|
+ MUL C, $f13, $f23
|
|
+ LD $f13, 12*SIZE(Y)
|
|
+ ldi X, 8*SIZE(X)
|
|
+ unop
|
|
+
|
|
+ ST $f24, 6*SIZE(Y)
|
|
+ MUL S, $f12, $f24
|
|
+ unop
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ LD $f12, 4*SIZE(X)
|
|
+ ldi Y, 8*SIZE(Y)
|
|
+ unop
|
|
+
|
|
+ ST $f26, -1*SIZE(X)
|
|
+ MUL S, $f15, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, $f22
|
|
+
|
|
+ MUL C, $f15, $f27
|
|
+ LD $f15, 5*SIZE(Y)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f28, -1*SIZE(Y)
|
|
+ MUL S, $f14, $f28
|
|
+ SUB $f23, $f24, $f24
|
|
+ bgt I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ MUL C, $f16, $f21
|
|
+ LD $f14, 5*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f22, 0*SIZE(X)
|
|
+ MUL S, $f17, $f22
|
|
+ unop
|
|
+ ADD $f25, $f26, $f26
|
|
+
|
|
+ MUL C, $f17, $f23
|
|
+ unop
|
|
+ unop
|
|
+ LD $f17, 6*SIZE(Y)
|
|
+
|
|
+ ST $f24, 0*SIZE(Y)
|
|
+ MUL S, $f16, $f24
|
|
+ LD $f16, 6*SIZE(X)
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ MUL C, $f18, $f25
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f26, 1*SIZE(X)
|
|
+ MUL S, $f19, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, $f22
|
|
+
|
|
+ MUL C, $f19, $f27
|
|
+ unop
|
|
+ unop
|
|
+ LD $f19, 7*SIZE(Y)
|
|
+
|
|
+ ST $f28, 1*SIZE(Y)
|
|
+ MUL S, $f18, $f28
|
|
+ LD $f18, 7*SIZE(X)
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f22, 2*SIZE(X)
|
|
+ unop
|
|
+ MUL S, $f13, $f22
|
|
+ ADD $f25, $f26, $f26
|
|
+
|
|
+ MUL C, $f13, $f23
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f24, 2*SIZE(Y)
|
|
+ MUL S, $f12, $f24
|
|
+ unop
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f26, 3*SIZE(X)
|
|
+ MUL S, $f15, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, $f22
|
|
+
|
|
+ MUL C, $f15, $f27
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f28, 3*SIZE(Y)
|
|
+ MUL S, $f14, $f28
|
|
+ unop
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ MUL C, $f16, $f21
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f22, 4*SIZE(X)
|
|
+ MUL S, $f17, $f22
|
|
+ unop
|
|
+ ADD $f25, $f26, $f26
|
|
+
|
|
+ MUL C, $f17, $f23
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f24, 4*SIZE(Y)
|
|
+ MUL S, $f16, $f24
|
|
+ unop
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ MUL C, $f18, $f25
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f26, 5*SIZE(X)
|
|
+ MUL S, $f19, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, $f22
|
|
+
|
|
+ MUL C, $f19, $f27
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f28, 5*SIZE(Y)
|
|
+ MUL S, $f18, $f28
|
|
+ unop
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ ST $f22, 6*SIZE(X)
|
|
+ ADD $f25, $f26, $f26
|
|
+ ST $f24, 6*SIZE(Y)
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ ST $f26, 7*SIZE(X)
|
|
+ ldi X, 8*SIZE(X)
|
|
+ ST $f28, 7*SIZE(Y)
|
|
+ ldi Y, 8*SIZE(Y)
|
|
+ .align 4
|
|
+
|
|
+
|
|
+$L15:
|
|
+ and N, 3, I
|
|
+ ble I, $L998
|
|
+ .align 4
|
|
+
|
|
+$L16:
|
|
+ LD $f12, 0*SIZE(X)
|
|
+ LD $f13, 0*SIZE(Y)
|
|
+ LD $f14, 1*SIZE(X)
|
|
+ LD $f15, 1*SIZE(Y)
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ MUL S, $f13, $f22
|
|
+ MUL C, $f13, $f23
|
|
+ MUL S, $f12, $f24
|
|
+
|
|
+ ADD $f21, $f22, $f22
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ MUL S, $f15, $f26
|
|
+ MUL C, $f15, $f27
|
|
+ MUL S, $f14, $f28
|
|
+
|
|
+ ADD $f25, $f26, $f26
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ ST $f22, 0*SIZE(X)
|
|
+ ST $f24, 0*SIZE(Y)
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ST $f26, 1*SIZE(X)
|
|
+ ldi X, 2 * SIZE(X)
|
|
+ ST $f28, 1*SIZE(Y)
|
|
+ ldi Y, 2 * SIZE(Y)
|
|
+
|
|
+ bgt I, $L16
|
|
+ .align 4
|
|
+
|
|
+$L998:
|
|
+ clr $0
|
|
+ ret
|
|
+ .align 4
|
|
+
|
|
+$L50:
|
|
+ mov X, XX
|
|
+ mov Y, YY
|
|
+
|
|
+ sra N, 2, I
|
|
+ ble I, $L55
|
|
+ .align 4
|
|
+
|
|
+$L51:
|
|
+ LD $f12, 0*SIZE(X)
|
|
+ LD $f13, 0*SIZE(Y)
|
|
+ LD $f14, 1*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f15, 1*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ MUL S, $f13, $f22
|
|
+ MUL C, $f13, $f23
|
|
+ MUL S, $f12, $f24
|
|
+
|
|
+ ADD $f21, $f22, $f22
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ MUL S, $f15, $f26
|
|
+ MUL C, $f15, $f27
|
|
+ MUL S, $f14, $f28
|
|
+
|
|
+ ADD $f25, $f26, $f26
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ ST $f22, 0*SIZE(XX)
|
|
+ ST $f24, 0*SIZE(YY)
|
|
+ ST $f26, 1*SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST $f28, 1*SIZE(YY)
|
|
+ SXADDQ INCY, YY, YY
|
|
+
|
|
+
|
|
+ LD $f12, 0*SIZE(X)
|
|
+ LD $f13, 0*SIZE(Y)
|
|
+ LD $f14, 1*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f15, 1*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ MUL S, $f13, $f22
|
|
+ MUL C, $f13, $f23
|
|
+ MUL S, $f12, $f24
|
|
+
|
|
+ ADD $f21, $f22, $f22
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ MUL S, $f15, $f26
|
|
+ MUL C, $f15, $f27
|
|
+ MUL S, $f14, $f28
|
|
+
|
|
+ ADD $f25, $f26, $f26
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ ST $f22, 0*SIZE(XX)
|
|
+ ST $f24, 0*SIZE(YY)
|
|
+ ST $f26, 1*SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST $f28, 1*SIZE(YY)
|
|
+ SXADDQ INCY, YY, YY
|
|
+
|
|
+
|
|
+ LD $f12, 0*SIZE(X)
|
|
+ LD $f13, 0*SIZE(Y)
|
|
+ LD $f14, 1*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f15, 1*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ MUL S, $f13, $f22
|
|
+ MUL C, $f13, $f23
|
|
+ MUL S, $f12, $f24
|
|
+
|
|
+ ADD $f21, $f22, $f22
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ MUL S, $f15, $f26
|
|
+ MUL C, $f15, $f27
|
|
+ MUL S, $f14, $f28
|
|
+
|
|
+ ADD $f25, $f26, $f26
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ ST $f22, 0*SIZE(XX)
|
|
+ ST $f24, 0*SIZE(YY)
|
|
+ ST $f26, 1*SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST $f28, 1*SIZE(YY)
|
|
+ SXADDQ INCY, YY, YY
|
|
+
|
|
+
|
|
+ LD $f12, 0*SIZE(X)
|
|
+ LD $f13, 0*SIZE(Y)
|
|
+ LD $f14, 1*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f15, 1*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ MUL S, $f13, $f22
|
|
+ MUL C, $f13, $f23
|
|
+ MUL S, $f12, $f24
|
|
+
|
|
+ ADD $f21, $f22, $f22
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ MUL S, $f15, $f26
|
|
+ MUL C, $f15, $f27
|
|
+ MUL S, $f14, $f28
|
|
+
|
|
+ ADD $f25, $f26, $f26
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ ST $f22, 0*SIZE(XX)
|
|
+ ST $f24, 0*SIZE(YY)
|
|
+ ST $f26, 1*SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST $f28, 1*SIZE(YY)
|
|
+ SXADDQ INCY, YY, YY
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L51
|
|
+ .align 4
|
|
+
|
|
+$L55:
|
|
+ and N, 3, I
|
|
+ ble I, $L999
|
|
+ .align 4
|
|
+
|
|
+$L56:
|
|
+ LD $f12, 0*SIZE(X)
|
|
+ LD $f13, 0*SIZE(Y)
|
|
+ LD $f14, 1*SIZE(X)
|
|
+ LD $f15, 1*SIZE(Y)
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ MUL S, $f13, $f22
|
|
+ MUL C, $f13, $f23
|
|
+ MUL S, $f12, $f24
|
|
+
|
|
+ ADD $f21, $f22, $f22
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ MUL S, $f15, $f26
|
|
+ MUL C, $f15, $f27
|
|
+ MUL S, $f14, $f28
|
|
+
|
|
+ ADD $f25, $f26, $f26
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ ST $f22, 0*SIZE(X)
|
|
+ ST $f24, 0*SIZE(Y)
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ST $f26, 1*SIZE(X)
|
|
+ ST $f28, 1*SIZE(Y)
|
|
+ SXADDQ INCX, X, X
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ bgt I, $L56
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ clr $0
|
|
+ ret
|
|
+ EPILOGUE
|
|
diff --git a/kernel/sw_64/zrot_simd.S b/kernel/sw_64/zrot_simd.S
|
|
new file mode 100644
|
|
index 0000000..9e00ebf
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/zrot_simd.S
|
|
@@ -0,0 +1,799 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+#define Y $19
|
|
+#define INCY $20
|
|
+#define I $21
|
|
+#define XX $23
|
|
+#define YY $24
|
|
+
|
|
+#define C $f10
|
|
+#define S $f11
|
|
+
|
|
+#define x0 $f12
|
|
+#define x1 $f14
|
|
+#define x2 $f16
|
|
+#define x3 $f18
|
|
+
|
|
+#define y0 $f13
|
|
+#define y1 $f15
|
|
+#define y2 $f17
|
|
+#define y3 $f19
|
|
+
|
|
+#define t0 $f20
|
|
+#define t1 $f21
|
|
+#define t2 $f22
|
|
+#define t3 $f23
|
|
+#define t4 $f24
|
|
+#define t5 $f25
|
|
+#define t6 $f26
|
|
+#define t7 $f27
|
|
+
|
|
+#define PREFETCHSIZE 80
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+ .frame $sp, 0, $26, 0
|
|
+
|
|
+#ifndef PROFILE
|
|
+ .prologue 0
|
|
+#else
|
|
+ .prologue 1
|
|
+#endif
|
|
+
|
|
+ fmov $f21, C
|
|
+ LD S, 0($sp)
|
|
+
|
|
+ addl INCX, INCX, INCX
|
|
+ addl INCY, INCY, INCY
|
|
+
|
|
+ cmpeq INCX, 2, $23
|
|
+ cmpeq INCY, 2, $24
|
|
+ ble N, $L998
|
|
+
|
|
+ and $23, $24, $23
|
|
+ beq $23, $L50
|
|
+
|
|
+/* check the alignment of X and Y */
|
|
+ and X, (VEC_LEN*SIZE-1), $3
|
|
+ and Y, (VEC_LEN*SIZE-1), $4
|
|
+ or $3, $4, $4
|
|
+ bne $4, $UnAlign_ACCESS
|
|
+
|
|
+/* aligned access path */
|
|
+ sra N, 3, I
|
|
+ ble I, $Remain
|
|
+
|
|
+ vcpyf C, C
|
|
+ vcpyf S, S
|
|
+
|
|
+ VLD x0, 0*VEC_LEN*SIZE(X)
|
|
+ VLD x1, 1*VEC_LEN*SIZE(X)
|
|
+ VLD x2, 2*VEC_LEN*SIZE(X)
|
|
+ VLD x3, 3*VEC_LEN*SIZE(X)
|
|
+
|
|
+ VLD y0, 0*VEC_LEN*SIZE(Y)
|
|
+ VLD y1, 1*VEC_LEN*SIZE(Y)
|
|
+ VLD y2, 2*VEC_LEN*SIZE(Y)
|
|
+ VLD y3, 3*VEC_LEN*SIZE(Y)
|
|
+
|
|
+ addl X, 16 * SIZE, X
|
|
+ addl Y, 16 * SIZE, Y
|
|
+ subl I, 1, I
|
|
+ ble I, $MainLoopEnd
|
|
+ .align 4
|
|
+
|
|
+$MainLoop:
|
|
+ VMUL C, x0, t0
|
|
+ fillcs (PREFETCHSIZE) * SIZE(X)
|
|
+ VMUL C, x1, t1
|
|
+ fillcs (PREFETCHSIZE) * SIZE(Y)
|
|
+
|
|
+ VMUL C, x2, t2
|
|
+ subl I, 1, I
|
|
+ VMUL C, x3, t3
|
|
+ nop
|
|
+
|
|
+ VMUL S, x0, t4
|
|
+ VLD x0, 0*VEC_LEN*SIZE(X)
|
|
+ VMUL S, x1, t5
|
|
+ VLD x1, 1*VEC_LEN*SIZE(X)
|
|
+
|
|
+ VMUL S, x2, t6
|
|
+ VLD x2, 2*VEC_LEN*SIZE(X)
|
|
+ VMUL S, x3, t7
|
|
+ VLD x3, 3*VEC_LEN*SIZE(X)
|
|
+
|
|
+ VMAD S, y0, t0, t0
|
|
+ VMAD S, y1, t1, t1
|
|
+ VMAD S, y2, t2, t2
|
|
+ VMAD S, y3, t3, t3
|
|
+
|
|
+ VMSUB C, y0, t4, t4
|
|
+ VLD y0, 0*VEC_LEN*SIZE(Y)
|
|
+ VMSUB C, y1, t5, t5
|
|
+ VLD y1, 1*VEC_LEN*SIZE(Y)
|
|
+
|
|
+ VMSUB C, y2, t6, t6
|
|
+ VLD y2, 2*VEC_LEN*SIZE(Y)
|
|
+ VMSUB C, y3, t7, t7
|
|
+ VLD y3, 3*VEC_LEN*SIZE(Y)
|
|
+
|
|
+ VST t0, -4*VEC_LEN*SIZE(X)
|
|
+ VST t1, -3*VEC_LEN*SIZE(X)
|
|
+ VST t2, -2*VEC_LEN*SIZE(X)
|
|
+ VST t3, -1*VEC_LEN*SIZE(X)
|
|
+
|
|
+ VST t4, -4*VEC_LEN*SIZE(Y)
|
|
+ VST t5, -3*VEC_LEN*SIZE(Y)
|
|
+ VST t6, -2*VEC_LEN*SIZE(Y)
|
|
+ VST t7, -1*VEC_LEN*SIZE(Y)
|
|
+
|
|
+ addl X, 16 * SIZE, X
|
|
+ addl Y, 16 * SIZE, Y
|
|
+ nop
|
|
+ bgt I, $MainLoop
|
|
+ .align 4
|
|
+$MainLoopEnd:
|
|
+ VMUL C, x0, t0
|
|
+ VMUL C, x1, t1
|
|
+ VMUL C, x2, t2
|
|
+ VMUL C, x3, t3
|
|
+
|
|
+ VMUL S, x0, t4
|
|
+ VMUL S, x1, t5
|
|
+ VMUL S, x2, t6
|
|
+ VMUL S, x3, t7
|
|
+
|
|
+ VMAD S, y0, t0, t0
|
|
+ VMAD S, y1, t1, t1
|
|
+ VMAD S, y2, t2, t2
|
|
+ VMAD S, y3, t3, t3
|
|
+
|
|
+ VMSUB C, y0, t4, t4
|
|
+ VMSUB C, y1, t5, t5
|
|
+ VMSUB C, y2, t6, t6
|
|
+ VMSUB C, y3, t7, t7
|
|
+
|
|
+ VST t0, -4*VEC_LEN*SIZE(X)
|
|
+ VST t1, -3*VEC_LEN*SIZE(X)
|
|
+ VST t2, -2*VEC_LEN*SIZE(X)
|
|
+ VST t3, -1*VEC_LEN*SIZE(X)
|
|
+
|
|
+ VST t4, -4*VEC_LEN*SIZE(Y)
|
|
+ VST t5, -3*VEC_LEN*SIZE(Y)
|
|
+ VST t6, -2*VEC_LEN*SIZE(Y)
|
|
+ VST t7, -1*VEC_LEN*SIZE(Y)
|
|
+
|
|
+ .align 4
|
|
+$Remain:
|
|
+ and N, 7, I
|
|
+ ble I, $End
|
|
+$RemainLoop:
|
|
+ LD x0, 0*SIZE(X)
|
|
+ LD y0, 0*SIZE(Y)
|
|
+ LD x1, 1*SIZE(X)
|
|
+ LD y1, 1*SIZE(Y)
|
|
+
|
|
+ MUL C, x0, t0
|
|
+ MUL S, x0, t4
|
|
+ MAD S, y0, t0, t0
|
|
+ MSUB C, y0, t4, t4
|
|
+
|
|
+ MUL C, x1, t1
|
|
+ ldi I, -1(I)
|
|
+ MUL S, x1, t5
|
|
+ ldi X, 2 * SIZE(X)
|
|
+
|
|
+ MAD S, y1, t1, t1
|
|
+ ldi Y, 2 * SIZE(Y)
|
|
+ MSUB C, y1, t5, t5
|
|
+ nop
|
|
+
|
|
+ ST t0, -2*SIZE(X)
|
|
+ ST t1, -1*SIZE(X)
|
|
+ ST t4, -2*SIZE(Y)
|
|
+ ST t5, -1*SIZE(Y)
|
|
+
|
|
+ bgt I, $RemainLoop
|
|
+ .align 4
|
|
+$End:
|
|
+ clr $0
|
|
+ ret
|
|
+ .align 4
|
|
+
|
|
+$UnAlign_ACCESS:
|
|
+ sra N, 2, I
|
|
+ ble I, $L15
|
|
+
|
|
+ LD $f12, 0*SIZE(X)
|
|
+ LD $f13, 0*SIZE(Y)
|
|
+ LD $f14, 1*SIZE(X)
|
|
+ LD $f15, 1*SIZE(Y)
|
|
+
|
|
+ LD $f16, 2*SIZE(X)
|
|
+ LD $f17, 2*SIZE(Y)
|
|
+ LD $f18, 3*SIZE(X)
|
|
+ LD $f19, 3*SIZE(Y)
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ unop
|
|
+ MUL S, $f13, $f22
|
|
+ MUL C, $f13, $f23
|
|
+
|
|
+ LD $f13, 4*SIZE(Y)
|
|
+ MUL S, $f12, $f24
|
|
+ LD $f12, 4*SIZE(X)
|
|
+ MUL C, $f14, $f25
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ MUL S, $f15, $f26
|
|
+ ADD $f21, $f22, $f22
|
|
+ MUL C, $f15, $f27
|
|
+
|
|
+ LD $f15, 5*SIZE(Y)
|
|
+ MUL S, $f14, $f28
|
|
+ SUB $f23, $f24, $f24
|
|
+ ble I, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ MUL C, $f16, $f21
|
|
+ fillcs (PREFETCHSIZE) * SIZE(X)
|
|
+ unop
|
|
+ LD $f14, 5*SIZE(X)
|
|
+
|
|
+ ST $f22, 0*SIZE(X)
|
|
+ MUL S, $f17, $f22
|
|
+ unop
|
|
+ ADD $f25, $f26, $f26
|
|
+
|
|
+ MUL C, $f17, $f23
|
|
+ fillcs (PREFETCHSIZE) * SIZE(Y)
|
|
+ unop
|
|
+ LD $f17, 6*SIZE(Y)
|
|
+
|
|
+ ST $f24, 0*SIZE(Y)
|
|
+ MUL S, $f16, $f24
|
|
+ unop
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ MUL C, $f18, $f25
|
|
+ LD $f16, 6*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f26, 1*SIZE(X)
|
|
+ MUL S, $f19, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, $f22
|
|
+
|
|
+ MUL C, $f19, $f27
|
|
+ unop
|
|
+ unop
|
|
+ LD $f19, 7*SIZE(Y)
|
|
+
|
|
+ ST $f28, 1*SIZE(Y)
|
|
+ MUL S, $f18, $f28
|
|
+ unop
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ LD $f18, 7*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f22, 2*SIZE(X)
|
|
+ unop
|
|
+ MUL S, $f13, $f22
|
|
+ ADD $f25, $f26, $f26
|
|
+
|
|
+ MUL C, $f13, $f23
|
|
+ LD $f13, 8*SIZE(Y)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f24, 2*SIZE(Y)
|
|
+ MUL S, $f12, $f24
|
|
+ unop
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ LD $f12, 8*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f26, 3*SIZE(X)
|
|
+ MUL S, $f15, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, $f22
|
|
+
|
|
+ MUL C, $f15, $f27
|
|
+ LD $f15, 9*SIZE(Y)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f28, 3*SIZE(Y)
|
|
+ MUL S, $f14, $f28
|
|
+ unop
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ MUL C, $f16, $f21
|
|
+ LD $f14, 9*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f22, 4*SIZE(X)
|
|
+ MUL S, $f17, $f22
|
|
+ unop
|
|
+ ADD $f25, $f26, $f26
|
|
+
|
|
+ MUL C, $f17, $f23
|
|
+ LD $f17, 10*SIZE(Y)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f24, 4*SIZE(Y)
|
|
+ MUL S, $f16, $f24
|
|
+ unop
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ MUL C, $f18, $f25
|
|
+ LD $f16, 10*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f26, 5*SIZE(X)
|
|
+ MUL S, $f19, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, $f22
|
|
+
|
|
+ MUL C, $f19, $f27
|
|
+ LD $f19, 11*SIZE(Y)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f28, 5*SIZE(Y)
|
|
+ MUL S, $f18, $f28
|
|
+ ldi I, -1(I)
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ LD $f18, 11*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f22, 6*SIZE(X)
|
|
+ MUL S, $f13, $f22
|
|
+ unop
|
|
+ ADD $f25, $f26, $f26
|
|
+
|
|
+ MUL C, $f13, $f23
|
|
+ LD $f13, 12*SIZE(Y)
|
|
+ ldi X, 8*SIZE(X)
|
|
+ unop
|
|
+
|
|
+ ST $f24, 6*SIZE(Y)
|
|
+ MUL S, $f12, $f24
|
|
+ unop
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ LD $f12, 4*SIZE(X)
|
|
+ ldi Y, 8*SIZE(Y)
|
|
+ unop
|
|
+
|
|
+ ST $f26, -1*SIZE(X)
|
|
+ MUL S, $f15, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, $f22
|
|
+
|
|
+ MUL C, $f15, $f27
|
|
+ LD $f15, 5*SIZE(Y)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f28, -1*SIZE(Y)
|
|
+ MUL S, $f14, $f28
|
|
+ SUB $f23, $f24, $f24
|
|
+ bgt I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ MUL C, $f16, $f21
|
|
+ LD $f14, 5*SIZE(X)
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f22, 0*SIZE(X)
|
|
+ MUL S, $f17, $f22
|
|
+ unop
|
|
+ ADD $f25, $f26, $f26
|
|
+
|
|
+ MUL C, $f17, $f23
|
|
+ unop
|
|
+ unop
|
|
+ LD $f17, 6*SIZE(Y)
|
|
+
|
|
+ ST $f24, 0*SIZE(Y)
|
|
+ MUL S, $f16, $f24
|
|
+ LD $f16, 6*SIZE(X)
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ MUL C, $f18, $f25
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f26, 1*SIZE(X)
|
|
+ MUL S, $f19, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, $f22
|
|
+
|
|
+ MUL C, $f19, $f27
|
|
+ unop
|
|
+ unop
|
|
+ LD $f19, 7*SIZE(Y)
|
|
+
|
|
+ ST $f28, 1*SIZE(Y)
|
|
+ MUL S, $f18, $f28
|
|
+ LD $f18, 7*SIZE(X)
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f22, 2*SIZE(X)
|
|
+ unop
|
|
+ MUL S, $f13, $f22
|
|
+ ADD $f25, $f26, $f26
|
|
+
|
|
+ MUL C, $f13, $f23
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f24, 2*SIZE(Y)
|
|
+ MUL S, $f12, $f24
|
|
+ unop
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f26, 3*SIZE(X)
|
|
+ MUL S, $f15, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, $f22
|
|
+
|
|
+ MUL C, $f15, $f27
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f28, 3*SIZE(Y)
|
|
+ MUL S, $f14, $f28
|
|
+ unop
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ MUL C, $f16, $f21
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f22, 4*SIZE(X)
|
|
+ MUL S, $f17, $f22
|
|
+ unop
|
|
+ ADD $f25, $f26, $f26
|
|
+
|
|
+ MUL C, $f17, $f23
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f24, 4*SIZE(Y)
|
|
+ MUL S, $f16, $f24
|
|
+ unop
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ MUL C, $f18, $f25
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f26, 5*SIZE(X)
|
|
+ MUL S, $f19, $f26
|
|
+ unop
|
|
+ ADD $f21, $f22, $f22
|
|
+
|
|
+ MUL C, $f19, $f27
|
|
+ unop
|
|
+ unop
|
|
+ unop
|
|
+
|
|
+ ST $f28, 5*SIZE(Y)
|
|
+ MUL S, $f18, $f28
|
|
+ unop
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ ST $f22, 6*SIZE(X)
|
|
+ ADD $f25, $f26, $f26
|
|
+ ST $f24, 6*SIZE(Y)
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ ST $f26, 7*SIZE(X)
|
|
+ ldi X, 8*SIZE(X)
|
|
+ ST $f28, 7*SIZE(Y)
|
|
+ ldi Y, 8*SIZE(Y)
|
|
+ .align 4
|
|
+
|
|
+
|
|
+$L15:
|
|
+ and N, 3, I
|
|
+ ble I, $L998
|
|
+ .align 4
|
|
+
|
|
+$L16:
|
|
+ LD $f12, 0*SIZE(X)
|
|
+ LD $f13, 0*SIZE(Y)
|
|
+ LD $f14, 1*SIZE(X)
|
|
+ LD $f15, 1*SIZE(Y)
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ MUL S, $f13, $f22
|
|
+ MUL C, $f13, $f23
|
|
+ MUL S, $f12, $f24
|
|
+
|
|
+ ADD $f21, $f22, $f22
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ MUL S, $f15, $f26
|
|
+ MUL C, $f15, $f27
|
|
+ MUL S, $f14, $f28
|
|
+
|
|
+ ADD $f25, $f26, $f26
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ ST $f22, 0*SIZE(X)
|
|
+ ST $f24, 0*SIZE(Y)
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ST $f26, 1*SIZE(X)
|
|
+ ldi X, 2 * SIZE(X)
|
|
+ ST $f28, 1*SIZE(Y)
|
|
+ ldi Y, 2 * SIZE(Y)
|
|
+
|
|
+ bgt I, $L16
|
|
+ .align 4
|
|
+
|
|
+$L998:
|
|
+ clr $0
|
|
+ ret
|
|
+ .align 4
|
|
+
|
|
+$L50:
|
|
+ mov X, XX
|
|
+ mov Y, YY
|
|
+
|
|
+ sra N, 2, I
|
|
+ ble I, $L55
|
|
+ .align 4
|
|
+
|
|
+$L51:
|
|
+ LD $f12, 0*SIZE(X)
|
|
+ LD $f13, 0*SIZE(Y)
|
|
+ LD $f14, 1*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f15, 1*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ MUL S, $f13, $f22
|
|
+ MUL C, $f13, $f23
|
|
+ MUL S, $f12, $f24
|
|
+
|
|
+ ADD $f21, $f22, $f22
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ MUL S, $f15, $f26
|
|
+ MUL C, $f15, $f27
|
|
+ MUL S, $f14, $f28
|
|
+
|
|
+ ADD $f25, $f26, $f26
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ ST $f22, 0*SIZE(XX)
|
|
+ ST $f24, 0*SIZE(YY)
|
|
+ ST $f26, 1*SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST $f28, 1*SIZE(YY)
|
|
+ SXADDQ INCY, YY, YY
|
|
+
|
|
+
|
|
+ LD $f12, 0*SIZE(X)
|
|
+ LD $f13, 0*SIZE(Y)
|
|
+ LD $f14, 1*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f15, 1*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ MUL S, $f13, $f22
|
|
+ MUL C, $f13, $f23
|
|
+ MUL S, $f12, $f24
|
|
+
|
|
+ ADD $f21, $f22, $f22
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ MUL S, $f15, $f26
|
|
+ MUL C, $f15, $f27
|
|
+ MUL S, $f14, $f28
|
|
+
|
|
+ ADD $f25, $f26, $f26
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ ST $f22, 0*SIZE(XX)
|
|
+ ST $f24, 0*SIZE(YY)
|
|
+ ST $f26, 1*SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST $f28, 1*SIZE(YY)
|
|
+ SXADDQ INCY, YY, YY
|
|
+
|
|
+
|
|
+ LD $f12, 0*SIZE(X)
|
|
+ LD $f13, 0*SIZE(Y)
|
|
+ LD $f14, 1*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f15, 1*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ MUL S, $f13, $f22
|
|
+ MUL C, $f13, $f23
|
|
+ MUL S, $f12, $f24
|
|
+
|
|
+ ADD $f21, $f22, $f22
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ MUL S, $f15, $f26
|
|
+ MUL C, $f15, $f27
|
|
+ MUL S, $f14, $f28
|
|
+
|
|
+ ADD $f25, $f26, $f26
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ ST $f22, 0*SIZE(XX)
|
|
+ ST $f24, 0*SIZE(YY)
|
|
+ ST $f26, 1*SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST $f28, 1*SIZE(YY)
|
|
+ SXADDQ INCY, YY, YY
|
|
+
|
|
+
|
|
+ LD $f12, 0*SIZE(X)
|
|
+ LD $f13, 0*SIZE(Y)
|
|
+ LD $f14, 1*SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD $f15, 1*SIZE(Y)
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ MUL S, $f13, $f22
|
|
+ MUL C, $f13, $f23
|
|
+ MUL S, $f12, $f24
|
|
+
|
|
+ ADD $f21, $f22, $f22
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ MUL S, $f15, $f26
|
|
+ MUL C, $f15, $f27
|
|
+ MUL S, $f14, $f28
|
|
+
|
|
+ ADD $f25, $f26, $f26
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ ST $f22, 0*SIZE(XX)
|
|
+ ST $f24, 0*SIZE(YY)
|
|
+ ST $f26, 1*SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ST $f28, 1*SIZE(YY)
|
|
+ SXADDQ INCY, YY, YY
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L51
|
|
+ .align 4
|
|
+
|
|
+$L55:
|
|
+ and N, 3, I
|
|
+ ble I, $L999
|
|
+ .align 4
|
|
+
|
|
+$L56:
|
|
+ LD $f12, 0*SIZE(X)
|
|
+ LD $f13, 0*SIZE(Y)
|
|
+ LD $f14, 1*SIZE(X)
|
|
+ LD $f15, 1*SIZE(Y)
|
|
+
|
|
+ MUL C, $f12, $f21
|
|
+ MUL S, $f13, $f22
|
|
+ MUL C, $f13, $f23
|
|
+ MUL S, $f12, $f24
|
|
+
|
|
+ ADD $f21, $f22, $f22
|
|
+ SUB $f23, $f24, $f24
|
|
+
|
|
+ MUL C, $f14, $f25
|
|
+ MUL S, $f15, $f26
|
|
+ MUL C, $f15, $f27
|
|
+ MUL S, $f14, $f28
|
|
+
|
|
+ ADD $f25, $f26, $f26
|
|
+ SUB $f27, $f28, $f28
|
|
+
|
|
+ ST $f22, 0*SIZE(X)
|
|
+ ST $f24, 0*SIZE(Y)
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ST $f26, 1*SIZE(X)
|
|
+ ST $f28, 1*SIZE(Y)
|
|
+ SXADDQ INCX, X, X
|
|
+ SXADDQ INCY, Y, Y
|
|
+
|
|
+ bgt I, $L56
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ clr $0
|
|
+ ret
|
|
+ EPILOGUE
|
|
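kernel/sw_64/zrot_simd.S is the vector variant of the same rotation: when X and Y are contiguous and VEC_LEN*SIZE-aligned, vcpyf broadcasts c and s across the vector registers and each $MainLoop iteration processes four VEC_LEN-wide vectors of X and Y (16 scalars, i.e. 8 complex elements, matching the sra N, 3, I trip count); misaligned or strided inputs fall back to the scalar code at $UnAlign_ACCESS and $L50. Per lane, and assuming the fused forms implied by the scalar path (VMAD a,b,t -> a*b + t, VMSUB a,b,t -> a*b - t), one iteration computes the following, shown as an illustrative C sketch:

    /* One lane of the SIMD rotation loop, illustrative only. */
    static inline void zrot_lane(double *x, double *y, double c, double s)
    {
        double tx = c * *x;      /* VMUL C, x, t     */
        double ty = s * *x;      /* VMUL S, x, t     */
        tx = s * *y + tx;        /* VMAD S, y, t, t  -> new x = c*x + s*y */
        ty = c * *y - ty;        /* VMSUB C, y, t, t -> new y = c*y - s*x */
        *x = tx;                 /* VST back to X    */
        *y = ty;                 /* VST back to Y    */
    }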
diff --git a/kernel/sw_64/zscal.S b/kernel/sw_64/zscal.S
|
|
new file mode 100644
|
|
index 0000000..9589624
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/zscal.S
|
|
@@ -0,0 +1,255 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCHSIZE 88
|
|
+
|
|
+#define N $16
|
|
+#define X $21
|
|
+#define INCX $17
|
|
+
|
|
+#define XX $18
|
|
+#define I $19
|
|
+
|
|
+#define ALPHA_R $f19
|
|
+#define ALPHA_I $f20
|
|
+
|
|
+#define s0 $f0
|
|
+#define s1 $f1
|
|
+#define s2 $f10
|
|
+#define s3 $f11
|
|
+
|
|
+#define a0 $f12
|
|
+#define a1 $f13
|
|
+#define a2 $f14
|
|
+#define a3 $f15
|
|
+#define a4 $f16
|
|
+#define a5 $f17
|
|
+#define a6 $f18
|
|
+#define a7 $f21
|
|
+
|
|
+#define t0 $f22
|
|
+#define t1 $f23
|
|
+#define t2 $f24
|
|
+#define t3 $f25
|
|
+
|
|
+#define t4 $f26
|
|
+#define t5 $f27
|
|
+#define t6 $f28
|
|
+#define t7 $f29
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+
|
|
+ ldl INCX, 0($sp)
|
|
+ mov X, XX
|
|
+ ble N, $L999
|
|
+
|
|
+ addl INCX, INCX, INCX
|
|
+
|
|
+ sra N, 2, I
|
|
+ ble I, $L15
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ LD a1, 1 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ LD a3, 1 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ LD a5, 1 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ LD a7, 1 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ MUL a0, ALPHA_R, t0
|
|
+ MUL a1, ALPHA_I, t1
|
|
+ MUL a0, ALPHA_I, t2
|
|
+ MUL a1, ALPHA_R, t3
|
|
+
|
|
+ SUB t0, t1, t4
|
|
+ ADD t2, t3, t5
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ ST t4, 0 * SIZE(XX)
|
|
+ MUL a2, ALPHA_R, t0
|
|
+ ST t5, 1 * SIZE(XX)
|
|
+ MUL a3, ALPHA_I, t1
|
|
+
|
|
+ MUL a2, ALPHA_I, t2
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ MUL a3, ALPHA_R, t3
|
|
+ LD a1, 1 * SIZE(X)
|
|
+
|
|
+ SUB t0, t1, t6
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ADD t2, t3, t7
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ MUL a4, ALPHA_R, t0
|
|
+ ST t6, 0 * SIZE(XX)
|
|
+ MUL a5, ALPHA_I, t1
|
|
+ ST t7, 1 * SIZE(XX)
|
|
+
|
|
+ MUL a4, ALPHA_I, t2
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ MUL a5, ALPHA_R, t3
|
|
+ LD a3, 1 * SIZE(X)
|
|
+
|
|
+ SUB t0, t1, t4
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ADD t2, t3, t5
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ MUL a6, ALPHA_R, t0
|
|
+ ST t4, 0 * SIZE(XX)
|
|
+ MUL a7, ALPHA_I, t1
|
|
+ ST t5, 1 * SIZE(XX)
|
|
+
|
|
+ MUL a6, ALPHA_I, t2
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ MUL a7, ALPHA_R, t3
|
|
+ LD a5, 1 * SIZE(X)
|
|
+
|
|
+ SUB t0, t1, t6
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ADD t2, t3, t7
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ MUL a0, ALPHA_R, t0
|
|
+ ST t6, 0 * SIZE(XX)
|
|
+ MUL a1, ALPHA_I, t1
|
|
+ ST t7, 1 * SIZE(XX)
|
|
+
|
|
+ MUL a0, ALPHA_I, t2
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ MUL a1, ALPHA_R, t3
|
|
+ LD a7, 1 * SIZE(X)
|
|
+
|
|
+ SUB t0, t1, t4
|
|
+ ldi I, -1(I)
|
|
+ ADD t2, t3, t5
|
|
+ SXADDQ INCX, XX, XX
|
|
+
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ unop
|
|
+ SXADDQ INCX, X, X
|
|
+ bne I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ MUL a2, ALPHA_R, t0
|
|
+ MUL a3, ALPHA_I, t1
|
|
+ ST t4, 0 * SIZE(XX)
|
|
+ MUL a2, ALPHA_I, t2
|
|
+ ST t5, 1 * SIZE(XX)
|
|
+ MUL a3, ALPHA_R, t3
|
|
+
|
|
+ SUB t0, t1, t6
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ADD t2, t3, t7
|
|
+ unop
|
|
+
|
|
+ ST t6, 0 * SIZE(XX)
|
|
+ MUL a4, ALPHA_R, t0
|
|
+ ST t7, 1 * SIZE(XX)
|
|
+ MUL a5, ALPHA_I, t1
|
|
+ MUL a4, ALPHA_I, t2
|
|
+ MUL a5, ALPHA_R, t3
|
|
+
|
|
+ SUB t0, t1, t4
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ADD t2, t3, t5
|
|
+ unop
|
|
+
|
|
+ MUL a6, ALPHA_R, t0
|
|
+ ST t4, 0 * SIZE(XX)
|
|
+ MUL a7, ALPHA_I, t1
|
|
+ ST t5, 1 * SIZE(XX)
|
|
+
|
|
+ MUL a6, ALPHA_I, t2
|
|
+ MUL a7, ALPHA_R, t3
|
|
+
|
|
+ SUB t0, t1, t6
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ADD t2, t3, t7
|
|
+
|
|
+ ST t6, 0 * SIZE(XX)
|
|
+ ST t7, 1 * SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and N, 3, I
|
|
+ unop
|
|
+ unop
|
|
+ ble I, $L999
|
|
+ .align 4
|
|
+
|
|
+$L17:
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ LD a1, 1 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ MUL a0, ALPHA_R, t0
|
|
+ MUL a1, ALPHA_I, t1
|
|
+ MUL a0, ALPHA_I, t2
|
|
+ MUL a1, ALPHA_R, t3
|
|
+
|
|
+ SUB t0, t1, t4
|
|
+ ADD t2, t3, t5
|
|
+
|
|
+ ST t4, 0 * SIZE(XX)
|
|
+ ST t5, 1 * SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bne I, $L17
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ ret
|
|
+ EPILOGUE
|
|
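kernel/sw_64/zscal.S scales a complex vector by a complex alpha: for each element, re' = alpha_r*re - alpha_i*im and im' = alpha_i*re + alpha_r*im, which is what the four MULs followed by the SUB/ADD pairs above compute. A minimal C sketch of the strided loop, with names illustrative only:

    /* Reference semantics of the zscal kernel above (illustrative sketch). */
    static void zscal_ref(long n, double alpha_r, double alpha_i,
                          double *x, long incx)
    {
        for (long i = 0; i < n; i++, x += 2 * incx) {
            double re = x[0], im = x[1];
            x[0] = alpha_r * re - alpha_i * im;
            x[1] = alpha_i * re + alpha_r * im;
        }
    }

The kernel unrolls this by four complex elements in $L12, keeps fillcs prefetches ahead of the load stream, and handles the remaining 0 to 3 elements at $L17.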
diff --git a/kernel/sw_64/zscal.S.bak b/kernel/sw_64/zscal.S.bak
|
|
new file mode 100644
|
|
index 0000000..4525b56
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/zscal.S.bak
|
|
@@ -0,0 +1,443 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCHSIZE 88
|
|
+
|
|
+#define N $16
|
|
+#define X $21
|
|
+#define INCX $17
|
|
+
|
|
+#define XX $18
|
|
+#define I $19
|
|
+
|
|
+#define ALPHA_R $f19
|
|
+#define ALPHA_I $f20
|
|
+
|
|
+#define s0 $f0
|
|
+#define s1 $f1
|
|
+#define s2 $f10
|
|
+#define s3 $f11
|
|
+
|
|
+#define a0 $f12
|
|
+#define a1 $f13
|
|
+#define a2 $f14
|
|
+#define a3 $f15
|
|
+#define a4 $f16
|
|
+#define a5 $f17
|
|
+#define a6 $f18
|
|
+#define a7 $f21
|
|
+
|
|
+#define t0 $f22
|
|
+#define t1 $f23
|
|
+#define t2 $f24
|
|
+#define t3 $f25
|
|
+
|
|
+#define t4 $f26
|
|
+#define t5 $f27
|
|
+#define t6 $f28
|
|
+#define t7 $f29
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+ .frame $sp, 0, $26, 0
|
|
+
|
|
+ ldl INCX, 0($sp)
|
|
+ mov X, XX
|
|
+ cmpeq INCX, 1, $0
|
|
+ ble N, $L999
|
|
+
|
|
+ beq $0, $Sub
|
|
+ nop
|
|
+
|
|
+/*
|
|
+	unroll 4 (4*2=8)
|
|
+*/
|
|
+ sra N, 2, I
|
|
+ ble I, $Remain
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ LD a1, 1 * SIZE(X)
|
|
+
|
|
+ LD a2, 2 * SIZE(X)
|
|
+ LD a3, 3 * SIZE(X)
|
|
+
|
|
+ LD a4, 4 * SIZE(X)
|
|
+ LD a5, 5 * SIZE(X)
|
|
+
|
|
+ LD a6, 6 * SIZE(X)
|
|
+ LD a7, 7 * SIZE(X)
|
|
+
|
|
+
|
|
+ MUL a0, ALPHA_R, t0
|
|
+ MUL a0, ALPHA_I, t2
|
|
+
|
|
+ NMAD a1, ALPHA_I, t0, t4
|
|
+ MAD a1, ALPHA_R, t2, t5
|
|
+/*
|
|
+ MUL a1, ALPHA_I, t1
|
|
+ MUL a1, ALPHA_R, t3
|
|
+ SUB t0, t1, t4
|
|
+ ADD t2, t3, t5
|
|
+*/
|
|
+ ldi I, -1(I)
|
|
+ addl X, 8*SIZE, X
|
|
+
|
|
+ ble I, $MainLoopEnd
|
|
+ .align 4
|
|
+
|
|
+$MainLoop:
|
|
+ MUL a2, ALPHA_R, t0
|
|
+ ST t4, -8 * SIZE(X)
|
|
+ MUL a2, ALPHA_I, t2
|
|
+ ST t5, -7 * SIZE(X)
|
|
+
|
|
+
|
|
+ NMAD a3, ALPHA_I, t0, t6
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ MAD a3, ALPHA_R, t2, t7
|
|
+ LD a1, 1 * SIZE(X)
|
|
+
|
|
+ ST t6, -6 * SIZE(X)
|
|
+ MUL a4, ALPHA_R, t0
|
|
+ ST t7, -5 * SIZE(X)
|
|
+ MUL a4, ALPHA_I, t2
|
|
+
|
|
+
|
|
+ NMAD a5, ALPHA_I, t0, t4
|
|
+ LD a2, 2 * SIZE(X)
|
|
+ MAD a5, ALPHA_R, t2, t5
|
|
+ LD a3, 3 * SIZE(X)
|
|
+/*
|
|
+ MUL a5, ALPHA_I, t1
|
|
+ MUL a5, ALPHA_R, t3
|
|
+
|
|
+ SUB t0, t1, t4
|
|
+ ADD t2, t3, t5
|
|
+*/
|
|
+
|
|
+ MUL a6, ALPHA_R, t0
|
|
+ ST t4, -4 * SIZE(X)
|
|
+ MUL a6, ALPHA_I, t2
|
|
+ ST t5, -3 * SIZE(X)
|
|
+
|
|
+ NMAD a7, ALPHA_I, t0, t6
|
|
+ LD a4, 4 * SIZE(X)
|
|
+ MAD a7, ALPHA_R, t2, t7
|
|
+ LD a5, 5 * SIZE(X)
|
|
+/*
|
|
+
|
|
+ MUL a7, ALPHA_I, t1
|
|
+ MUL a7, ALPHA_R, t3
|
|
+
|
|
+ SUB t0, t1, t6
|
|
+ ADD t2, t3, t7
|
|
+*/
|
|
+ MUL a0, ALPHA_R, t0
|
|
+ ST t6, -2 * SIZE(X)
|
|
+ MUL a0, ALPHA_I, t2
|
|
+ ST t7, -1 * SIZE(X)
|
|
+
|
|
+ NMAD a1, ALPHA_I, t0, t4
|
|
+ LD a6, 6 * SIZE(X)
|
|
+ MAD a1, ALPHA_R, t2, t5
|
|
+ LD a7, 7 * SIZE(X)
|
|
+
|
|
+
|
|
+
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ subl I, 1, I
|
|
+ addl X, 8*SIZE, X
|
|
+ bgt I, $MainLoop
|
|
+ .align 4
|
|
+
|
|
+$MainLoopEnd:
|
|
+ MUL a2, ALPHA_R, t0
|
|
+ ST t4, -8 * SIZE(X)
|
|
+ MUL a2, ALPHA_I, t2
|
|
+ ST t5, -7 * SIZE(X)
|
|
+
|
|
+
|
|
+ NMAD a3, ALPHA_I, t0, t6
|
|
+ MAD a3, ALPHA_R, t2, t7
|
|
+
|
|
+
|
|
+ ST t6, -6 * SIZE(X)
|
|
+ MUL a4, ALPHA_R, t0
|
|
+ ST t7, -5 * SIZE(X)
|
|
+ MUL a4, ALPHA_I, t2
|
|
+
|
|
+
|
|
+ NMAD a5, ALPHA_I, t0, t4
|
|
+ MAD a5, ALPHA_R, t2, t5
|
|
+/*
|
|
+ MUL a5, ALPHA_I, t1
|
|
+ MUL a5, ALPHA_R, t3
|
|
+
|
|
+ SUB t0, t1, t4
|
|
+ ADD t2, t3, t5
|
|
+*/
|
|
+
|
|
+ MUL a6, ALPHA_R, t0
|
|
+ ST t4, -4 * SIZE(X)
|
|
+ MUL a6, ALPHA_I, t2
|
|
+ ST t5, -3 * SIZE(X)
|
|
+
|
|
+ NMAD a7, ALPHA_I, t0, t6
|
|
+ MAD a7, ALPHA_R, t2, t7
|
|
+/*
|
|
+
|
|
+ MUL a7, ALPHA_I, t1
|
|
+ MUL a7, ALPHA_R, t3
|
|
+
|
|
+ SUB t0, t1, t6
|
|
+ ADD t2, t3, t7
|
|
+*/
|
|
+ ST t6, -2 * SIZE(X)
|
|
+ ST t7, -1 * SIZE(X)
|
|
+
|
|
+ .align 4
|
|
+$Remain:
|
|
+ and N, 3, I
|
|
+ unop
|
|
+ unop
|
|
+ ble I, $L999
|
|
+ .align 4
|
|
+
|
|
+$RemainLoop:
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ LD a1, 1 * SIZE(X)
|
|
+
|
|
+
|
|
+ MUL a0, ALPHA_R, t0
|
|
+ MUL a0, ALPHA_I, t2
|
|
+
|
|
+ NMAD a1, ALPHA_I, t0, t4
|
|
+ MAD a1, ALPHA_R, t2, t5
|
|
+
|
|
+/*
|
|
+ MUL a1, ALPHA_I, t1
|
|
+ MUL a1, ALPHA_R, t3
|
|
+ SUB t0, t1, t4
|
|
+ ADD t2, t3, t5
|
|
+*/
|
|
+ ST t4, 0 * SIZE(X)
|
|
+ ST t5, 1 * SIZE(X)
|
|
+
|
|
+ addl X, 2*SIZE, X
|
|
+ ldi I, -1(I)
|
|
+ bne I, $RemainLoop
|
|
+ nop
|
|
+
|
|
+ ret
|
|
+ .align 4
|
|
+
|
|
+$Sub:
|
|
+ addl INCX, INCX, INCX
|
|
+
|
|
+ sra N, 2, I
|
|
+ ble I, $L15
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ LD a1, 1 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ LD a3, 1 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ LD a5, 1 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ LD a7, 1 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ MUL a0, ALPHA_R, t0
|
|
+ MUL a1, ALPHA_I, t1
|
|
+ MUL a0, ALPHA_I, t2
|
|
+ MUL a1, ALPHA_R, t3
|
|
+
|
|
+ SUB t0, t1, t4
|
|
+ ADD t2, t3, t5
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ ST t4, 0 * SIZE(XX)
|
|
+ MUL a2, ALPHA_R, t0
|
|
+ ST t5, 1 * SIZE(XX)
|
|
+ MUL a3, ALPHA_I, t1
|
|
+
|
|
+ MUL a2, ALPHA_I, t2
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ MUL a3, ALPHA_R, t3
|
|
+ LD a1, 1 * SIZE(X)
|
|
+
|
|
+ SUB t0, t1, t6
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ADD t2, t3, t7
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ MUL a4, ALPHA_R, t0
|
|
+ ST t6, 0 * SIZE(XX)
|
|
+ MUL a5, ALPHA_I, t1
|
|
+ ST t7, 1 * SIZE(XX)
|
|
+
|
|
+ MUL a4, ALPHA_I, t2
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ MUL a5, ALPHA_R, t3
|
|
+ LD a3, 1 * SIZE(X)
|
|
+
|
|
+ SUB t0, t1, t4
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ADD t2, t3, t5
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ MUL a6, ALPHA_R, t0
|
|
+ ST t4, 0 * SIZE(XX)
|
|
+ MUL a7, ALPHA_I, t1
|
|
+ ST t5, 1 * SIZE(XX)
|
|
+
|
|
+ MUL a6, ALPHA_I, t2
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ MUL a7, ALPHA_R, t3
|
|
+ LD a5, 1 * SIZE(X)
|
|
+
|
|
+ SUB t0, t1, t6
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ADD t2, t3, t7
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ MUL a0, ALPHA_R, t0
|
|
+ ST t6, 0 * SIZE(XX)
|
|
+ MUL a1, ALPHA_I, t1
|
|
+ ST t7, 1 * SIZE(XX)
|
|
+
|
|
+ MUL a0, ALPHA_I, t2
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ MUL a1, ALPHA_R, t3
|
|
+ LD a7, 1 * SIZE(X)
|
|
+
|
|
+ SUB t0, t1, t4
|
|
+ ldi I, -1(I)
|
|
+ ADD t2, t3, t5
|
|
+ SXADDQ INCX, XX, XX
|
|
+
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ unop
|
|
+ SXADDQ INCX, X, X
|
|
+ bne I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ MUL a2, ALPHA_R, t0
|
|
+ MUL a3, ALPHA_I, t1
|
|
+ ST t4, 0 * SIZE(XX)
|
|
+ MUL a2, ALPHA_I, t2
|
|
+ ST t5, 1 * SIZE(XX)
|
|
+ MUL a3, ALPHA_R, t3
|
|
+
|
|
+ SUB t0, t1, t6
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ADD t2, t3, t7
|
|
+ unop
|
|
+
|
|
+ ST t6, 0 * SIZE(XX)
|
|
+ MUL a4, ALPHA_R, t0
|
|
+ ST t7, 1 * SIZE(XX)
|
|
+ MUL a5, ALPHA_I, t1
|
|
+ MUL a4, ALPHA_I, t2
|
|
+ MUL a5, ALPHA_R, t3
|
|
+
|
|
+ SUB t0, t1, t4
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ADD t2, t3, t5
|
|
+ unop
|
|
+
|
|
+ MUL a6, ALPHA_R, t0
|
|
+ ST t4, 0 * SIZE(XX)
|
|
+ MUL a7, ALPHA_I, t1
|
|
+ ST t5, 1 * SIZE(XX)
|
|
+
|
|
+ MUL a6, ALPHA_I, t2
|
|
+ MUL a7, ALPHA_R, t3
|
|
+
|
|
+ SUB t0, t1, t6
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ADD t2, t3, t7
|
|
+
|
|
+ ST t6, 0 * SIZE(XX)
|
|
+ ST t7, 1 * SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and N, 3, I
|
|
+ unop
|
|
+ unop
|
|
+ ble I, $L999
|
|
+ .align 4
|
|
+
|
|
+$L17:
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ LD a1, 1 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ MUL a0, ALPHA_R, t0
|
|
+ MUL a1, ALPHA_I, t1
|
|
+ MUL a0, ALPHA_I, t2
|
|
+ MUL a1, ALPHA_R, t3
|
|
+
|
|
+ SUB t0, t1, t4
|
|
+ ADD t2, t3, t5
|
|
+
|
|
+ ST t4, 0 * SIZE(XX)
|
|
+ ST t5, 1 * SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bne I, $L17
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ ret
|
|
+ EPILOGUE
|
|
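The .bak variant above differs from zscal.S mainly in its unit-stride fast path ($MainLoop), where each multiply/subtract and multiply/add pair is folded into one fused operation. Reading these against the commented-out MUL/SUB and MUL/ADD code kept alongside them, the intended semantics appear to be:

    /* Fused forms used above, read off the commented-out scalar equivalents: */
    /*   NMAD a, b, t, d  ==  d = t - a*b   (replaces MUL a,b,tmp; SUB t,tmp,d) */
    /*   MAD  a, b, t, d  ==  d = a*b + t   (replaces MUL a,b,tmp; ADD t,tmp,d) */

so t4/t5 still end up as alpha_r*re - alpha_i*im and alpha_i*re + alpha_r*im, the same complex product as the reference sketch given after zscal.S.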
diff --git a/kernel/sw_64/zscal_simd.S b/kernel/sw_64/zscal_simd.S
|
|
new file mode 100644
|
|
index 0000000..09d2f38
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/zscal_simd.S
|
|
@@ -0,0 +1,579 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCHSIZE 96
|
|
+
|
|
+#define N $16
|
|
+#define X $21
|
|
+#define INCX $17
|
|
+
|
|
+#define XX $18
|
|
+#define I $19
|
|
+
|
|
+#define ALPHA_R $f19
|
|
+#define ALPHA_I $f20
|
|
+
|
|
+
|
|
+#define s0 $f0
|
|
+#define s1 $f1
|
|
+#define s2 $f10
|
|
+#define s3 $f11
|
|
+
|
|
+
|
|
+#define a0 $f12
|
|
+#define a1 $f13
|
|
+#define a2 $f14
|
|
+#define a3 $f15
|
|
+#define a4 $f16
|
|
+#define a5 $f17
|
|
+#define a6 $f18
|
|
+#define a7 $f21
|
|
+
|
|
+#define t0 $f22
|
|
+#define t1 $f23
|
|
+#define t2 $f24
|
|
+#define t3 $f25
|
|
+
|
|
+#define t4 $f26
|
|
+#define t5 $f27
|
|
+#define t6 $f28
|
|
+#define t7 $f29
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+ .frame $sp, 0, $26, 0
|
|
+
|
|
+ ldl INCX, 0($sp)
|
|
+ mov X, XX
|
|
+ cmpeq INCX, 1, $0
|
|
+ ble N, $L999
|
|
+
|
|
+ beq $0, $Sub
|
|
+ .align 5
|
|
+
|
|
+ and X, (VEC_LEN*SIZE-1), $6
|
|
+ bgt $6, $UnAlign_X_ACCESS
|
|
+
|
|
+/*
+	Unroll by 8 complex elements per iteration (8*2 = 16 reals)
+*/
|
|
+ sra N, 3, I
|
|
+ vcpyf ALPHA_R, ALPHA_R
|
|
+ vcpyf ALPHA_I, ALPHA_I
|
|
+ ble I, $Remain
|
|
+
|
|
+ VLD a0, 0*VEC_LEN*SIZE(X)
|
|
+ VLD a1, 1*VEC_LEN*SIZE(X)
|
|
+ VLD a2, 2*VEC_LEN*SIZE(X)
|
|
+ VLD a3, 3*VEC_LEN*SIZE(X)
|
|
+
|
|
+ subl I, 1, I
|
|
+ addl X, 16*SIZE, X
|
|
+ ble I, $MainLoopEnd
|
|
+ .align 4
|
|
+
|
|
+
|
|
+$MainLoop:
|
|
+
|
|
+ vextf a0, 1, a4
|
|
+ vextf a0, 3, a5
|
|
+ vextf a1, 0, a6
|
|
+ vextf a1, 2, a7
|
|
+
|
|
+ vextf a2, 1, t0
|
|
+ vextf a2, 3, t1
|
|
+ vextf a3, 0, t2
|
|
+ vextf a3, 2, t3
|
|
+
|
|
+ vinsf a4, a1, 0, a1
|
|
+ vinsf a5, a1, 2, a1
|
|
+ vinsf a6, a0, 1, a0
|
|
+ vinsf a7, a0, 3, a0
|
|
+
|
|
+ vinsf t0, a3, 0, a3
|
|
+ vinsf t1, a3, 2, a3
|
|
+ vinsf t2, a2, 1, a2
|
|
+ vinsf t3, a2, 3, a2
|
|
+
|
|
+ VMUL ALPHA_R, a0, t4
|
|
+ VMUL ALPHA_I, a0, t5
|
|
+ VMUL ALPHA_R, a2, t6
|
|
+ VMUL ALPHA_I, a2, t7
|
|
+
|
|
+ VNMAD ALPHA_I, a1, t4, t0
|
|
+ VLD a0, 0*VEC_LEN*SIZE(X)
|
|
+ VMAD ALPHA_R, a1, t5, t1
|
|
+ VLD a1, 1*VEC_LEN*SIZE(X)
|
|
+
|
|
+ VNMAD ALPHA_I, a3, t6, t2
|
|
+ VLD a2, 2*VEC_LEN*SIZE(X)
|
|
+ VMAD ALPHA_R, a3, t7, t3
|
|
+ VLD a3, 3*VEC_LEN*SIZE(X)
|
|
+
|
|
+/* combine the real (t0,t2) and imaginary (t1,t3) vectors back into interleaved complex vectors */
+ vextf t0, 1, a4
|
|
+ vextf t0, 3, a5
|
|
+ vextf t1, 0, a6
|
|
+ vextf t1, 2, a7
|
|
+
|
|
+ vextf t2, 1, s0
|
|
+ vextf t2, 3, s1
|
|
+ vextf t3, 0, s2
|
|
+ vextf t3, 2, s3
|
|
+
|
|
+ vinsf a4, t1, 0, t1
|
|
+ vinsf a5, t1, 2, t1
|
|
+ vinsf a6, t0, 1, t0
|
|
+ vinsf a7, t0, 3, t0
|
|
+
|
|
+ vinsf s0, t3, 0, t3
|
|
+ vinsf s1, t3, 2, t3
|
|
+ vinsf s2, t2, 1, t2
|
|
+ vinsf s3, t2, 3, t2
|
|
+
|
|
+ VST t0, -4*VEC_LEN*SIZE(X)
|
|
+ VST t1, -3*VEC_LEN*SIZE(X)
|
|
+ VST t2, -2*VEC_LEN*SIZE(X)
|
|
+ VST t3, -1*VEC_LEN*SIZE(X)
|
|
+
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ subl I, 1, I
|
|
+ addl X, 16*SIZE, X
|
|
+ bgt I, $MainLoop
|
|
+ .align 4
|
|
+
|
|
+$MainLoopEnd:
|
|
+/* split the interleaved complex vectors into separate real and imaginary vectors */
+ vextf a0, 1, a4
|
|
+ vextf a0, 3, a5
|
|
+ vextf a1, 0, a6
|
|
+ vextf a1, 2, a7
|
|
+
|
|
+ vextf a2, 1, t0
|
|
+ vextf a2, 3, t1
|
|
+ vextf a3, 0, t2
|
|
+ vextf a3, 2, t3
|
|
+
|
|
+ vinsf a4, a1, 0, a1
|
|
+ vinsf a5, a1, 2, a1
|
|
+ vinsf a6, a0, 1, a0
|
|
+ vinsf a7, a0, 3, a0
|
|
+
|
|
+ vinsf t0, a3, 0, a3
|
|
+ vinsf t1, a3, 2, a3
|
|
+ vinsf t2, a2, 1, a2
|
|
+ vinsf t3, a2, 3, a2
|
|
+
|
|
+ VMUL ALPHA_R, a0, t4
|
|
+ VMUL ALPHA_I, a0, t5
|
|
+ VMUL ALPHA_R, a2, t6
|
|
+ VMUL ALPHA_I, a2, t7
|
|
+
|
|
+ VNMAD ALPHA_I, a1, t4, t0
|
|
+ VMAD ALPHA_R, a1, t5, t1
|
|
+ VNMAD ALPHA_I, a3, t6, t2
|
|
+ VMAD ALPHA_R, a3, t7, t3
|
|
+
|
|
+/* combine the real (t0,t2) and imaginary (t1,t3) vectors back into interleaved complex vectors */
+ vextf t0, 1, a4
|
|
+ vextf t0, 3, a5
|
|
+ vextf t1, 0, a6
|
|
+ vextf t1, 2, a7
|
|
+
|
|
+ vextf t2, 1, s0
|
|
+ vextf t2, 3, s1
|
|
+ vextf t3, 0, s2
|
|
+ vextf t3, 2, s3
|
|
+
|
|
+ vinsf a4, t1, 0, t1
|
|
+ vinsf a5, t1, 2, t1
|
|
+ vinsf a6, t0, 1, t0
|
|
+ vinsf a7, t0, 3, t0
|
|
+
|
|
+ vinsf s0, t3, 0, t3
|
|
+ vinsf s1, t3, 2, t3
|
|
+ vinsf s2, t2, 1, t2
|
|
+ vinsf s3, t2, 3, t2
|
|
+
|
|
+ VST t0, -4*VEC_LEN*SIZE(X)
|
|
+ VST t1, -3*VEC_LEN*SIZE(X)
|
|
+ VST t2, -2*VEC_LEN*SIZE(X)
|
|
+ VST t3, -1*VEC_LEN*SIZE(X)
|
|
+
|
|
+$Remain:
|
|
+ and N, 7, I
|
|
+ unop
|
|
+ unop
|
|
+ ble I, $L999
|
|
+ .align 5
|
|
+
|
|
+$Remain_loop:
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ LD a1, 1 * SIZE(X)
|
|
+
|
|
+ MUL a0, ALPHA_R, t0
|
|
+ MUL a1, ALPHA_I, t1
|
|
+ MUL a0, ALPHA_I, t2
|
|
+ MUL a1, ALPHA_R, t3
|
|
+
|
|
+ SUB t0, t1, t4
|
|
+ ADD t2, t3, t5
|
|
+ ST t4, 0 * SIZE(X)
|
|
+ ST t5, 1 * SIZE(X)
|
|
+
|
|
+ addl X, 2*SIZE, X
|
|
+ ldi I, -1(I)
|
|
+ bne I, $Remain_loop
|
|
+ ret
|
|
+ .align 5
|
|
+
|
|
+$UnAlign_X_ACCESS:
|
|
+/*
+	Unroll by 4 complex elements per iteration (4*2 = 8 reals)
+*/
|
|
+ sra N, 2, I
|
|
+ ble I, $Unalign_Remain
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ LD a1, 1 * SIZE(X)
|
|
+
|
|
+ LD a2, 2 * SIZE(X)
|
|
+ LD a3, 3 * SIZE(X)
|
|
+
|
|
+ LD a4, 4 * SIZE(X)
|
|
+ MUL a0, ALPHA_R, t0
|
|
+ LD a5, 5 * SIZE(X)
|
|
+ MUL a0, ALPHA_I, t2
|
|
+
|
|
+ LD a6, 6 * SIZE(X)
|
|
+ NMAD a1, ALPHA_I, t0, t4
|
|
+ LD a7, 7 * SIZE(X)
|
|
+ MAD a1, ALPHA_R, t2, t5
|
|
+
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ addl X, 8*SIZE, X
|
|
+ ble I, $Unalign_MainLoopEnd
|
|
+ .align 4
|
|
+
|
|
+$Unalign_MainLoop:
|
|
+ MUL a2, ALPHA_R, t0
|
|
+ ST t4, -8 * SIZE(X)
|
|
+ MUL a2, ALPHA_I, t2
|
|
+ ST t5, -7 * SIZE(X)
|
|
+
|
|
+
|
|
+ NMAD a3, ALPHA_I, t0, t6
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ MAD a3, ALPHA_R, t2, t7
|
|
+ LD a1, 1 * SIZE(X)
|
|
+
|
|
+ ST t6, -6 * SIZE(X)
|
|
+ MUL a4, ALPHA_R, t0
|
|
+ ST t7, -5 * SIZE(X)
|
|
+ MUL a4, ALPHA_I, t2
|
|
+
|
|
+
|
|
+ NMAD a5, ALPHA_I, t0, t4
|
|
+ LD a2, 2 * SIZE(X)
|
|
+ MAD a5, ALPHA_R, t2, t5
|
|
+ LD a3, 3 * SIZE(X)
|
|
+
|
|
+ MUL a6, ALPHA_R, t0
|
|
+ ST t4, -4 * SIZE(X)
|
|
+ MUL a6, ALPHA_I, t2
|
|
+ ST t5, -3 * SIZE(X)
|
|
+
|
|
+ NMAD a7, ALPHA_I, t0, t6
|
|
+ LD a4, 4 * SIZE(X)
|
|
+ MAD a7, ALPHA_R, t2, t7
|
|
+ LD a5, 5 * SIZE(X)
|
|
+
|
|
+ MUL a0, ALPHA_R, t0
|
|
+ ST t6, -2 * SIZE(X)
|
|
+ MUL a0, ALPHA_I, t2
|
|
+ ST t7, -1 * SIZE(X)
|
|
+
|
|
+ NMAD a1, ALPHA_I, t0, t4
|
|
+ LD a6, 6 * SIZE(X)
|
|
+ MAD a1, ALPHA_R, t2, t5
|
|
+ LD a7, 7 * SIZE(X)
|
|
+
|
|
+
|
|
+
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ subl I, 1, I
|
|
+ addl X, 8*SIZE, X
|
|
+ bgt I, $Unalign_MainLoop
|
|
+ .align 4
|
|
+
|
|
+$Unalign_MainLoopEnd:
|
|
+ MUL a2, ALPHA_R, t0
|
|
+ ST t4, -8 * SIZE(X)
|
|
+ MUL a2, ALPHA_I, t2
|
|
+ ST t5, -7 * SIZE(X)
|
|
+
|
|
+
|
|
+ NMAD a3, ALPHA_I, t0, t6
|
|
+ MAD a3, ALPHA_R, t2, t7
|
|
+
|
|
+
|
|
+ ST t6, -6 * SIZE(X)
|
|
+ MUL a4, ALPHA_R, t0
|
|
+ ST t7, -5 * SIZE(X)
|
|
+ MUL a4, ALPHA_I, t2
|
|
+
|
|
+
|
|
+ NMAD a5, ALPHA_I, t0, t4
|
|
+ MAD a5, ALPHA_R, t2, t5
|
|
+
|
|
+ MUL a6, ALPHA_R, t0
|
|
+ ST t4, -4 * SIZE(X)
|
|
+ MUL a6, ALPHA_I, t2
|
|
+ ST t5, -3 * SIZE(X)
|
|
+
|
|
+ NMAD a7, ALPHA_I, t0, t6
|
|
+ MAD a7, ALPHA_R, t2, t7
|
|
+ ST t6, -2 * SIZE(X)
|
|
+ ST t7, -1 * SIZE(X)
|
|
+
|
|
+ .align 4
|
|
+$Unalign_Remain:
|
|
+ and N, 3, I
|
|
+ unop
|
|
+ unop
|
|
+ ble I, $L999
|
|
+ .align 4
|
|
+
|
|
+$Unalign_RemainLoop:
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ LD a1, 1 * SIZE(X)
|
|
+
|
|
+
|
|
+ MUL a0, ALPHA_R, t0
|
|
+ MUL a0, ALPHA_I, t2
|
|
+
|
|
+ NMAD a1, ALPHA_I, t0, t4
|
|
+ MAD a1, ALPHA_R, t2, t5
|
|
+
|
|
+ ST t4, 0 * SIZE(X)
|
|
+ ST t5, 1 * SIZE(X)
|
|
+
|
|
+ addl X, 2*SIZE, X
|
|
+ ldi I, -1(I)
|
|
+ bne I, $Unalign_RemainLoop
|
|
+ nop
|
|
+
|
|
+ ret
|
|
+ .align 4
|
|
+
|
|
+$Sub:
|
|
+ addl INCX, INCX, INCX
|
|
+
|
|
+ sra N, 2, I
|
|
+ ble I, $L15
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ LD a1, 1 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ LD a3, 1 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ LD a5, 1 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ LD a7, 1 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ MUL a0, ALPHA_R, t0
|
|
+ MUL a1, ALPHA_I, t1
|
|
+ MUL a0, ALPHA_I, t2
|
|
+ MUL a1, ALPHA_R, t3
|
|
+
|
|
+ SUB t0, t1, t4
|
|
+ ADD t2, t3, t5
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ ble I, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ ST t4, 0 * SIZE(XX)
|
|
+ MUL a2, ALPHA_R, t0
|
|
+ ST t5, 1 * SIZE(XX)
|
|
+ MUL a3, ALPHA_I, t1
|
|
+
|
|
+ MUL a2, ALPHA_I, t2
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ MUL a3, ALPHA_R, t3
|
|
+ LD a1, 1 * SIZE(X)
|
|
+
|
|
+ SUB t0, t1, t6
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ADD t2, t3, t7
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ MUL a4, ALPHA_R, t0
|
|
+ ST t6, 0 * SIZE(XX)
|
|
+ MUL a5, ALPHA_I, t1
|
|
+ ST t7, 1 * SIZE(XX)
|
|
+
|
|
+ MUL a4, ALPHA_I, t2
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ MUL a5, ALPHA_R, t3
|
|
+ LD a3, 1 * SIZE(X)
|
|
+
|
|
+ SUB t0, t1, t4
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ADD t2, t3, t5
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ MUL a6, ALPHA_R, t0
|
|
+ ST t4, 0 * SIZE(XX)
|
|
+ MUL a7, ALPHA_I, t1
|
|
+ ST t5, 1 * SIZE(XX)
|
|
+
|
|
+ MUL a6, ALPHA_I, t2
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ MUL a7, ALPHA_R, t3
|
|
+ LD a5, 1 * SIZE(X)
|
|
+
|
|
+ SUB t0, t1, t6
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ADD t2, t3, t7
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ MUL a0, ALPHA_R, t0
|
|
+ ST t6, 0 * SIZE(XX)
|
|
+ MUL a1, ALPHA_I, t1
|
|
+ ST t7, 1 * SIZE(XX)
|
|
+
|
|
+ MUL a0, ALPHA_I, t2
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ MUL a1, ALPHA_R, t3
|
|
+ LD a7, 1 * SIZE(X)
|
|
+
|
|
+ SUB t0, t1, t4
|
|
+ ldi I, -1(I)
|
|
+ ADD t2, t3, t5
|
|
+ SXADDQ INCX, XX, XX
|
|
+
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ unop
|
|
+ SXADDQ INCX, X, X
|
|
+ bne I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ MUL a2, ALPHA_R, t0
|
|
+ MUL a3, ALPHA_I, t1
|
|
+ ST t4, 0 * SIZE(XX)
|
|
+ MUL a2, ALPHA_I, t2
|
|
+ ST t5, 1 * SIZE(XX)
|
|
+ MUL a3, ALPHA_R, t3
|
|
+
|
|
+ SUB t0, t1, t6
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ADD t2, t3, t7
|
|
+ unop
|
|
+
|
|
+ ST t6, 0 * SIZE(XX)
|
|
+ MUL a4, ALPHA_R, t0
|
|
+ ST t7, 1 * SIZE(XX)
|
|
+ MUL a5, ALPHA_I, t1
|
|
+ MUL a4, ALPHA_I, t2
|
|
+ MUL a5, ALPHA_R, t3
|
|
+
|
|
+ SUB t0, t1, t4
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ADD t2, t3, t5
|
|
+ unop
|
|
+
|
|
+ MUL a6, ALPHA_R, t0
|
|
+ ST t4, 0 * SIZE(XX)
|
|
+ MUL a7, ALPHA_I, t1
|
|
+ ST t5, 1 * SIZE(XX)
|
|
+
|
|
+ MUL a6, ALPHA_I, t2
|
|
+ MUL a7, ALPHA_R, t3
|
|
+
|
|
+ SUB t0, t1, t6
|
|
+ SXADDQ INCX, XX, XX
|
|
+ ADD t2, t3, t7
|
|
+
|
|
+ ST t6, 0 * SIZE(XX)
|
|
+ ST t7, 1 * SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ and N, 3, I
|
|
+ unop
|
|
+ unop
|
|
+ ble I, $L999
|
|
+ .align 4
|
|
+
|
|
+$L17:
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ LD a1, 1 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ MUL a0, ALPHA_R, t0
|
|
+ MUL a1, ALPHA_I, t1
|
|
+ MUL a0, ALPHA_I, t2
|
|
+ MUL a1, ALPHA_R, t3
|
|
+
|
|
+ SUB t0, t1, t4
|
|
+ ADD t2, t3, t5
|
|
+
|
|
+ ST t4, 0 * SIZE(XX)
|
|
+ ST t5, 1 * SIZE(XX)
|
|
+ SXADDQ INCX, XX, XX
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bne I, $L17
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ ret
|
|
+ EPILOGUE
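
The SIMD kernel above first de-interleaves each group of complex elements into a real lane and an imaginary lane (the vextf/vinsf shuffles), applies the scaling with VMUL/VMAD/VNMAD, and interleaves the results again before the VST stores. A rough C model of one such block is sketched below; it assumes 4-wide vectors (VEC_LEN == 4) and double precision, an assumption made for illustration only.

    /* Rough model of one shuffled block in zscal_simd.S (VEC_LEN assumed to be 4). */
    static void zscal_simd_block(double a[8], double alpha_r, double alpha_i)
    {
        double re[4], im[4];
        int k;
        for (k = 0; k < 4; k++) {            /* de-interleave: vextf/vinsf */
            re[k] = a[2 * k];
            im[k] = a[2 * k + 1];
        }
        for (k = 0; k < 4; k++) {            /* VMUL plus VNMAD / VMAD */
            double tr = alpha_r * re[k] - alpha_i * im[k];
            double ti = alpha_i * re[k] + alpha_r * im[k];
            re[k] = tr;
            im[k] = ti;
        }
        for (k = 0; k < 4; k++) {            /* re-interleave before VST */
            a[2 * k]     = re[k];
            a[2 * k + 1] = im[k];
        }
    }
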
diff --git a/kernel/sw_64/zsum.S b/kernel/sw_64/zsum.S
|
|
new file mode 100644
|
|
index 0000000..7b8570c
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/zsum.S
|
|
@@ -0,0 +1,234 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCHSIZE 88
|
|
+
|
|
+#define N $16
|
|
+#define X $17
|
|
+#define INCX $18
|
|
+#define I $19
|
|
+
|
|
+#define s0 $f0
|
|
+#define s1 $f1
|
|
+#define s2 $f10
|
|
+#define s3 $f11
|
|
+
|
|
+#define a0 $f12
|
|
+#define a1 $f13
|
|
+#define a2 $f14
|
|
+#define a3 $f15
|
|
+#define a4 $f16
|
|
+#define a5 $f17
|
|
+#define a6 $f18
|
|
+#define a7 $f19
|
|
+
|
|
+#define t0 $f20
|
|
+#define t1 $f21
|
|
+#define t2 $f22
|
|
+#define t3 $f23
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+
|
|
+ fclr s0
|
|
+ unop
|
|
+ fclr t0
|
|
+ addw INCX, INCX, $20
|
|
+ mov $20,INCX
|
|
+
|
|
+ fclr s1
|
|
+ unop
|
|
+ fclr t1
|
|
+ ble N, $L999
|
|
+
|
|
+ fclr s2
|
|
+ sra N, 2, I
|
|
+ fclr s3
|
|
+ ble I, $L15
|
|
+
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ fclr t2
|
|
+ LD a1, 1 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ fclr t3
|
|
+ LD a3, 1 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ LD a5, 1 * SIZE(X)
|
|
+ SXADDQ INCX, X, X
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ble I, $L13
|
|
+ .align 4
|
|
+
|
|
+$L12:
|
|
+ ADD s0, t0, $f24
|
|
+ fmov $f24,s0
|
|
+ ldl $31, PREFETCHSIZE * SIZE(X)
|
|
+ fmov a0, t0
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ADD s1, t1, $f24
|
|
+ fmov $f24,s1
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ fmov a1, t1
|
|
+ unop
|
|
+
|
|
+ ADD s2, t2, $f24
|
|
+ fmov $f24,s2
|
|
+ LD a7, 1 * SIZE(X)
|
|
+ fmov a2, t2
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s3, t3, $f24
|
|
+ fmov $f24,s3
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ fmov a3, t3
|
|
+ unop
|
|
+
|
|
+ ADD s0, t0, $f24
|
|
+ fmov $f24,s0
|
|
+ LD a1, 1 * SIZE(X)
|
|
+ fmov a4, t0
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s1, t1, $f24
|
|
+ fmov $f24,s1
|
|
+ LD a2, 0 * SIZE(X)
|
|
+ fmov a5, t1
|
|
+ unop
|
|
+
|
|
+ ADD s2, t2, $f24
|
|
+ fmov $f24,s2
|
|
+ LD a3, 1 * SIZE(X)
|
|
+ fmov a6, t2
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s3, t3, $f24
|
|
+ fmov $f24,s3
|
|
+ LD a4, 0 * SIZE(X)
|
|
+ fmov a7, t3
|
|
+ unop
|
|
+
|
|
+ LD a5, 1 * SIZE(X)
|
|
+ unop
|
|
+ SXADDQ INCX, X, X
|
|
+ bne I, $L12
|
|
+ .align 4
|
|
+
|
|
+$L13:
|
|
+ ADD s0, t0, $f24
|
|
+ fmov $f24,s0
|
|
+ LD a6, 0 * SIZE(X)
|
|
+ fmov a0, t0
|
|
+
|
|
+ ADD s1, t1, $f24
|
|
+ fmov $f24,s1
|
|
+ LD a7, 1 * SIZE(X)
|
|
+ fmov a1, t1
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ ADD s2, t2, $f24
|
|
+ fmov $f24,s2
|
|
+ fmov a2, t2
|
|
+ ADD s3, t3, $f24
|
|
+ fmov $f24,s3
|
|
+ fmov a3, t3
|
|
+
|
|
+ ADD s0, t0, $f24
|
|
+ fmov $f24,s0
|
|
+ fmov a4, t0
|
|
+ ADD s1, t1, $f24
|
|
+ fmov $f24,s1
|
|
+ fmov a5, t1
|
|
+ ADD s2, t2, $f24
|
|
+ fmov $f24,s2
|
|
+ fmov a6, t2
|
|
+ ADD s3, t3, $f24
|
|
+ fmov $f24,s3
|
|
+ fmov a7, t3
|
|
+
|
|
+ ADD s2, t2, $f24
|
|
+ fmov $f24,s2
|
|
+ ADD s3, t3, $f24
|
|
+ fmov $f24,s3
|
|
+
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ ADD s0, s2, $f24
|
|
+ fmov $f24,s0
|
|
+ and N, 3, I
|
|
+ ADD s1, s3, $f24
|
|
+ fmov $f24,s1
|
|
+ ble I, $L999
|
|
+ .align 4
|
|
+
|
|
+$L17:
|
|
+ ADD s0, t0, $f24
|
|
+ fmov $f24,s0
|
|
+ LD a0, 0 * SIZE(X)
|
|
+ fmov a0, t0
|
|
+ ldi I, -1(I)
|
|
+
|
|
+ ADD s1, t1, $f24
|
|
+ fmov $f24,s1
|
|
+ LD a1, 1 * SIZE(X)
|
|
+ fmov a1, t1
|
|
+ SXADDQ INCX, X, X
|
|
+
|
|
+ bne I, $L17
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ ADD s0, t0, $f24
|
|
+ fmov $f24,s0
|
|
+ ADD s1, t1, $f24
|
|
+ fmov $f24,s1
|
|
+
|
|
+ ADD s0, s1, $f24
|
|
+ fmov $f24,s0
|
|
+ ret
|
|
+ EPILOGUE
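
Unlike the asum kernels, zsum accumulates the signed values of both the real and imaginary parts, with no absolute value taken. A hypothetical C reference (double precision assumed) for what the accumulators s0..s3 and t0..t3 compute:

    /* Illustrative C reference for zsum: signed sum of real and imaginary parts. */
    static double zsum_ref(long n, double *x, long inc_x)
    {
        double s = 0.0;
        long i, ix = 0;
        for (i = 0; i < n; i++) {
            s += x[ix] + x[ix + 1];   /* real part + imaginary part, no fabs() */
            ix += 2 * inc_x;
        }
        return s;
    }
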
diff --git a/kernel/sw_64/zswap.S.bak b/kernel/sw_64/zswap.S.bak
|
|
new file mode 100644
|
|
index 0000000..f0b19dd
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/zswap.S.bak
|
|
@@ -0,0 +1,244 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+ .frame $sp, 0, $26, 0
|
|
+
|
|
+ mov $21, $17
|
|
+ ldl $18, 0($sp)
|
|
+ ldl $19, 8($sp)
|
|
+ ldl $20, 16($sp)
|
|
+#ifndef PROFILE
|
|
+ .prologue 0
|
|
+#else
|
|
+ .prologue 1
|
|
+#endif
|
|
+
|
|
+ ble $16, $SubEnd # if n <= 0 goto $End
|
|
+
|
|
+ cmpeq $18, 1, $1
|
|
+ addl $18, $18, $18
|
|
+ cmpeq $20, 1, $2
|
|
+ addl $20, $20, $20
|
|
+
|
|
+ sra $16, 2, $21
|
|
+ and $1, $2, $1
|
|
+ and $16, 3, $22
|
|
+ beq $1, $Sub
|
|
+
|
|
+ ble $21, $MainRemain
|
|
+ .align 4
|
|
+
|
|
+$MainLoop:
|
|
+ LD $f10, 0*SIZE($19)
|
|
+ LD $f11, 1*SIZE($19)
|
|
+ LD $f12, 2*SIZE($19)
|
|
+ LD $f13, 3*SIZE($19)
|
|
+ LD $f14, 4*SIZE($19)
|
|
+ LD $f15, 5*SIZE($19)
|
|
+ LD $f16, 6*SIZE($19)
|
|
+ LD $f17, 7*SIZE($19)
|
|
+
|
|
+ LD $f20, 0*SIZE($17)
|
|
+ LD $f21, 1*SIZE($17)
|
|
+ LD $f22, 2*SIZE($17)
|
|
+ LD $f23, 3*SIZE($17)
|
|
+ LD $f24, 4*SIZE($17)
|
|
+ LD $f25, 5*SIZE($17)
|
|
+ LD $f26, 6*SIZE($17)
|
|
+ LD $f27, 7*SIZE($17)
|
|
+
|
|
+ fillcs 16*SIZE($17)
|
|
+ unop
|
|
+ fillcs 16*SIZE($19)
|
|
+ subl $21, 1, $21
|
|
+
|
|
+ ST $f10, 0*SIZE($17)
|
|
+ ST $f11, 1*SIZE($17)
|
|
+ ST $f12, 2*SIZE($17)
|
|
+ ST $f13, 3*SIZE($17)
|
|
+ ST $f14, 4*SIZE($17)
|
|
+ ST $f15, 5*SIZE($17)
|
|
+ ST $f16, 6*SIZE($17)
|
|
+ ST $f17, 7*SIZE($17)
|
|
+
|
|
+ ST $f20, 0*SIZE($19)
|
|
+ ST $f21, 1*SIZE($19)
|
|
+ ST $f22, 2*SIZE($19)
|
|
+ ST $f23, 3*SIZE($19)
|
|
+ ST $f24, 4*SIZE($19)
|
|
+ ST $f25, 5*SIZE($19)
|
|
+ ST $f26, 6*SIZE($19)
|
|
+ ST $f27, 7*SIZE($19)
|
|
+
|
|
+ ldi $17, 8*SIZE($17)
|
|
+ ldi $19, 8*SIZE($19)
|
|
+ bgt $21, $MainLoop
|
|
+ .align 4
|
|
+
|
|
+$MainRemain:
|
|
+ ble $22, $MainEnd
|
|
+ .align 4
|
|
+
|
|
+$MainRemainLoop:
|
|
+ LD $f10, 0*SIZE($19)
|
|
+ LD $f11, 1*SIZE($19)
|
|
+ LD $f20, 0*SIZE($17)
|
|
+ LD $f21, 1*SIZE($17)
|
|
+
|
|
+ ldi $17, 2*SIZE($17)
|
|
+ ldi $19, 2*SIZE($19)
|
|
+ subl $22, 1, $22
|
|
+ ST $f10, -2*SIZE($17)
|
|
+ ST $f11, -1*SIZE($17)
|
|
+ ST $f20, -2*SIZE($19)
|
|
+ ST $f21, -1*SIZE($19)
|
|
+ bgt $22, $MainRemainLoop
|
|
+ .align 4
|
|
+
|
|
+$MainEnd:
|
|
+ clr $0
|
|
+ ret
|
|
+ .align 4
|
|
+
|
|
+$Sub:
|
|
+ mov $17, $23
|
|
+ mov $19, $24
|
|
+ ble $21, $SubRemain
|
|
+ .align 4
|
|
+
|
|
+$SubLoop:
|
|
+ LD $f10, 0*SIZE($19)
|
|
+ LD $f11, 1*SIZE($19)
|
|
+ SXADDQ $20, $19, $19
|
|
+
|
|
+ LD $f12, 0*SIZE($19)
|
|
+ LD $f13, 1*SIZE($19)
|
|
+ SXADDQ $20, $19, $19
|
|
+
|
|
+ LD $f14, 0*SIZE($19)
|
|
+ LD $f15, 1*SIZE($19)
|
|
+ SXADDQ $20, $19, $19
|
|
+
|
|
+ LD $f16, 0*SIZE($19)
|
|
+ LD $f17, 1*SIZE($19)
|
|
+ SXADDQ $20, $19, $19
|
|
+
|
|
+ LD $f20, 0*SIZE($17)
|
|
+ LD $f21, 1*SIZE($17)
|
|
+ SXADDQ $18, $17, $17
|
|
+
|
|
+ LD $f22, 0*SIZE($17)
|
|
+ LD $f23, 1*SIZE($17)
|
|
+ SXADDQ $18, $17, $17
|
|
+
|
|
+ LD $f24, 0*SIZE($17)
|
|
+ LD $f25, 1*SIZE($17)
|
|
+ SXADDQ $18, $17, $17
|
|
+
|
|
+ LD $f26, 0*SIZE($17)
|
|
+ LD $f27, 1*SIZE($17)
|
|
+ SXADDQ $18, $17, $17
|
|
+
|
|
+ ST $f10, 0*SIZE($23)
|
|
+ ST $f11, 1*SIZE($23)
|
|
+ SXADDQ $18, $23, $23
|
|
+
|
|
+ ST $f12, 0*SIZE($23)
|
|
+ ST $f13, 1*SIZE($23)
|
|
+ SXADDQ $18, $23, $23
|
|
+
|
|
+ ST $f14, 0*SIZE($23)
|
|
+ ST $f15, 1*SIZE($23)
|
|
+ SXADDQ $18, $23, $23
|
|
+
|
|
+ ST $f16, 0*SIZE($23)
|
|
+ ST $f17, 1*SIZE($23)
|
|
+ SXADDQ $18, $23, $23
|
|
+
|
|
+ ST $f20, 0*SIZE($24)
|
|
+ ST $f21, 1*SIZE($24)
|
|
+ SXADDQ $20, $24, $24
|
|
+
|
|
+ ST $f22, 0*SIZE($24)
|
|
+ ST $f23, 1*SIZE($24)
|
|
+ SXADDQ $20, $24, $24
|
|
+
|
|
+ ST $f24, 0*SIZE($24)
|
|
+ ST $f25, 1*SIZE($24)
|
|
+ SXADDQ $20, $24, $24
|
|
+
|
|
+ ST $f26, 0*SIZE($24)
|
|
+ ST $f27, 1*SIZE($24)
|
|
+ SXADDQ $20, $24, $24
|
|
+
|
|
+ subl $21, 1, $21
|
|
+ bgt $21, $SubLoop
|
|
+ .align 4
|
|
+
|
|
+$SubRemain:
|
|
+ ble $22, $SubEnd
|
|
+ .align 4
|
|
+
|
|
+$SubRemainLoop:
|
|
+ LD $f10, 0*SIZE($19)
|
|
+ LD $f11, 1*SIZE($19)
|
|
+ LD $f20, 0*SIZE($17)
|
|
+ LD $f21, 1*SIZE($17)
|
|
+
|
|
+ subl $22, 1, $22
|
|
+
|
|
+ ST $f10, 0*SIZE($17)
|
|
+ ST $f11, 1*SIZE($17)
|
|
+ ST $f20, 0*SIZE($19)
|
|
+ ST $f21, 1*SIZE($19)
|
|
+
|
|
+ SXADDQ $18, $17, $17
|
|
+ SXADDQ $20, $19, $19
|
|
+ bgt $22, $SubRemainLoop
|
|
+ .align 4
|
|
+
|
|
+$SubEnd:
|
|
+ clr $0
|
|
+ ret
|
|
+ EPILOGUE
|
|
diff --git a/kernel/sw_64/zswap.c b/kernel/sw_64/zswap.c
|
|
new file mode 100644
|
|
index 0000000..ae4760a
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/zswap.c
|
|
@@ -0,0 +1,72 @@
|
|
+/***************************************************************************
|
|
+Copyright (c) 2013, The OpenBLAS Project
|
|
+All rights reserved.
|
|
+Redistribution and use in source and binary forms, with or without
|
|
+modification, are permitted provided that the following conditions are
|
|
+met:
|
|
+1. Redistributions of source code must retain the above copyright
|
|
+notice, this list of conditions and the following disclaimer.
|
|
+2. Redistributions in binary form must reproduce the above copyright
|
|
+notice, this list of conditions and the following disclaimer in
|
|
+the documentation and/or other materials provided with the
|
|
+distribution.
|
|
+3. Neither the name of the OpenBLAS project nor the names of
|
|
+its contributors may be used to endorse or promote products
|
|
+derived from this software without specific prior written permission.
|
|
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
|
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
+*****************************************************************************/
|
|
+
|
|
+/**************************************************************************************
|
|
+* 2013/09/14 Saar
|
|
+* BLASTEST float : OK
|
|
+* BLASTEST double : OK
|
|
+* CTEST : OK
|
|
+* TEST : OK
|
|
+*
|
|
+**************************************************************************************/
|
|
+
|
|
+#include "common.h"
|
|
+#include <stdio.h>
|
|
+
|
|
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
|
|
+{
|
|
+ BLASLONG i=0;
|
|
+ BLASLONG ix=0,iy=0;
|
|
+ FLOAT temp[2];
|
|
+ BLASLONG inc_x2;
|
|
+ BLASLONG inc_y2;
|
|
+
|
|
+ if ( n < 0 ) return(0);
|
|
+
|
|
+ inc_x2 = 2 * inc_x;
|
|
+ inc_y2 = 2 * inc_y;
|
|
+
|
|
+ while(i < n)
|
|
+ {
|
|
+
|
|
+ temp[0] = x[ix] ;
|
|
+ temp[1] = x[ix+1] ;
|
|
+ x[ix] = y[iy] ;
|
|
+ x[ix+1] = y[iy+1] ;
|
|
+ y[iy] = temp[0] ;
|
|
+ y[iy+1] = temp[1] ;
|
|
+
|
|
+ ix += inc_x2 ;
|
|
+ iy += inc_y2 ;
|
|
+ i++ ;
|
|
+
|
|
+ }
|
|
+ return(0);
|
|
+
|
|
+}
|
|
+
|
|
+
|
|
diff --git a/kernel/sw_64/zswap_simd.S b/kernel/sw_64/zswap_simd.S
|
|
new file mode 100644
|
|
index 0000000..e49c95b
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/zswap_simd.S
|
|
@@ -0,0 +1,306 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#define PREFETCHSIZE 64
|
|
+#define X $17
|
|
+#define Y $19
|
|
+
|
|
+ PROLOGUE
|
|
+ PROFCODE
|
|
+ .frame $sp, 0, $26, 0
|
|
+
|
|
+ mov $21, $17
|
|
+ ldl $18, 0($sp)
|
|
+ ldl $19, 8($sp)
|
|
+ ldl $20, 16($sp)
|
|
+#ifndef PROFILE
|
|
+ .prologue 0
|
|
+#else
|
|
+ .prologue 1
|
|
+#endif
|
|
+
|
|
+ ble $16, $SubEnd # if n <= 0 goto $End
|
|
+
|
|
+ cmpeq $18, 1, $1
|
|
+ addl $18, $18, $18
|
|
+ cmpeq $20, 1, $2
|
|
+ addl $20, $20, $20
|
|
+
|
|
+/*
+	Unroll by 8 complex elements (16 reals) per iteration
+*/
|
|
+
|
|
+ sra $16, 3, $21
|
|
+ and $1, $2, $1
|
|
+ and $16, 7, $22
|
|
+ beq $1, $Sub
|
|
+
|
|
+/*
+	test the alignment of the X and Y addresses
+*/
|
|
+ and Y, (VEC_LEN*SIZE-1), $4
|
|
+ and X, (VEC_LEN*SIZE-1), $3
|
|
+ or $3, $4, $4
|
|
+ bne $4, $UnAlign_ACCESS
|
|
+
|
|
+/* aligned-access path */
+
|
|
+ ble $21, $MainRemain
|
|
+ .align 4
|
|
+
|
|
+$MainLoop:
|
|
+ VLD $f10, 0*VEC_LEN*SIZE(Y)
|
|
+ VLD $f11, 1*VEC_LEN*SIZE(Y)
|
|
+ VLD $f12, 2*VEC_LEN*SIZE(Y)
|
|
+ VLD $f13, 3*VEC_LEN*SIZE(Y)
|
|
+
|
|
+ VLD $f20, 0*VEC_LEN*SIZE(X)
|
|
+ VLD $f21, 1*VEC_LEN*SIZE(X)
|
|
+ VLD $f22, 2*VEC_LEN*SIZE(X)
|
|
+ VLD $f23, 3*VEC_LEN*SIZE(X)
|
|
+
|
|
+ fillcs PREFETCHSIZE * SIZE(X)
|
|
+ unop
|
|
+ fillcs PREFETCHSIZE * SIZE(Y)
|
|
+ subl $21, 1, $21
|
|
+
|
|
+ VST $f10, 0*VEC_LEN*SIZE(X)
|
|
+ VST $f11, 1*VEC_LEN*SIZE(X)
|
|
+ VST $f12, 2*VEC_LEN*SIZE(X)
|
|
+ VST $f13, 3*VEC_LEN*SIZE(X)
|
|
+
|
|
+ VST $f20, 0*VEC_LEN*SIZE(Y)
|
|
+ VST $f21, 1*VEC_LEN*SIZE(Y)
|
|
+ VST $f22, 2*VEC_LEN*SIZE(Y)
|
|
+ VST $f23, 3*VEC_LEN*SIZE(Y)
|
|
+
|
|
+ ldi $17, 16*SIZE(X)
|
|
+ ldi $19, 16*SIZE(Y)
|
|
+ bgt $21, $MainLoop
|
|
+ .align 4
|
|
+
|
|
+ jmp $MainRemain
|
|
+ .align 4
|
|
+
|
|
+$UnAlign_ACCESS:
|
|
+ sra $16, 2, $21
|
|
+ and $16, 3, $22
|
|
+ nop
|
|
+ ble $21, $MainRemain
|
|
+ .align 4
|
|
+$UnAlign_ACCESS_MainLoop:
|
|
+
|
|
+ LD $f10, 0*SIZE(Y)
|
|
+ LD $f11, 1*SIZE(Y)
|
|
+ LD $f12, 2*SIZE(Y)
|
|
+ LD $f13, 3*SIZE(Y)
|
|
+ LD $f14, 4*SIZE(Y)
|
|
+ LD $f15, 5*SIZE(Y)
|
|
+ LD $f16, 6*SIZE(Y)
|
|
+ LD $f17, 7*SIZE(Y)
|
|
+
|
|
+ LD $f20, 0*SIZE(X)
|
|
+ LD $f21, 1*SIZE(X)
|
|
+ LD $f22, 2*SIZE(X)
|
|
+ LD $f23, 3*SIZE(X)
|
|
+ LD $f24, 4*SIZE(X)
|
|
+ LD $f25, 5*SIZE(X)
|
|
+ LD $f26, 6*SIZE(X)
|
|
+ LD $f27, 7*SIZE(X)
|
|
+
|
|
+ fillcs 16*SIZE(X)
|
|
+ unop
|
|
+ fillcs 16*SIZE(Y)
|
|
+ subl $21, 1, $21
|
|
+
|
|
+ ST $f10, 0*SIZE(X)
|
|
+ ST $f11, 1*SIZE(X)
|
|
+ ST $f12, 2*SIZE(X)
|
|
+ ST $f13, 3*SIZE(X)
|
|
+ ST $f14, 4*SIZE(X)
|
|
+ ST $f15, 5*SIZE(X)
|
|
+ ST $f16, 6*SIZE(X)
|
|
+ ST $f17, 7*SIZE(X)
|
|
+
|
|
+ ST $f20, 0*SIZE(Y)
|
|
+ ST $f21, 1*SIZE(Y)
|
|
+ ST $f22, 2*SIZE(Y)
|
|
+ ST $f23, 3*SIZE(Y)
|
|
+ ST $f24, 4*SIZE(Y)
|
|
+ ST $f25, 5*SIZE(Y)
|
|
+ ST $f26, 6*SIZE(Y)
|
|
+ ST $f27, 7*SIZE(Y)
|
|
+
|
|
+ ldi X, 8*SIZE(X)
|
|
+ ldi Y, 8*SIZE(Y)
|
|
+ bgt $21, $UnAlign_ACCESS_MainLoop
|
|
+ .align 4
|
|
+
|
|
+$MainRemain:
|
|
+ ble $22, $MainEnd
|
|
+ .align 4
|
|
+
|
|
+$MainRemainLoop:
|
|
+ LD $f10, 0*SIZE(Y)
|
|
+ LD $f11, 1*SIZE(Y)
|
|
+ LD $f20, 0*SIZE(X)
|
|
+ LD $f21, 1*SIZE(X)
|
|
+
|
|
+ ldi X, 2*SIZE(X)
|
|
+ ldi Y, 2*SIZE(Y)
|
|
+ subl $22, 1, $22
|
|
+ ST $f10, -2*SIZE(X)
|
|
+ ST $f11, -1*SIZE(X)
|
|
+ ST $f20, -2*SIZE(Y)
|
|
+ ST $f21, -1*SIZE(Y)
|
|
+ bgt $22, $MainRemainLoop
|
|
+ .align 4
|
|
+
|
|
+$MainEnd:
|
|
+ clr $0
|
|
+ ret
|
|
+ .align 4
|
|
+
|
|
+$Sub:
|
|
+ sra $16, 2, $21
|
|
+ and $16, 3, $22
|
|
+
|
|
+ mov $17, $23
|
|
+ mov $19, $24
|
|
+ ble $21, $SubRemain
|
|
+ .align 4
|
|
+
|
|
+$SubLoop:
|
|
+ LD $f10, 0*SIZE($19)
|
|
+ LD $f11, 1*SIZE($19)
|
|
+ SXADDQ $20, $19, $19
|
|
+
|
|
+ LD $f12, 0*SIZE($19)
|
|
+ LD $f13, 1*SIZE($19)
|
|
+ SXADDQ $20, $19, $19
|
|
+
|
|
+ LD $f14, 0*SIZE($19)
|
|
+ LD $f15, 1*SIZE($19)
|
|
+ SXADDQ $20, $19, $19
|
|
+
|
|
+ LD $f16, 0*SIZE($19)
|
|
+ LD $f17, 1*SIZE($19)
|
|
+ SXADDQ $20, $19, $19
|
|
+
|
|
+ LD $f20, 0*SIZE($17)
|
|
+ LD $f21, 1*SIZE($17)
|
|
+ SXADDQ $18, $17, $17
|
|
+
|
|
+ LD $f22, 0*SIZE($17)
|
|
+ LD $f23, 1*SIZE($17)
|
|
+ SXADDQ $18, $17, $17
|
|
+
|
|
+ LD $f24, 0*SIZE($17)
|
|
+ LD $f25, 1*SIZE($17)
|
|
+ SXADDQ $18, $17, $17
|
|
+
|
|
+ LD $f26, 0*SIZE($17)
|
|
+ LD $f27, 1*SIZE($17)
|
|
+ SXADDQ $18, $17, $17
|
|
+
|
|
+ ST $f10, 0*SIZE($23)
|
|
+ ST $f11, 1*SIZE($23)
|
|
+ SXADDQ $18, $23, $23
|
|
+
|
|
+ ST $f12, 0*SIZE($23)
|
|
+ ST $f13, 1*SIZE($23)
|
|
+ SXADDQ $18, $23, $23
|
|
+
|
|
+ ST $f14, 0*SIZE($23)
|
|
+ ST $f15, 1*SIZE($23)
|
|
+ SXADDQ $18, $23, $23
|
|
+
|
|
+ ST $f16, 0*SIZE($23)
|
|
+ ST $f17, 1*SIZE($23)
|
|
+ SXADDQ $18, $23, $23
|
|
+
|
|
+ ST $f20, 0*SIZE($24)
|
|
+ ST $f21, 1*SIZE($24)
|
|
+ SXADDQ $20, $24, $24
|
|
+
|
|
+ ST $f22, 0*SIZE($24)
|
|
+ ST $f23, 1*SIZE($24)
|
|
+ SXADDQ $20, $24, $24
|
|
+
|
|
+ ST $f24, 0*SIZE($24)
|
|
+ ST $f25, 1*SIZE($24)
|
|
+ SXADDQ $20, $24, $24
|
|
+
|
|
+ ST $f26, 0*SIZE($24)
|
|
+ ST $f27, 1*SIZE($24)
|
|
+ SXADDQ $20, $24, $24
|
|
+
|
|
+ subl $21, 1, $21
|
|
+ bgt $21, $SubLoop
|
|
+ .align 4
|
|
+
|
|
+$SubRemain:
|
|
+ ble $22, $SubEnd
|
|
+ .align 4
|
|
+
|
|
+$SubRemainLoop:
|
|
+ LD $f10, 0*SIZE($19)
|
|
+ LD $f11, 1*SIZE($19)
|
|
+ LD $f20, 0*SIZE($17)
|
|
+ LD $f21, 1*SIZE($17)
|
|
+
|
|
+ subl $22, 1, $22
|
|
+
|
|
+ ST $f10, 0*SIZE($17)
|
|
+ ST $f11, 1*SIZE($17)
|
|
+ ST $f20, 0*SIZE($19)
|
|
+ ST $f21, 1*SIZE($19)
|
|
+
|
|
+ SXADDQ $18, $17, $17
|
|
+ SXADDQ $20, $19, $19
|
|
+ bgt $22, $SubRemainLoop
|
|
+ .align 4
|
|
+
|
|
+$SubEnd:
|
|
+ clr $0
|
|
+ ret
|
|
+ EPILOGUE
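
The vector path of this swap kernel is only taken when both increments are 1 and both pointers are aligned to the vector width, which is what the cmpeq tests and the "and X, (VEC_LEN*SIZE-1)" / "and Y, (VEC_LEN*SIZE-1)" masks check before branching to $UnAlign_ACCESS or $Sub. A C sketch of that dispatch test follows; the function name and parameter names are illustrative, and vec_bytes stands for VEC_LEN*SIZE.

    #include <stddef.h>
    #include <stdint.h>

    /* Illustrative model of the dispatch in zswap_simd.S: the VLD/VST loop is
       safe only for unit strides and vector-aligned pointers. */
    static int can_use_vector_path(const double *x, const double *y,
                                   long inc_x, long inc_y, size_t vec_bytes)
    {
        uintptr_t mask = (uintptr_t)vec_bytes - 1;
        return inc_x == 1 && inc_y == 1 &&
               ((((uintptr_t)x | (uintptr_t)y) & mask) == 0);
    }
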
diff --git a/kernel/sw_64/ztrsm_kernel_2x2_LN.S b/kernel/sw_64/ztrsm_kernel_2x2_LN.S
|
|
new file mode 100644
|
|
index 0000000..3a14e58
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/ztrsm_kernel_2x2_LN.S
|
|
@@ -0,0 +1,2593 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#if !defined(EV4) && !defined(EV5) && !defined(SW6)
|
|
+#error "Architecture is not specified."
|
|
+#endif
|
|
+
|
|
+#ifdef SW6
|
|
+#define PREFETCHSIZE 56
|
|
+#define UNOP unop
|
|
+#endif
|
|
+
|
|
+#ifdef EV5
|
|
+#define PREFETCHSIZE 48
|
|
+#define UNOP
|
|
+#endif
|
|
+
|
|
+#ifdef EV4
|
|
+#define UNOP
|
|
+#endif
|
|
+
|
|
+ .set noat
|
|
+ .set noreorder
|
|
+ .arch sw6a
|
|
+
|
|
+.text
|
|
+ .align 5
|
|
+ .globl CNAME
|
|
+ .ent CNAME
|
|
+
|
|
+#define STACKSIZE 88
|
|
+
|
|
+#define M $16
|
|
+#define N $17
|
|
+#define K $18
|
|
+#define A $21
|
|
+#define B $22
|
|
+#define C $20
|
|
+#define LDC $23
|
|
+
|
|
+#define C1 $19
|
|
+#define C2 $24
|
|
+
|
|
+#define AO $at
|
|
+#define BO $5
|
|
+#define I $6
|
|
+#define J $7
|
|
+#define L $8
|
|
+
|
|
+#define tmp $9
|
|
+
|
|
+#define a1 $f16
|
|
+#define a2 $f17
|
|
+#define a3 $f18
|
|
+#define a4 $f19
|
|
+
|
|
+#define b1 $f20
|
|
+#define b2 $f21
|
|
+#define b3 $f22
|
|
+#define b4 $f23
|
|
+
|
|
+#define t1 $f24
|
|
+#define t2 $f25
|
|
+#define t3 $f26
|
|
+#define t4 $f27
|
|
+
|
|
+#define a5 $f28
|
|
+#define a6 $f30
|
|
+#define b5 $f29
|
|
+
|
|
+#define alpha_i $f29
|
|
+#define alpha_r $f30
|
|
+
|
|
+#define c01 $f0
|
|
+#define c02 $f1
|
|
+#define c03 $f2
|
|
+#define c04 $f3
|
|
+
|
|
+#define c05 $f4
|
|
+#define c06 $f5
|
|
+#define c07 $f6
|
|
+#define c08 $f7
|
|
+
|
|
+#define c09 $f8
|
|
+#define c10 $f9
|
|
+#define c11 $f10
|
|
+#define c12 $f11
|
|
+
|
|
+#define c13 $f12
|
|
+#define c14 $f13
|
|
+#define c15 $f14
|
|
+#define c16 $f15
|
|
+
|
|
+#define TMP1 $0
|
|
+#define TMP2 $1
|
|
+#define KK $2
|
|
+#define AORIG $3
|
|
+#define OFFSET $4
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+#ifndef CONJ
|
|
+#define ADD1 ADD
|
|
+#define ADD2 SUB
|
|
+#define ADD3 ADD
|
|
+#define ADD4 ADD
|
|
+#define ADD5 SUB
|
|
+#define ADD6 ADD
|
|
+#else
|
|
+#define ADD1 ADD
|
|
+#define ADD2 ADD
|
|
+#define ADD3 SUB
|
|
+#define ADD4 ADD
|
|
+#define ADD5 ADD
|
|
+#define ADD6 SUB
|
|
+#endif
|
|
+#else
|
|
+#ifndef CONJ
|
|
+#define ADD1 ADD
|
|
+#define ADD2 SUB
|
|
+#define ADD3 ADD
|
|
+#define ADD4 ADD
|
|
+#define ADD5 SUB
|
|
+#define ADD6 ADD
|
|
+#else
|
|
+#define ADD1 ADD
|
|
+#define ADD2 ADD
|
|
+#define ADD3 ADD
|
|
+#define ADD4 SUB
|
|
+#define ADD5 ADD
|
|
+#define ADD6 SUB
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+
|
|
+CNAME:
|
|
+ .frame $sp, STACKSIZE, $26, 0
|
|
+
|
|
+#ifdef PROFILE
|
|
+ ldgp $gp, 0($27)
|
|
+ ldi $at, _mcount
|
|
+ jsr $at, ($at), _mcount
|
|
+#endif
|
|
+
|
|
+#ifndef PROFILE
|
|
+ .prologue 0
|
|
+#else
|
|
+ .prologue 1
|
|
+#endif
|
|
+
|
|
+ ldi $sp, -STACKSIZE($sp)
|
|
+
|
|
+ ldl B, 0 + STACKSIZE($sp)
|
|
+ ldl C, 8 + STACKSIZE($sp)
|
|
+ ldl LDC, 16 + STACKSIZE($sp)
|
|
+ ldl OFFSET, 24 + STACKSIZE($sp)
|
|
+
|
|
+ sll LDC, ZBASE_SHIFT, LDC
|
|
+
|
|
+ fstd $f2, 0($sp)
|
|
+ fstd $f3, 8($sp)
|
|
+ fstd $f4, 16($sp)
|
|
+ fstd $f5, 24($sp)
|
|
+ fstd $f6, 32($sp)
|
|
+ fstd $f7, 40($sp)
|
|
+ fstd $f8, 48($sp)
|
|
+ fstd $f9, 56($sp)
|
|
+ stl tmp, 72($sp)
|
|
+
|
|
+ cmple M, 0, $0
|
|
+ cmple N, 0, $1
|
|
+ cmple K, 0, $2
|
|
+
|
|
+ or $0, $1, $0
|
|
+ or $0, $2, $0
|
|
+ bne $0, $L999
|
|
+
|
|
+#ifdef LN
|
|
+ addl M, M, TMP2
|
|
+ mull TMP2, K, TMP1
|
|
+ SXADDQ TMP1, A, A
|
|
+ SXADDQ TMP2, C, C
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ negl OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ mull N, K, TMP1
|
|
+ addl TMP1, TMP1, TMP1
|
|
+ SXADDQ TMP1, B, B
|
|
+
|
|
+ mull N, LDC, TMP1
|
|
+ addl TMP1, C, C
|
|
+
|
|
+ subl N, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+ sra N, 1, J
|
|
+ ble J, $L30
|
|
+ .align 4
|
|
+
|
|
+$L01:
|
|
+#ifdef RT
|
|
+ sll K, ZBASE_SHIFT + 1, TMP1
|
|
+ subl B, TMP1, B
|
|
+
|
|
+ subl C, LDC, C2
|
|
+ subl C2, LDC, C1
|
|
+ subl C2, LDC, C
|
|
+#else
|
|
+ mov C, C1
|
|
+ addl C, LDC, C2
|
|
+ addl C2, LDC, C
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ addl M, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+ mov A, AORIG
|
|
+#else
|
|
+ mov A, AO
|
|
+#endif
|
|
+
|
|
+ and M, 1, I
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+ fclr c01
|
|
+ fclr c05
|
|
+ ble I, $L20
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c10
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c14
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+
|
|
+ ldi L, -2(KK)
|
|
+
|
|
+ ble KK, $L28
|
|
+ ble L, $L25
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, ZBASE_SHIFT + 0, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, ZBASE_SHIFT + 0, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, ZBASE_SHIFT + 1, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c10
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c14
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+
|
|
+ ble TMP1, $L28
|
|
+ ble L, $L25
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L22:
|
|
+ ADD1 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+// unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+// unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD4 c13, t3, b5
|
|
+ fmov b5, c13
|
|
+// unop
|
|
+ MUL a1, b2, t3
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+
|
|
+ ADD2 c14, t4, b5
|
|
+ fmov b5, c14
|
|
+// unop
|
|
+ MUL a2, b2, t4
|
|
+ LD b2, -7 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+// unop
|
|
+ MUL a1, b3, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+// unop
|
|
+ MUL a2, b3, t2
|
|
+ LD b3, -6 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, b5
|
|
+ fmov b5, c05
|
|
+// unop
|
|
+ MUL a1, b4, t3
|
|
+ LD a1, 2 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ MUL a2, b4, t4
|
|
+ LD b5, -5 * SIZE(BO)
|
|
+ FIMOVD b5, tmp
|
|
+
|
|
+ ADD1 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+// unop
|
|
+ MUL a3, b1, t1
|
|
+ LD a2, 3 * SIZE(AO)
|
|
+
|
|
+ ADD3 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+// unop
|
|
+ MUL a4, b1, t2
|
|
+ LD b1, -4 * SIZE(BO)
|
|
+
|
|
+ ADD4 c13, t3, b5
|
|
+ fmov b5, c13
|
|
+// unop
|
|
+ MUL a3, b2, t3
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD2 c14, t4, b5
|
|
+ fmov b5, c14
|
|
+ MUL a4, b2, t4
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ldi L, -2(L)
|
|
+ MUL a3, b3, t1
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+// unop
|
|
+ MUL a4, b3, t2
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, b5
|
|
+ fmov b5, c05
|
|
+// unop
|
|
+ IFMOVD tmp, b5
|
|
+ MUL a3, b5, t3
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ IFMOVD tmp, b5
|
|
+ MUL a4, b5, t4
|
|
+ LD a4, 1 * SIZE(AO)
|
|
+ bgt L, $L22
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ ADD1 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, b1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L27
|
|
+#else
|
|
+ blbs TMP1, $L27
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD3 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+// unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD4 c13, t3, b5
|
|
+ fmov b5, c13
|
|
+// unop
|
|
+ MUL a1, b2, t3
|
|
+ unop
|
|
+
|
|
+ ADD2 c14, t4, b5
|
|
+ fmov b5, c14
|
|
+// unop
|
|
+ MUL a2, b2, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+// unop
|
|
+ MUL a1, b3, t1
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+// unop
|
|
+ MUL a2, b3, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, b5
|
|
+ fmov b5, c05
|
|
+// unop
|
|
+ MUL a1, b4, t3
|
|
+ LD a1, -2 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, b5
|
|
+ fmov b5, c06
|
|
+// unop
|
|
+ MUL a2, b4, t4
|
|
+ LD a2, -1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L27:
|
|
+ ADD3 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ MUL a2, b1, t2
|
|
+ ADD4 c13, t3, b5
|
|
+ fmov b5, c13
|
|
+ MUL a1, b2, t3
|
|
+
|
|
+ ADD2 c14, t4, b5
|
|
+ fmov b5, c14
|
|
+ MUL a2, b2, t4
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b3, t1
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, b3, t2
|
|
+ ADD4 c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, b4, t3
|
|
+
|
|
+ ADD2 c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a2, b4, t4
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD1 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ ADD3 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ ADD4 c13, t3, b5
|
|
+ fmov b5, c13
|
|
+ ADD2 c14, t4, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ ADD c01, c06, b5
|
|
+ fmov b5, c01
|
|
+ ADD c02, c05, b5
|
|
+ fmov b5, c02
|
|
+ ADD c09, c14, b5
|
|
+ fmov b5, c09
|
|
+ ADD c10, c13, b5
|
|
+ fmov b5, c10
|
|
+ .align 4
|
|
+
|
|
+$L28:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 1, TMP1
|
|
+#else
|
|
+ subl KK, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, ZBASE_SHIFT + 0, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -2 * SIZE(AO)
|
|
+ ldi BO, -4 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a3, c09, b5
|
|
+ fmov b5, c09
|
|
+ SUB a4, c10, b5
|
|
+ fmov b5, c10
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a3, c09, b5
|
|
+ fmov b5, c09
|
|
+ SUB a4, c10, b5
|
|
+ fmov b5, c10
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a2, c10, t3
|
|
+ MUL a2, c09, t4
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, c10, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ ADD5 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD6 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD5 c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ ADD6 c10, t4, b5
|
|
+ fmov b5, c10
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ ADD5 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD6 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ MUL a3, c01, t1
|
|
+ MUL a3, c02, t2
|
|
+ SUB c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ SUB c10, t2, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ MUL a4, c02, t1
|
|
+ MUL a4, c01, t2
|
|
+ ADD6 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ ADD5 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ LD a1, 6 * SIZE(BO)
|
|
+ LD a2, 7 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c10, t1
|
|
+ MUL a2, c09, t2
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c10, c10
|
|
+
|
|
+ ADD5 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ ADD6 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 6 * SIZE(BO)
|
|
+ LD a2, 7 * SIZE(BO)
|
|
+ LD a3, 4 * SIZE(BO)
|
|
+ LD a4, 5 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c10, t1
|
|
+ MUL a2, c09, t2
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c10, c10
|
|
+
|
|
+ ADD5 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ ADD6 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ MUL a3, c09, t1
|
|
+ MUL a3, c10, t2
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c02, t2, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ MUL a4, c10, t1
|
|
+ MUL a4, c09, t2
|
|
+ ADD6 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD5 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+
|
|
+ ADD5 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD6 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c02, 1 * SIZE(BO)
|
|
+ ST c09, 2 * SIZE(BO)
|
|
+ ST c10, 3 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c09, 2 * SIZE(AO)
|
|
+ ST c10, 3 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -2 * SIZE(C1)
|
|
+ ldi C2, -2 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c09, 0 * SIZE(C2)
|
|
+ ST c10, 1 * SIZE(C2)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 2 * SIZE(C1)
|
|
+ ldi C2, 2 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, ZBASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, ZBASE_SHIFT + 0, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ sra M, 1, I
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+ fclr c01
|
|
+ fclr c05
|
|
+
|
|
+ ble I, $L29
|
|
+ .align 4
|
|
+
|
|
+$L11:
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c10
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c14
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c07
|
|
+
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+ fclr c11
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c15
|
|
+
|
|
+ fillcs 4 * SIZE(C1)
|
|
+ fclr c04
|
|
+ ldi L, -2(KK)
|
|
+ fclr c08
|
|
+
|
|
+ fillcs 4 * SIZE(C2)
|
|
+ fclr c12
|
|
+ fclr c16
|
|
+ ble KK, $L18
|
|
+ ble L, $L15
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, ZBASE_SHIFT + 1, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, ZBASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c10
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c14
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c07
|
|
+
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ fclr c11
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c15
|
|
+
|
|
+ fillcs 4 * SIZE(C1)
|
|
+ fclr c04
|
|
+ ldi L, -2(TMP1)
|
|
+ fclr c08
|
|
+
|
|
+ fillcs 4 * SIZE(C2)
|
|
+ fclr c12
|
|
+ fclr c16
|
|
+ ble TMP1, $L18
|
|
+ ble L, $L15
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L12:
|
|
+/* 1 */
|
|
+ ADD1 c11, t1, b5
|
|
+ fmov b5, c11
|
|
+#ifndef EV4
|
|
+ fillcs PREFETCHSIZE * SIZE(AO)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+ MUL b1, a1, t1
|
|
+#ifndef EV4
|
|
+ fillcs PREFETCHSIZE * SIZE(BO)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD3 c12, t2, b5
|
|
+ fmov b5, c12
|
|
+// unop
|
|
+ MUL b1, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c16, t3, b5
|
|
+ fmov b5, c16
|
|
+// unop
|
|
+ MUL b2, a2, t3
|
|
+ LD a5, 0 * SIZE(AO)
|
|
+
|
|
+ ADD4 c15, t4, b5
|
|
+ fmov b5, c15
|
|
+// unop
|
|
+ MUL b2, a1, t4
|
|
+ LD b5, 0 * SIZE(BO)
|
|
+ FIMOVD b5, tmp
|
|
+
|
|
+/* 2 */
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ UNOP
|
|
+ MUL b1, a3, t1
|
|
+ UNOP
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ UNOP
|
|
+ MUL b1, a4, t2
|
|
+ UNOP
|
|
+
|
|
+ ADD2 c06, t3, b5
|
|
+ fmov b5, c06
|
|
+// unop
|
|
+ MUL b2, a4, t3
|
|
+ unop
|
|
+
|
|
+ ADD4 c05, t4, b5
|
|
+ fmov b5, c05
|
|
+// unop
|
|
+ MUL b4, a1, t4
|
|
+ unop
|
|
+
|
|
+/* 3 */
|
|
+ ADD1 c03, t1, b5
|
|
+ fmov b5, c03
|
|
+// unop
|
|
+ MUL b3, a1, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c04, t2, b5
|
|
+ fmov b5, c04
|
|
+// unop
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c08, t3, b5
|
|
+ fmov b5, c08
|
|
+// unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD4 c13, t4, b5
|
|
+ fmov b5, c13
|
|
+// unop
|
|
+ MUL b2, a3, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+/* 4 */
|
|
+ ADD1 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+// unop
|
|
+ MUL b3, a3, t1
|
|
+ LD a6, 2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+// unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD2 c14, t3, b5
|
|
+ fmov b5, c14
|
|
+// unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ ADD4 c07, t4, b5
|
|
+ fmov b5, c07
|
|
+// unop
|
|
+ MUL b4, a3, t4
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+
|
|
+/* 5 */
|
|
+ ADD1 c11, t1, b5
|
|
+ fmov b5, c11
|
|
+// unop
|
|
+ IFMOVD tmp, b5
|
|
+ MUL b5, a5, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD3 c12, t2, b5
|
|
+ fmov b5, c12
|
|
+ ldi L, -2(L)
|
|
+ IFMOVD tmp, b5
|
|
+ MUL b5, a2, t2
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD2 c16, t3, b5
|
|
+ fmov b5, c16
|
|
+// unop
|
|
+ MUL b2, a2, t3
|
|
+ unop
|
|
+
|
|
+ ADD4 c15, t4, b5
|
|
+ fmov b5, c15
|
|
+// unop
|
|
+ MUL b2, a5, t4
|
|
+ unop
|
|
+
|
|
+/* 6 */
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+// unop
|
|
+ IFMOVD tmp, b5
|
|
+ MUL b5, a6, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+// unop
|
|
+ IFMOVD tmp, b5
|
|
+ MUL b5, a4, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c06, t3, b5
|
|
+ fmov b5, c06
|
|
+// unop
|
|
+ MUL b2, a4, t3
|
|
+ unop
|
|
+
|
|
+ ADD4 c05, t4, b5
|
|
+ fmov b5, c05
|
|
+// unop
|
|
+ MUL b4, a5, t4
|
|
+ unop
|
|
+
|
|
+/* 7 */
|
|
+ ADD1 c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+ MUL b3, a5, t1
|
|
+// unop
|
|
+
|
|
+ ADD3 c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+ MUL b3, a2, t2
|
|
+// unop
|
|
+
|
|
+ ADD2 c08, t3, b5
|
|
+ fmov b5, c08
|
|
+// unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, -3 * SIZE(AO)
|
|
+
|
|
+ ADD4 c13, t4, b5
|
|
+ fmov b5, c13
|
|
+// unop
|
|
+ MUL b2, a6, t4
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+/* 8 */
|
|
+ ADD1 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+// unop
|
|
+ MUL b3, a6, t1
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+// unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD2 c14, t3, b5
|
|
+ fmov b5, c14
|
|
+// unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+
|
|
+ ADD4 c07, t4, b5
|
|
+ fmov b5, c07
|
|
+ MUL b4, a6, t4
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+ bgt L, $L12
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ ADD1 c11, t1, b5
|
|
+ fmov b5, c11
|
|
+// unop
|
|
+ MUL b1, a1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L17
|
|
+#else
|
|
+ blbs TMP1, $L17
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD3 c12, t2, b5
|
|
+ fmov b5, c12
|
|
+ MUL b1, a2, t2
|
|
+ ADD2 c16, t3, b5
|
|
+ fmov b5, c16
|
|
+ MUL b2, a2, t3
|
|
+
|
|
+ ADD4 c15, t4, b5
|
|
+ fmov b5, c15
|
|
+ MUL b2, a1, t4
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL b1, a3, t1
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+// unop
|
|
+ MUL b1, a4, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD2 c06, t3, b5
|
|
+ fmov b5, c06
|
|
+ MUL b2, a4, t3
|
|
+ ADD4 c05, t4, b5
|
|
+ fmov b5, c05
|
|
+ MUL b4, a1, t4
|
|
+
|
|
+ ADD1 c03, t1, b5
|
|
+ fmov b5, c03
|
|
+// unop
|
|
+ MUL b3, a1, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD3 c04, t2, b5
|
|
+ fmov b5, c04
|
|
+// unop
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c08, t3, b5
|
|
+ fmov b5, c08
|
|
+// unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD4 c13, t4, b5
|
|
+ fmov b5, c13
|
|
+// unop
|
|
+ MUL b2, a3, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD1 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+// unop
|
|
+ MUL b3, a3, t1
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD3 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+// unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD2 c14, t3, b5
|
|
+ fmov b5, c14
|
|
+// unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+
|
|
+ ADD4 c07, t4, b5
|
|
+ fmov b5, c07
|
|
+// unop
|
|
+ MUL b4, a3, t4
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD1 c11, t1, b5
|
|
+ fmov b5, c11
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL b1, a1, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L17:
|
|
+ ADD3 c12, t2, b5
|
|
+ fmov b5, c12
|
|
+ MUL b1, a2, t2
|
|
+ ADD2 c16, t3, b5
|
|
+ fmov b5, c16
|
|
+ MUL b2, a2, t3
|
|
+
|
|
+ ADD4 c15, t4, b5
|
|
+ fmov b5, c15
|
|
+ MUL b2, a1, t4
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL b1, a3, t1
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL b1, a4, t2
|
|
+ ADD2 c06, t3, b5
|
|
+ fmov b5, c06
|
|
+ MUL b2, a4, t3
|
|
+
|
|
+ ADD4 c05, t4, b5
|
|
+ fmov b5, c05
|
|
+ MUL b4, a1, t4
|
|
+ ADD1 c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ MUL b3, a1, t1
|
|
+
|
|
+ ADD3 c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ MUL b3, a2, t2
|
|
+ ADD2 c08, t3, b5
|
|
+ fmov b5, c08
|
|
+ MUL b4, a2, t3
|
|
+
|
|
+ ADD4 c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ MUL b2, a3, t4
|
|
+ ADD1 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ MUL b3, a3, t1
|
|
+
|
|
+ ADD3 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ MUL b3, a4, t2
|
|
+ ADD2 c14, t3, b5
|
|
+ fmov b5, c14
|
|
+ MUL b4, a4, t3
|
|
+
|
|
+ ADD4 c07, t4, b5
|
|
+ fmov b5, c07
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL b4, a3, t4
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD1 c11, t1, b5
|
|
+ fmov b5, c11
|
|
+ ADD3 c12, t2, b5
|
|
+ fmov b5, c12
|
|
+ ADD2 c16, t3, b5
|
|
+ fmov b5, c16
|
|
+ ADD4 c15, t4, b5
|
|
+ fmov b5, c15
|
|
+
|
|
+ ADD c01, c06, b5
|
|
+ fmov b5, c01
|
|
+ ADD c02, c05, b5
|
|
+ fmov b5, c02
|
|
+ ADD c03, c08, b5
|
|
+ fmov b5, c03
|
|
+ ADD c04, c07, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ ADD c09, c14, b5
|
|
+ fmov b5, c09
|
|
+ ADD c10, c13, b5
|
|
+ fmov b5, c10
|
|
+ ADD c11, c16, b5
|
|
+ fmov b5, c11
|
|
+ ADD c12, c15, b5
|
|
+ fmov b5, c12
|
|
+ .align 4
|
|
+
|
|
+$L18:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 2, TMP1
|
|
+#else
|
|
+ subl KK, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -4 * SIZE(AO)
|
|
+ ldi BO, -4 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+ LD b4, 7 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a3, c09, b5
|
|
+ fmov b5, c09
|
|
+ SUB a4, c10, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ SUB b1, c03, b5
|
|
+ fmov b5, c03
|
|
+ SUB b2, c04, b5
|
|
+ fmov b5, c04
|
|
+ SUB b3, c11, b5
|
|
+ fmov b5, c11
|
|
+ SUB b4, c12, b5
|
|
+ fmov b5, c12
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ LD b1, 4 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(AO)
|
|
+ LD b4, 7 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a3, c03, b5
|
|
+ fmov b5, c03
|
|
+ SUB a4, c04, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ SUB b1, c09, b5
|
|
+ fmov b5, c09
|
|
+ SUB b2, c10, b5
|
|
+ fmov b5, c10
|
|
+ SUB b3, c11, b5
|
|
+ fmov b5, c11
|
|
+ SUB b4, c12, b5
|
|
+ fmov b5, c12
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 6 * SIZE(AO)
|
|
+ LD a2, 7 * SIZE(AO)
|
|
+ LD a3, 4 * SIZE(AO)
|
|
+ LD a4, 5 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c04, t1
|
|
+ MUL a2, c03, t2
|
|
+ MUL a2, c12, t3
|
|
+ MUL a2, c11, t4
|
|
+
|
|
+ MUL a1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a1, c04, b5
|
|
+ fmov b5, c04
|
|
+ MUL a1, c11, b5
|
|
+ fmov b5, c11
|
|
+ MUL a1, c12, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ ADD5 c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ ADD6 c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ ADD5 c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ ADD6 c12, t4, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ MUL a3, c03, t1
|
|
+ MUL a3, c04, t2
|
|
+ MUL a3, c11, t3
|
|
+ MUL a3, c12, t4
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ SUB c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ SUB c10, t4, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ MUL a4, c04, t1
|
|
+ MUL a4, c03, t2
|
|
+ MUL a4, c12, t3
|
|
+ MUL a4, c11, t4
|
|
+
|
|
+ ADD6 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD5 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD6 c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ ADD5 c10, t4, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a2, c10, t3
|
|
+ MUL a2, c09, t4
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, c10, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ ADD5 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD6 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD5 c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ ADD6 c10, t4, b5
|
|
+ fmov b5, c10
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a2, c10, t3
|
|
+ MUL a2, c09, t4
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c10, c10
|
|
+
|
|
+ ADD5 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD6 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD5 c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ ADD6 c10, t4, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ MUL a3, c01, t1
|
|
+ MUL a3, c02, t2
|
|
+ MUL a3, c09, t3
|
|
+ MUL a3, c10, t4
|
|
+
|
|
+ SUB c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ SUB c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ SUB c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ SUB c12, t4, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ MUL a4, c02, t1
|
|
+ MUL a4, c01, t2
|
|
+ MUL a4, c10, t3
|
|
+ MUL a4, c09, t4
|
|
+
|
|
+ ADD6 c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ ADD5 c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ ADD6 c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ ADD5 c12, t4, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ LD a1, 6 * SIZE(AO)
|
|
+ LD a2, 7 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c04, t1
|
|
+ MUL a2, c03, t2
|
|
+ MUL a2, c12, t3
|
|
+ MUL a2, c11, t4
|
|
+
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c04, c04
|
|
+ MUL a1, c11, c11
|
|
+ MUL a1, c12, c12
|
|
+
|
|
+ ADD5 c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ ADD6 c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ ADD5 c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ ADD6 c12, t4, b5
|
|
+ fmov b5, c12
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a2, c04, t3
|
|
+ MUL a2, c03, t4
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c04, c04
|
|
+
|
|
+ ADD5 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD6 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD5 c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ ADD6 c04, t4, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ MUL a3, c01, t1
|
|
+ MUL a3, c02, t2
|
|
+ MUL a3, c03, t3
|
|
+ MUL a3, c04, t4
|
|
+
|
|
+ SUB c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ SUB c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ SUB c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ SUB c12, t4, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ MUL a4, c02, t1
|
|
+ MUL a4, c01, t2
|
|
+ MUL a4, c04, t3
|
|
+ MUL a4, c03, t4
|
|
+
|
|
+ ADD6 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ ADD5 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ ADD6 c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ ADD5 c12, t4, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ LD a1, 6 * SIZE(BO)
|
|
+ LD a2, 7 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c10, t1
|
|
+ MUL a2, c09, t2
|
|
+ MUL a2, c12, t3
|
|
+ MUL a2, c11, t4
|
|
+
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c10, c10
|
|
+ MUL a1, c11, c11
|
|
+ MUL a1, c12, c12
|
|
+
|
|
+ ADD5 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ ADD6 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ ADD5 c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ ADD6 c12, t4, b5
|
|
+ fmov b5, c12
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 6 * SIZE(BO)
|
|
+ LD a2, 7 * SIZE(BO)
|
|
+ LD a3, 4 * SIZE(BO)
|
|
+ LD a4, 5 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c10, t1
|
|
+ MUL a2, c09, t2
|
|
+ MUL a2, c12, t3
|
|
+ MUL a2, c11, t4
|
|
+
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c10, c10
|
|
+ MUL a1, c11, c11
|
|
+ MUL a1, c12, c12
|
|
+
|
|
+ ADD5 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ ADD6 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ ADD5 c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ ADD6 c12, t4, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ MUL a3, c09, t1
|
|
+ MUL a3, c10, t2
|
|
+ MUL a3, c11, t3
|
|
+ MUL a3, c12, t4
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ SUB c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ SUB c04, t4, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ MUL a4, c10, t1
|
|
+ MUL a4, c09, t2
|
|
+ MUL a4, c12, t3
|
|
+ MUL a4, c11, t4
|
|
+
|
|
+ ADD6 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD5 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD6 c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ ADD5 c04, t4, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a2, c04, t3
|
|
+ MUL a2, c03, t4
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c04, c04
|
|
+
|
|
+ ADD5 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD6 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD5 c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ ADD6 c04, t4, b5
|
|
+ fmov b5, c04
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c02, 1 * SIZE(BO)
|
|
+ ST c09, 2 * SIZE(BO)
|
|
+ ST c10, 3 * SIZE(BO)
|
|
+
|
|
+ ST c03, 4 * SIZE(BO)
|
|
+ ST c04, 5 * SIZE(BO)
|
|
+ ST c11, 6 * SIZE(BO)
|
|
+ ST c12, 7 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c03, 2 * SIZE(AO)
|
|
+ ST c04, 3 * SIZE(AO)
|
|
+
|
|
+ ST c09, 4 * SIZE(AO)
|
|
+ ST c10, 5 * SIZE(AO)
|
|
+ ST c11, 6 * SIZE(AO)
|
|
+ ST c12, 7 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -4 * SIZE(C1)
|
|
+ ldi C2, -4 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c03, 2 * SIZE(C1)
|
|
+ ST c04, 3 * SIZE(C1)
|
|
+
|
|
+ ST c09, 0 * SIZE(C2)
|
|
+ ST c10, 1 * SIZE(C2)
|
|
+ ST c11, 2 * SIZE(C2)
|
|
+ ST c12, 3 * SIZE(C2)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+ ldi C2, 4 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, ZBASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ addl BO, TMP1, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+ fclr c01
|
|
+ fclr c05
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L11
|
|
+ .align 4
|
|
+
|
|
+$L29:
|
|
+#ifdef LN
|
|
+ sll K, ZBASE_SHIFT + 1, TMP1
|
|
+ addl B, TMP1, B
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ mov BO, B
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+ ldi J, -1(J)
|
|
+ bgt J, $L01
|
|
+ .align 4
|
|
+
|
|
+$L30:
|
|
+ and N, 1, J
|
|
+ ble J, $L999
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, ZBASE_SHIFT, TMP1
|
|
+ subl B, TMP1, B
|
|
+
|
|
+ subl C, LDC, C1
|
|
+ subl C, LDC, C
|
|
+#else
|
|
+ mov C, C1
|
|
+ addl C, LDC, C
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ addl M, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+ mov A, AORIG
|
|
+#else
|
|
+ mov A, AO
|
|
+#endif
|
|
+
|
|
+ and M, 1, I
|
|
+ ble I, $L50
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c05
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c06
|
|
+
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+
|
|
+ ldi L, -2(KK)
|
|
+
|
|
+ ble KK, $L58
|
|
+ ble L, $L55
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, ZBASE_SHIFT, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, ZBASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, ZBASE_SHIFT, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c06
|
|
+
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+
|
|
+ ble TMP1, $L58
|
|
+ ble L, $L55
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L52:
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ ldi L, -2(L)
|
|
+ MUL a1, b2, t3
|
|
+ LD a1, -2 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, b5
|
|
+ fmov b5, c06
|
|
+// unop
|
|
+ MUL a2, b2, t4
|
|
+ LD a2, -1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ LD b2, 3 * SIZE(BO)
|
|
+ MUL a3, b3, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+// unop
|
|
+ MUL a4, b3, t2
|
|
+ LD b3, 0 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, b5
|
|
+ fmov b5, c05
|
|
+// unop
|
|
+ MUL a3, b4, t3
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ MUL a4, b4, t4
|
|
+ LD b4, 1 * SIZE(BO)
|
|
+// unop
|
|
+
|
|
+ LD a4, 1 * SIZE(AO)
|
|
+ unop
|
|
+ unop
|
|
+ bgt L, $L52
|
|
+ .align 4
|
|
+
|
|
+$L55:
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L57
|
|
+#else
|
|
+ blbs TMP1, $L57
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+// unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ MUL a1, b2, t3
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, b5
|
|
+ fmov b5, c06
|
|
+// unop
|
|
+ MUL a2, b2, t4
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ LD b2, -1 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ .align 4
|
|
+
|
|
+$L57:
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, b1, t2
|
|
+ ADD4 c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, b2, t3
|
|
+
|
|
+ ADD2 c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a2, b2, t4
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD4 c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ ADD2 c06, t4, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ ADD c01, c06, b5
|
|
+ fmov b5, c01
|
|
+ ADD c02, c05, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+$L58:
|
|
+#if defined(LN) || defined(RT)
|
|
+ subl KK, 1, TMP1
|
|
+
|
|
+ sll TMP1, ZBASE_SHIFT, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -2 * SIZE(AO)
|
|
+ ldi BO, -2 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ ADD5 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD6 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+#endif
|
|
+
|
|
+#if defined(RN) || defined(RT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ ADD5 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD6 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c02, 1 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -2 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 2 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, ZBASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, ZBASE_SHIFT, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L50:
|
|
+ sra M, 1, I
|
|
+ ble I, $L59
|
|
+ .align 4
|
|
+
|
|
+$L41:
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c05
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c06
|
|
+
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c07
|
|
+
|
|
+ ldi L, -2(KK)
|
|
+ fclr c04
|
|
+ fclr c08
|
|
+
|
|
+ ble KK, $L48
|
|
+ ble L, $L45
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, ZBASE_SHIFT + 1, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, ZBASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, ZBASE_SHIFT, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c05
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c06
|
|
+
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c07
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+ fclr c04
|
|
+ fclr c08
|
|
+
|
|
+ ble TMP1, $L48
|
|
+ ble L, $L45
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L42:
|
|
+ ADD4 c05, t1, b5
|
|
+ fmov b5, c05
|
|
+// unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD2 c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ ldi L, -2(L)
|
|
+ MUL a2, b1, t2
|
|
+// unop
|
|
+
|
|
+ ADD4 c07, t3, b5
|
|
+ fmov b5, c07
|
|
+// unop
|
|
+ MUL a3, b1, t3
|
|
+ unop
|
|
+
|
|
+ ADD2 c08, t4, b5
|
|
+ fmov b5, c08
|
|
+// unop
|
|
+ MUL a4, b1, t4
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+// unop
|
|
+ MUL a1, b2, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ MUL a2, b2, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c03, t3, b5
|
|
+ fmov b5, c03
|
|
+// unop
|
|
+ MUL a3, b2, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c04, t4, b5
|
|
+ fmov b5, c04
|
|
+// unop
|
|
+ MUL a4, b2, t4
|
|
+ LD a5, 3 * SIZE(AO)
|
|
+
|
|
+ ADD4 c05, t1, b5
|
|
+ fmov b5, c05
|
|
+// unop
|
|
+ MUL a1, b3, t1
|
|
+ LD b2, -1 * SIZE(BO)
|
|
+
|
|
+ ADD2 c06, t2, b5
|
|
+ fmov b5, c06
|
|
+// unop
|
|
+ MUL a2, b3, t2
|
|
+ unop
|
|
+
|
|
+ ADD4 c07, t3, b5
|
|
+ fmov b5, c07
|
|
+// unop
|
|
+ MUL a3, b3, t3
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+
|
|
+ ADD2 c08, t4, b5
|
|
+ fmov b5, c08
|
|
+// unop
|
|
+ MUL a5, b3, t4
|
|
+ LD b3, 0 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+// unop
|
|
+ MUL a1, b4, t1
|
|
+ LD a1, -4 * SIZE(AO)
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+// unop
|
|
+ MUL a2, b4, t2
|
|
+ LD a2, -3 * SIZE(AO)
|
|
+
|
|
+ ADD1 c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+ MUL a3, b4, t3
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a5, b4, t4
|
|
+ LD b4, 1 * SIZE(BO)
|
|
+ bgt L, $L42
|
|
+ .align 4
|
|
+
|
|
+$L45:
|
|
+ ADD4 c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ MUL b1, a1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L47
|
|
+#else
|
|
+ blbs TMP1, $L47
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD2 c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ MUL a2, b1, t2
|
|
+ ADD4 c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ MUL a3, b1, t3
|
|
+
|
|
+ ADD2 c08, t4, b5
|
|
+ fmov b5, c08
|
|
+// unop
|
|
+ MUL a4, b1, t4
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+// unop
|
|
+ MUL a1, b2, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+// unop
|
|
+ MUL a2, b2, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c03, t3, b5
|
|
+ fmov b5, c03
|
|
+// unop
|
|
+ MUL a3, b2, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a4, b2, t4
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD4 c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L47:
|
|
+ ADD2 c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ MUL a2, b1, t2
|
|
+ ADD4 c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ MUL a3, b1, t3
|
|
+
|
|
+ ADD2 c08, t4, b5
|
|
+ fmov b5, c08
|
|
+ MUL a4, b1, t4
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b2, t1
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, b2, t2
|
|
+ ADD1 c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ MUL a3, b2, t3
|
|
+
|
|
+ ADD3 c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL a4, b2, t4
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ ADD2 c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ ADD4 c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ ADD2 c08, t4, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ ADD c01, c06, b5
|
|
+ fmov b5, c01
|
|
+ ADD c02, c05, b5
|
|
+ fmov b5, c02
|
|
+ ADD c03, c08, b5
|
|
+ fmov b5, c03
|
|
+ ADD c04, c07, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+$L48:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 2, TMP1
|
|
+#else
|
|
+ subl KK, 1, TMP1
|
|
+#endif
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -4 * SIZE(AO)
|
|
+ ldi BO, -2 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a3, c03, b5
|
|
+ fmov b5, c03
|
|
+ SUB a4, c04, b5
|
|
+ fmov b5, c04
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a3, c03, b5
|
|
+ fmov b5, c03
|
|
+ SUB a4, c04, b5
|
|
+ fmov b5, c04
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 6 * SIZE(AO)
|
|
+ LD a2, 7 * SIZE(AO)
|
|
+ LD a3, 4 * SIZE(AO)
|
|
+ LD a4, 5 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c04, t1
|
|
+ MUL a2, c03, t2
|
|
+ MUL a1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a1, c04, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ ADD5 c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ ADD6 c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ MUL a3, c03, t1
|
|
+ MUL a3, c04, t2
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a4, c04, t1
|
|
+ MUL a4, c03, t2
|
|
+
|
|
+ ADD6 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD5 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ ADD5 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD6 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+
|
|
+ ADD5 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD6 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a3, c01, t1
|
|
+ MUL a3, c02, t2
|
|
+
|
|
+ SUB c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ SUB c04, t2, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ MUL a4, c02, t1
|
|
+ MUL a4, c01, t2
|
|
+ ADD6 c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ ADD5 c04, t2, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ LD a1, 6 * SIZE(AO)
|
|
+ LD a2, 7 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c04, t1
|
|
+ MUL a2, c03, t2
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c04, c04
|
|
+
|
|
+ ADD5 c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ ADD6 c04, t2, b5
|
|
+ fmov b5, c04
|
|
+#endif
|
|
+
|
|
+#if defined(RN) || defined(RT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a2, c04, t3
|
|
+ MUL a2, c03, t4
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c04, c04
|
|
+
|
|
+ ADD5 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD6 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD5 c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ ADD6 c04, t4, b5
|
|
+ fmov b5, c04
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c02, 1 * SIZE(BO)
|
|
+ ST c03, 2 * SIZE(BO)
|
|
+ ST c04, 3 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c03, 2 * SIZE(AO)
|
|
+ ST c04, 3 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -4 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c03, 2 * SIZE(C1)
|
|
+ ST c04, 3 * SIZE(C1)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, ZBASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L41
|
|
+ .align 4
|
|
+
|
|
+$L59:
|
|
+#ifdef LN
|
|
+ sll K, ZBASE_SHIFT, TMP1
|
|
+ addl B, TMP1, B
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ mov BO, B
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L999:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+ ldl tmp, 72($sp)
+ clr $0
+ ldi $sp, STACKSIZE($sp)
+ ret
+ .ident VERSION
+ .end CNAME
diff --git a/kernel/sw_64/ztrsm_kernel_2x2_LN.S.bak b/kernel/sw_64/ztrsm_kernel_2x2_LN.S.bak
new file mode 100644
index 0000000..71202d8
--- /dev/null
+++ b/kernel/sw_64/ztrsm_kernel_2x2_LN.S.bak
@@ -0,0 +1,2230 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+
+#if !defined(SW2B)
+#error "Architecture is not specified."
+#endif
+
+#ifdef SW2B
+#define PREFETCHSIZE 56
+#define UNOP unop
+#endif
+
+
+ .set noat
+ .set noreorder
+ .arch ev6
+
+.text
+ .align 5
+ .globl CNAME
+ .ent CNAME
+
+#define STACKSIZE 80
+
+#define M $16
+#define N $17
+#define K $18
+#define A $21
+#define B $22
+#define C $20
+#define LDC $23
+
+#define C1 $19
+#define C2 $24
+
+#define AO $at
+#define BO $5
+#define I $6
+#define J $7
+#define L $8
+
+#define a1 $f16
+#define a2 $f17
+#define a3 $f18
+#define a4 $f19
+
+#define b1 $f20
+#define b2 $f21
+#define b3 $f22
+#define b4 $f23
+
+#define t1 $f24
+#define t2 $f25
+#define t3 $f26
+#define t4 $f27
+
+#define a5 $f28
+#define a6 $f30
+#define b5 $f29
+
+#define alpha_i $f29
+#define alpha_r $f30
+
+#define c01 $f0
+#define c02 $f1
+#define c03 $f2
+#define c04 $f3
+
+#define c05 $f4
+#define c06 $f5
+#define c07 $f6
+#define c08 $f7
+
+#define c09 $f8
+#define c10 $f9
+#define c11 $f10
+#define c12 $f11
+
+#define c13 $f12
+#define c14 $f13
+#define c15 $f14
+#define c16 $f15
+
+#define TMP1 $0
+#define TMP2 $1
+#define KK $2
+#define AORIG $3
+#define OFFSET $4
+
+#if defined(LN) || defined(LT)
+#ifndef CONJ
+#define ADD1 ADD
+#define ADD2 SUB
+#define ADD3 ADD
+#define ADD4 ADD
+#define ADD5 SUB
+#define ADD6 ADD
+#else
+#define ADD1 ADD
+#define ADD2 ADD
+#define ADD3 SUB
+#define ADD4 ADD
+#define ADD5 ADD
+#define ADD6 SUB
+#endif
+#else
+#ifndef CONJ
+#define ADD1 ADD
+#define ADD2 SUB
+#define ADD3 ADD
+#define ADD4 ADD
+#define ADD5 SUB
+#define ADD6 ADD
+#else
+#define ADD1 ADD
+#define ADD2 ADD
+#define ADD3 ADD
+#define ADD4 SUB
+#define ADD5 ADD
+#define ADD6 SUB
+#endif
+#endif
+
+
+CNAME:
|
|
+ .frame $sp, STACKSIZE, $26, 0
|
|
+
|
|
+#ifdef PROFILE
|
|
+ ldgp $gp, 0($27)
|
|
+ ldi $at, _mcount
|
|
+ jsr $at, ($at), _mcount
|
|
+#endif
|
|
+
|
|
+#ifndef PROFILE
|
|
+ .prologue 0
|
|
+#else
|
|
+ .prologue 1
|
|
+#endif
|
|
+
|
|
+ ldi $sp, -STACKSIZE($sp)
|
|
+
|
|
+ ldl B, 0 + STACKSIZE($sp)
|
|
+ ldl C, 8 + STACKSIZE($sp)
|
|
+ ldl LDC, 16 + STACKSIZE($sp)
|
|
+ ldl OFFSET, 24 + STACKSIZE($sp)
|
|
+
|
|
+ sll LDC, ZBASE_SHIFT, LDC
|
|
+
|
|
+ fstd $f2, 0($sp)
|
|
+ fstd $f3, 8($sp)
|
|
+ fstd $f4, 16($sp)
|
|
+ fstd $f5, 24($sp)
|
|
+ fstd $f6, 32($sp)
|
|
+ fstd $f7, 40($sp)
|
|
+ fstd $f8, 48($sp)
|
|
+ fstd $f9, 56($sp)
|
|
+
|
|
+ cmple M, 0, $0
|
|
+ cmple N, 0, $1
|
|
+ cmple K, 0, $2
|
|
+
|
|
+ or $0, $1, $0
|
|
+ or $0, $2, $0
|
|
+ bne $0, $L999
|
|
+
|
|
+#ifdef LN
|
|
+ addl M, M, TMP2
|
|
+ mull TMP2, K, TMP1
|
|
+ SXADDQ TMP1, A, A
|
|
+ SXADDQ TMP2, C, C
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ negl OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ mull N, K, TMP1
|
|
+ addl TMP1, TMP1, TMP1
|
|
+ SXADDQ TMP1, B, B
|
|
+
|
|
+ mull N, LDC, TMP1
|
|
+ addl TMP1, C, C
|
|
+
|
|
+ subl N, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+ sra N, 1, J
|
|
+ ble J, $L30
|
|
+ .align 4
|
|
+
|
|
+$L01:
|
|
+#ifdef RT
|
|
+ sll K, ZBASE_SHIFT + 1, TMP1
|
|
+ subl B, TMP1, B
|
|
+
|
|
+ subl C, LDC, C2
|
|
+ subl C2, LDC, C1
|
|
+ subl C2, LDC, C
|
|
+#else
|
|
+ mov C, C1
|
|
+ addl C, LDC, C2
|
|
+ addl C2, LDC, C
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ addl M, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+ mov A, AORIG
|
|
+#else
|
|
+ mov A, AO
|
|
+#endif
|
|
+
|
|
+ and M, 1, I
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+ fclr c01
|
|
+ fclr c05
|
|
+ ble I, $L20
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c10
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c14
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+
|
|
+ ldi L, -2(KK)
|
|
+
|
|
+ ble KK, $L28
|
|
+ ble L, $L25
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, ZBASE_SHIFT + 0, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, ZBASE_SHIFT + 0, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, ZBASE_SHIFT + 1, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c10
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c14
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+
|
|
+ ble TMP1, $L28
|
|
+ ble L, $L25
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L22:
|
|
+ ADD1 c09, t1, c09
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c10, t2, c10
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD4 c13, t3, c13
|
|
+ unop
|
|
+ MUL a1, b2, t3
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+
|
|
+ ADD2 c14, t4, c14
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD b2, -7 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ LD b3, -6 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, c05
|
|
+ unop
|
|
+ MUL a1, b4, t3
|
|
+ LD a1, 2 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, c06
|
|
+ MUL a2, b4, t4
|
|
+ LD b5, -5 * SIZE(BO)
|
|
+
|
|
+ ADD1 c09, t1, c09
|
|
+ unop
|
|
+ MUL a3, b1, t1
|
|
+ LD a2, 3 * SIZE(AO)
|
|
+
|
|
+ ADD3 c10, t2, c10
|
|
+ unop
|
|
+ MUL a4, b1, t2
|
|
+ LD b1, -4 * SIZE(BO)
|
|
+
|
|
+ ADD4 c13, t3, c13
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD2 c14, t4, c14
|
|
+ MUL a4, b2, t4
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ ldi L, -2(L)
|
|
+ MUL a3, b3, t1
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL a4, b3, t2
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, c05
|
|
+ unop
|
|
+ MUL a3, b5, t3
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, c06
|
|
+ MUL a4, b5, t4
|
|
+ LD a4, 1 * SIZE(AO)
|
|
+ bgt L, $L22
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ ADD1 c09, t1, c09
|
|
+ MUL a1, b1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L27
|
|
+#else
|
|
+ blbs TMP1, $L27
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD3 c10, t2, c10
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD4 c13, t3, c13
|
|
+ unop
|
|
+ MUL a1, b2, t3
|
|
+ unop
|
|
+
|
|
+ ADD2 c14, t4, c14
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, c05
|
|
+ unop
|
|
+ MUL a1, b4, t3
|
|
+ LD a1, -2 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, c06
|
|
+ unop
|
|
+ MUL a2, b4, t4
|
|
+ LD a2, -1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c09, t1, c09
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L27:
|
|
+ ADD3 c10, t2, c10
|
|
+ MUL a2, b1, t2
|
|
+ ADD4 c13, t3, c13
|
|
+ MUL a1, b2, t3
|
|
+
|
|
+ ADD2 c14, t4, c14
|
|
+ MUL a2, b2, t4
|
|
+ ADD1 c01, t1, c01
|
|
+ MUL a1, b3, t1
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ MUL a2, b3, t2
|
|
+ ADD4 c05, t3, c05
|
|
+ MUL a1, b4, t3
|
|
+
|
|
+ ADD2 c06, t4, c06
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a2, b4, t4
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD1 c09, t1, c09
|
|
+ ADD3 c10, t2, c10
|
|
+ ADD4 c13, t3, c13
|
|
+ ADD2 c14, t4, c14
|
|
+
|
|
+ ADD c01, c06, c01
|
|
+ ADD c02, c05, c02
|
|
+ ADD c09, c14, c09
|
|
+ ADD c10, c13, c10
|
|
+ .align 4
|
|
+
|
|
+$L28:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 1, TMP1
|
|
+#else
|
|
+ subl KK, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, ZBASE_SHIFT + 0, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -2 * SIZE(AO)
|
|
+ ldi BO, -4 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+ SUB a3, c09, c09
|
|
+ SUB a4, c10, c10
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+ SUB a3, c09, c09
|
|
+ SUB a4, c10, c10
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a2, c10, t3
|
|
+ MUL a2, c09, t4
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c10, c10
|
|
+
|
|
+ ADD5 c01, t1, c01
|
|
+ ADD6 c02, t2, c02
|
|
+ ADD5 c09, t3, c09
|
|
+ ADD6 c10, t4, c10
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+
|
|
+ ADD5 c01, t1, c01
|
|
+ ADD6 c02, t2, c02
|
|
+
|
|
+ MUL a3, c01, t1
|
|
+ MUL a3, c02, t2
|
|
+ SUB c09, t1, c09
|
|
+ SUB c10, t2, c10
|
|
+
|
|
+ MUL a4, c02, t1
|
|
+ MUL a4, c01, t2
|
|
+ ADD6 c09, t1, c09
|
|
+ ADD5 c10, t2, c10
|
|
+
|
|
+ LD a1, 6 * SIZE(BO)
|
|
+ LD a2, 7 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c10, t1
|
|
+ MUL a2, c09, t2
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c10, c10
|
|
+
|
|
+ ADD5 c09, t1, c09
|
|
+ ADD6 c10, t2, c10
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 6 * SIZE(BO)
|
|
+ LD a2, 7 * SIZE(BO)
|
|
+ LD a3, 4 * SIZE(BO)
|
|
+ LD a4, 5 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c10, t1
|
|
+ MUL a2, c09, t2
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c10, c10
|
|
+
|
|
+ ADD5 c09, t1, c09
|
|
+ ADD6 c10, t2, c10
|
|
+
|
|
+ MUL a3, c09, t1
|
|
+ MUL a3, c10, t2
|
|
+ SUB c01, t1, c01
|
|
+ SUB c02, t2, c02
|
|
+
|
|
+ MUL a4, c10, t1
|
|
+ MUL a4, c09, t2
|
|
+ ADD6 c01, t1, c01
|
|
+ ADD5 c02, t2, c02
|
|
+
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+
|
|
+ ADD5 c01, t1, c01
|
|
+ ADD6 c02, t2, c02
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c02, 1 * SIZE(BO)
|
|
+ ST c09, 2 * SIZE(BO)
|
|
+ ST c10, 3 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c09, 2 * SIZE(AO)
|
|
+ ST c10, 3 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -2 * SIZE(C1)
|
|
+ ldi C2, -2 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c09, 0 * SIZE(C2)
|
|
+ ST c10, 1 * SIZE(C2)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 2 * SIZE(C1)
|
|
+ ldi C2, 2 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, ZBASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, ZBASE_SHIFT + 0, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ sra M, 1, I
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+ fclr c01
|
|
+ fclr c05
|
|
+
|
|
+ ble I, $L29
|
|
+ .align 4
|
|
+
|
|
+$L11:
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c10
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c14
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c07
|
|
+
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+ fclr c11
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c15
|
|
+
|
|
+ fillcs 4 * SIZE(C1)
|
|
+ fclr c04
|
|
+ ldi L, -2(KK)
|
|
+ fclr c08
|
|
+
|
|
+ fillcs 4 * SIZE(C2)
|
|
+ fclr c12
|
|
+ fclr c16
|
|
+ ble KK, $L18
|
|
+ ble L, $L15
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, ZBASE_SHIFT + 1, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, ZBASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c10
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c14
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c07
|
|
+
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ fclr c11
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c15
|
|
+
|
|
+ fillcs 4 * SIZE(C1)
|
|
+ fclr c04
|
|
+ ldi L, -2(TMP1)
|
|
+ fclr c08
|
|
+
|
|
+ fillcs 4 * SIZE(C2)
|
|
+ fclr c12
|
|
+ fclr c16
|
|
+ ble TMP1, $L18
|
|
+ ble L, $L15
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L12:
|
|
+/* 1 */
|
|
+ ADD1 c11, t1, c11
|
|
+#ifndef EV4
|
|
+ fillcs PREFETCHSIZE * SIZE(AO)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+ MUL b1, a1, t1
|
|
+#ifndef EV4
|
|
+ fillcs PREFETCHSIZE * SIZE(BO)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD3 c12, t2, c12
|
|
+ unop
|
|
+ MUL b1, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c16, t3, c16
|
|
+ unop
|
|
+ MUL b2, a2, t3
|
|
+ LD a5, 0 * SIZE(AO)
|
|
+
|
|
+ ADD4 c15, t4, c15
|
|
+ unop
|
|
+ MUL b2, a1, t4
|
|
+ LD b5, 0 * SIZE(BO)
|
|
+
|
|
+/* 2 */
|
|
+ ADD1 c01, t1, c01
|
|
+ UNOP
|
|
+ MUL b1, a3, t1
|
|
+ UNOP
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ UNOP
|
|
+ MUL b1, a4, t2
|
|
+ UNOP
|
|
+
|
|
+ ADD2 c06, t3, c06
|
|
+ unop
|
|
+ MUL b2, a4, t3
|
|
+ unop
|
|
+
|
|
+ ADD4 c05, t4, c05
|
|
+ unop
|
|
+ MUL b4, a1, t4
|
|
+ unop
|
|
+
|
|
+/* 3 */
|
|
+ ADD1 c03, t1, c03
|
|
+ unop
|
|
+ MUL b3, a1, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c04, t2, c04
|
|
+ unop
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c08, t3, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD4 c13, t4, c13
|
|
+ unop
|
|
+ MUL b2, a3, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+/* 4 */
|
|
+ ADD1 c09, t1, c09
|
|
+ unop
|
|
+ MUL b3, a3, t1
|
|
+ LD a6, 2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c10, t2, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD2 c14, t3, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ ADD4 c07, t4, c07
|
|
+ unop
|
|
+ MUL b4, a3, t4
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+
|
|
+/* 5 */
|
|
+ ADD1 c11, t1, c11
|
|
+ unop
|
|
+ MUL b5, a5, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD3 c12, t2, c12
|
|
+ ldi L, -2(L)
|
|
+ MUL b5, a2, t2
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD2 c16, t3, c16
|
|
+ unop
|
|
+ MUL b2, a2, t3
|
|
+ unop
|
|
+
|
|
+ ADD4 c15, t4, c15
|
|
+ unop
|
|
+ MUL b2, a5, t4
|
|
+ unop
|
|
+
|
|
+/* 6 */
|
|
+ ADD1 c01, t1, c01
|
|
+ unop
|
|
+ MUL b5, a6, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL b5, a4, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c06, t3, c06
|
|
+ unop
|
|
+ MUL b2, a4, t3
|
|
+ unop
|
|
+
|
|
+ ADD4 c05, t4, c05
|
|
+ unop
|
|
+ MUL b4, a5, t4
|
|
+ unop
|
|
+
|
|
+/* 7 */
|
|
+ ADD1 c03, t1, c03
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+ MUL b3, a5, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c04, t2, c04
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c08, t3, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, -3 * SIZE(AO)
|
|
+
|
|
+ ADD4 c13, t4, c13
|
|
+ unop
|
|
+ MUL b2, a6, t4
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+/* 8 */
|
|
+ ADD1 c09, t1, c09
|
|
+ unop
|
|
+ MUL b3, a6, t1
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c10, t2, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD2 c14, t3, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+
|
|
+ ADD4 c07, t4, c07
|
|
+ MUL b4, a6, t4
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+ bgt L, $L12
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ ADD1 c11, t1, c11
|
|
+ unop
|
|
+ MUL b1, a1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L17
|
|
+#else
|
|
+ blbs TMP1, $L17
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD3 c12, t2, c12
|
|
+ MUL b1, a2, t2
|
|
+ ADD2 c16, t3, c16
|
|
+ MUL b2, a2, t3
|
|
+
|
|
+ ADD4 c15, t4, c15
|
|
+ MUL b2, a1, t4
|
|
+ ADD1 c01, t1, c01
|
|
+ MUL b1, a3, t1
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL b1, a4, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD2 c06, t3, c06
|
|
+ MUL b2, a4, t3
|
|
+ ADD4 c05, t4, c05
|
|
+ MUL b4, a1, t4
|
|
+
|
|
+ ADD1 c03, t1, c03
|
|
+ unop
|
|
+ MUL b3, a1, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD3 c04, t2, c04
|
|
+ unop
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c08, t3, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD4 c13, t4, c13
|
|
+ unop
|
|
+ MUL b2, a3, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD1 c09, t1, c09
|
|
+ unop
|
|
+ MUL b3, a3, t1
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD3 c10, t2, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD2 c14, t3, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+
|
|
+ ADD4 c07, t4, c07
|
|
+ unop
|
|
+ MUL b4, a3, t4
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD1 c11, t1, c11
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL b1, a1, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L17:
|
|
+ ADD3 c12, t2, c12
|
|
+ MUL b1, a2, t2
|
|
+ ADD2 c16, t3, c16
|
|
+ MUL b2, a2, t3
|
|
+
|
|
+ ADD4 c15, t4, c15
|
|
+ MUL b2, a1, t4
|
|
+ ADD1 c01, t1, c01
|
|
+ MUL b1, a3, t1
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ MUL b1, a4, t2
|
|
+ ADD2 c06, t3, c06
|
|
+ MUL b2, a4, t3
|
|
+
|
|
+ ADD4 c05, t4, c05
|
|
+ MUL b4, a1, t4
|
|
+ ADD1 c03, t1, c03
|
|
+ MUL b3, a1, t1
|
|
+
|
|
+ ADD3 c04, t2, c04
|
|
+ MUL b3, a2, t2
|
|
+ ADD2 c08, t3, c08
|
|
+ MUL b4, a2, t3
|
|
+
|
|
+ ADD4 c13, t4, c13
|
|
+ MUL b2, a3, t4
|
|
+ ADD1 c09, t1, c09
|
|
+ MUL b3, a3, t1
|
|
+
|
|
+ ADD3 c10, t2, c10
|
|
+ MUL b3, a4, t2
|
|
+ ADD2 c14, t3, c14
|
|
+ MUL b4, a4, t3
|
|
+
|
|
+ ADD4 c07, t4, c07
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL b4, a3, t4
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD1 c11, t1, c11
|
|
+ ADD3 c12, t2, c12
|
|
+ ADD2 c16, t3, c16
|
|
+ ADD4 c15, t4, c15
|
|
+
|
|
+ ADD c01, c06, c01
|
|
+ ADD c02, c05, c02
|
|
+ ADD c03, c08, c03
|
|
+ ADD c04, c07, c04
|
|
+
|
|
+ ADD c09, c14, c09
|
|
+ ADD c10, c13, c10
|
|
+ ADD c11, c16, c11
|
|
+ ADD c12, c15, c12
|
|
+ .align 4
|
|
+
|
|
+$L18:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 2, TMP1
|
|
+#else
|
|
+ subl KK, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -4 * SIZE(AO)
|
|
+ ldi BO, -4 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+ LD b4, 7 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+ SUB a3, c09, c09
|
|
+ SUB a4, c10, c10
|
|
+
|
|
+ SUB b1, c03, c03
|
|
+ SUB b2, c04, c04
|
|
+ SUB b3, c11, c11
|
|
+ SUB b4, c12, c12
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ LD b1, 4 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(AO)
|
|
+ LD b4, 7 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+ SUB a3, c03, c03
|
|
+ SUB a4, c04, c04
|
|
+
|
|
+ SUB b1, c09, c09
|
|
+ SUB b2, c10, c10
|
|
+ SUB b3, c11, c11
|
|
+ SUB b4, c12, c12
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 6 * SIZE(AO)
|
|
+ LD a2, 7 * SIZE(AO)
|
|
+ LD a3, 4 * SIZE(AO)
|
|
+ LD a4, 5 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c04, t1
|
|
+ MUL a2, c03, t2
|
|
+ MUL a2, c12, t3
|
|
+ MUL a2, c11, t4
|
|
+
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c04, c04
|
|
+ MUL a1, c11, c11
|
|
+ MUL a1, c12, c12
|
|
+
|
|
+ ADD5 c03, t1, c03
|
|
+ ADD6 c04, t2, c04
|
|
+ ADD5 c11, t3, c11
|
|
+ ADD6 c12, t4, c12
|
|
+
|
|
+ MUL a3, c03, t1
|
|
+ MUL a3, c04, t2
|
|
+ MUL a3, c11, t3
|
|
+ MUL a3, c12, t4
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c02, t2, c02
|
|
+ SUB c09, t3, c09
|
|
+ SUB c10, t4, c10
|
|
+
|
|
+ MUL a4, c04, t1
|
|
+ MUL a4, c03, t2
|
|
+ MUL a4, c12, t3
|
|
+ MUL a4, c11, t4
|
|
+
|
|
+ ADD6 c01, t1, c01
|
|
+ ADD5 c02, t2, c02
|
|
+ ADD6 c09, t3, c09
|
|
+ ADD5 c10, t4, c10
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a2, c10, t3
|
|
+ MUL a2, c09, t4
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c10, c10
|
|
+
|
|
+ ADD5 c01, t1, c01
|
|
+ ADD6 c02, t2, c02
|
|
+ ADD5 c09, t3, c09
|
|
+ ADD6 c10, t4, c10
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a2, c10, t3
|
|
+ MUL a2, c09, t4
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c10, c10
|
|
+
|
|
+ ADD5 c01, t1, c01
|
|
+ ADD6 c02, t2, c02
|
|
+ ADD5 c09, t3, c09
|
|
+ ADD6 c10, t4, c10
|
|
+
|
|
+ MUL a3, c01, t1
|
|
+ MUL a3, c02, t2
|
|
+ MUL a3, c09, t3
|
|
+ MUL a3, c10, t4
|
|
+
|
|
+ SUB c03, t1, c03
|
|
+ SUB c04, t2, c04
|
|
+ SUB c11, t3, c11
|
|
+ SUB c12, t4, c12
|
|
+
|
|
+ MUL a4, c02, t1
|
|
+ MUL a4, c01, t2
|
|
+ MUL a4, c10, t3
|
|
+ MUL a4, c09, t4
|
|
+
|
|
+ ADD6 c03, t1, c03
|
|
+ ADD5 c04, t2, c04
|
|
+ ADD6 c11, t3, c11
|
|
+ ADD5 c12, t4, c12
|
|
+
|
|
+ LD a1, 6 * SIZE(AO)
|
|
+ LD a2, 7 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c04, t1
|
|
+ MUL a2, c03, t2
|
|
+ MUL a2, c12, t3
|
|
+ MUL a2, c11, t4
|
|
+
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c04, c04
|
|
+ MUL a1, c11, c11
|
|
+ MUL a1, c12, c12
|
|
+
|
|
+ ADD5 c03, t1, c03
|
|
+ ADD6 c04, t2, c04
|
|
+ ADD5 c11, t3, c11
|
|
+ ADD6 c12, t4, c12
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a2, c04, t3
|
|
+ MUL a2, c03, t4
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c04, c04
|
|
+
|
|
+ ADD5 c01, t1, c01
|
|
+ ADD6 c02, t2, c02
|
|
+ ADD5 c03, t3, c03
|
|
+ ADD6 c04, t4, c04
|
|
+
|
|
+ MUL a3, c01, t1
|
|
+ MUL a3, c02, t2
|
|
+ MUL a3, c03, t3
|
|
+ MUL a3, c04, t4
|
|
+
|
|
+ SUB c09, t1, c09
|
|
+ SUB c10, t2, c10
|
|
+ SUB c11, t3, c11
|
|
+ SUB c12, t4, c12
|
|
+
|
|
+ MUL a4, c02, t1
|
|
+ MUL a4, c01, t2
|
|
+ MUL a4, c04, t3
|
|
+ MUL a4, c03, t4
|
|
+
|
|
+ ADD6 c09, t1, c09
|
|
+ ADD5 c10, t2, c10
|
|
+ ADD6 c11, t3, c11
|
|
+ ADD5 c12, t4, c12
|
|
+
|
|
+ LD a1, 6 * SIZE(BO)
|
|
+ LD a2, 7 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c10, t1
|
|
+ MUL a2, c09, t2
|
|
+ MUL a2, c12, t3
|
|
+ MUL a2, c11, t4
|
|
+
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c10, c10
|
|
+ MUL a1, c11, c11
|
|
+ MUL a1, c12, c12
|
|
+
|
|
+ ADD5 c09, t1, c09
|
|
+ ADD6 c10, t2, c10
|
|
+ ADD5 c11, t3, c11
|
|
+ ADD6 c12, t4, c12
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 6 * SIZE(BO)
|
|
+ LD a2, 7 * SIZE(BO)
|
|
+ LD a3, 4 * SIZE(BO)
|
|
+ LD a4, 5 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c10, t1
|
|
+ MUL a2, c09, t2
|
|
+ MUL a2, c12, t3
|
|
+ MUL a2, c11, t4
|
|
+
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c10, c10
|
|
+ MUL a1, c11, c11
|
|
+ MUL a1, c12, c12
|
|
+
|
|
+ ADD5 c09, t1, c09
|
|
+ ADD6 c10, t2, c10
|
|
+ ADD5 c11, t3, c11
|
|
+ ADD6 c12, t4, c12
|
|
+
|
|
+ MUL a3, c09, t1
|
|
+ MUL a3, c10, t2
|
|
+ MUL a3, c11, t3
|
|
+ MUL a3, c12, t4
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c02, t2, c02
|
|
+ SUB c03, t3, c03
|
|
+ SUB c04, t4, c04
|
|
+
|
|
+ MUL a4, c10, t1
|
|
+ MUL a4, c09, t2
|
|
+ MUL a4, c12, t3
|
|
+ MUL a4, c11, t4
|
|
+
|
|
+ ADD6 c01, t1, c01
|
|
+ ADD5 c02, t2, c02
|
|
+ ADD6 c03, t3, c03
|
|
+ ADD5 c04, t4, c04
|
|
+
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a2, c04, t3
|
|
+ MUL a2, c03, t4
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c04, c04
|
|
+
|
|
+ ADD5 c01, t1, c01
|
|
+ ADD6 c02, t2, c02
|
|
+ ADD5 c03, t3, c03
|
|
+ ADD6 c04, t4, c04
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c02, 1 * SIZE(BO)
|
|
+ ST c09, 2 * SIZE(BO)
|
|
+ ST c10, 3 * SIZE(BO)
|
|
+
|
|
+ ST c03, 4 * SIZE(BO)
|
|
+ ST c04, 5 * SIZE(BO)
|
|
+ ST c11, 6 * SIZE(BO)
|
|
+ ST c12, 7 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c03, 2 * SIZE(AO)
|
|
+ ST c04, 3 * SIZE(AO)
|
|
+
|
|
+ ST c09, 4 * SIZE(AO)
|
|
+ ST c10, 5 * SIZE(AO)
|
|
+ ST c11, 6 * SIZE(AO)
|
|
+ ST c12, 7 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -4 * SIZE(C1)
|
|
+ ldi C2, -4 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c03, 2 * SIZE(C1)
|
|
+ ST c04, 3 * SIZE(C1)
|
|
+
|
|
+ ST c09, 0 * SIZE(C2)
|
|
+ ST c10, 1 * SIZE(C2)
|
|
+ ST c11, 2 * SIZE(C2)
|
|
+ ST c12, 3 * SIZE(C2)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+ ldi C2, 4 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, ZBASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ addl BO, TMP1, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+ fclr c01
|
|
+ fclr c05
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L11
|
|
+ .align 4
|
|
+
|
|
+$L29:
|
|
+#ifdef LN
|
|
+ sll K, ZBASE_SHIFT + 1, TMP1
|
|
+ addl B, TMP1, B
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ mov BO, B
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+ ldi J, -1(J)
|
|
+ bgt J, $L01
|
|
+ .align 4
|
|
+
|
|
+$L30:
|
|
+ and N, 1, J
|
|
+ ble J, $L999
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, ZBASE_SHIFT, TMP1
|
|
+ subl B, TMP1, B
|
|
+
|
|
+ subl C, LDC, C1
|
|
+ subl C, LDC, C
|
|
+#else
|
|
+ mov C, C1
|
|
+ addl C, LDC, C
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ addl M, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+ mov A, AORIG
|
|
+#else
|
|
+ mov A, AO
|
|
+#endif
|
|
+
|
|
+ and M, 1, I
|
|
+ ble I, $L50
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c05
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c06
|
|
+
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+
|
|
+ ldi L, -2(KK)
|
|
+
|
|
+ ble KK, $L58
|
|
+ ble L, $L55
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, ZBASE_SHIFT, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, ZBASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, ZBASE_SHIFT, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c06
|
|
+
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+
|
|
+ ble TMP1, $L58
|
|
+ ble L, $L55
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L52:
|
|
+ ADD1 c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, c05
|
|
+ ldi L, -2(L)
|
|
+ MUL a1, b2, t3
|
|
+ LD a1, -2 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, c06
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD a2, -1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ LD b2, 3 * SIZE(BO)
|
|
+ MUL a3, b3, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL a4, b3, t2
|
|
+ LD b3, 0 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, c05
|
|
+ unop
|
|
+ MUL a3, b4, t3
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, c06
|
|
+ MUL a4, b4, t4
|
|
+ LD b4, 1 * SIZE(BO)
|
|
+ unop
|
|
+
|
|
+ LD a4, 1 * SIZE(AO)
|
|
+ unop
|
|
+ unop
|
|
+ bgt L, $L52
|
|
+ .align 4
|
|
+
|
|
+$L55:
|
|
+ ADD1 c01, t1, c01
|
|
+ MUL a1, b1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L57
|
|
+#else
|
|
+ blbs TMP1, $L57
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, c05
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ MUL a1, b2, t3
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, c06
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ LD b2, -1 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ .align 4
|
|
+
|
|
+$L57:
|
|
+ ADD3 c02, t2, c02
|
|
+ MUL a2, b1, t2
|
|
+ ADD4 c05, t3, c05
|
|
+ MUL a1, b2, t3
|
|
+
|
|
+ ADD2 c06, t4, c06
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a2, b2, t4
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ ADD3 c02, t2, c02
|
|
+ ADD4 c05, t3, c05
|
|
+ ADD2 c06, t4, c06
|
|
+
|
|
+ ADD c01, c06, c01
|
|
+ ADD c02, c05, c02
|
|
+
|
|
+$L58:
|
|
+#if defined(LN) || defined(RT)
|
|
+ subl KK, 1, TMP1
|
|
+
|
|
+ sll TMP1, ZBASE_SHIFT, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -2 * SIZE(AO)
|
|
+ ldi BO, -2 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+
|
|
+ ADD5 c01, t1, c01
|
|
+ ADD6 c02, t2, c02
|
|
+#endif
|
|
+
|
|
+#if defined(RN) || defined(RT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+
|
|
+ ADD5 c01, t1, c01
|
|
+ ADD6 c02, t2, c02
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c02, 1 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -2 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 2 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, ZBASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, ZBASE_SHIFT, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L50:
|
|
+ sra M, 1, I
|
|
+ ble I, $L59
|
|
+ .align 4
|
|
+
|
|
+$L41:
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c05
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c06
|
|
+
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c07
|
|
+
|
|
+ ldi L, -2(KK)
|
|
+ fclr c04
|
|
+ fclr c08
|
|
+
|
|
+ ble KK, $L48
|
|
+ ble L, $L45
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, ZBASE_SHIFT + 1, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, ZBASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, ZBASE_SHIFT, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c05
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c06
|
|
+
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c07
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+ fclr c04
|
|
+ fclr c08
|
|
+
|
|
+ ble TMP1, $L48
|
|
+ ble L, $L45
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L42:
|
|
+ ADD4 c05, t1, c05
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD2 c06, t2, c06
|
|
+ ldi L, -2(L)
|
|
+ MUL a2, b1, t2
|
|
+ unop
|
|
+
|
|
+ ADD4 c07, t3, c07
|
|
+ unop
|
|
+ MUL a3, b1, t3
|
|
+ unop
|
|
+
|
|
+ ADD2 c08, t4, c08
|
|
+ unop
|
|
+ MUL a4, b1, t4
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b2, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ MUL a2, b2, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c03, t3, c03
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c04, t4, c04
|
|
+ unop
|
|
+ MUL a4, b2, t4
|
|
+ LD a5, 3 * SIZE(AO)
|
|
+
|
|
+ ADD4 c05, t1, c05
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ LD b2, -1 * SIZE(BO)
|
|
+
|
|
+ ADD2 c06, t2, c06
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ unop
|
|
+
|
|
+ ADD4 c07, t3, c07
|
|
+ unop
|
|
+ MUL a3, b3, t3
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+
|
|
+ ADD2 c08, t4, c08
|
|
+ unop
|
|
+ MUL a5, b3, t4
|
|
+ LD b3, 0 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b4, t1
|
|
+ LD a1, -4 * SIZE(AO)
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b4, t2
|
|
+ LD a2, -3 * SIZE(AO)
|
|
+
|
|
+ ADD1 c03, t3, c03
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+ MUL a3, b4, t3
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c04, t4, c04
|
|
+ MUL a5, b4, t4
|
|
+ LD b4, 1 * SIZE(BO)
|
|
+ bgt L, $L42
|
|
+ .align 4
|
|
+
|
|
+$L45:
|
|
+ ADD4 c05, t1, c05
|
|
+ MUL b1, a1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L47
|
|
+#else
|
|
+ blbs TMP1, $L47
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD2 c06, t2, c06
|
|
+ MUL a2, b1, t2
|
|
+ ADD4 c07, t3, c07
|
|
+ MUL a3, b1, t3
|
|
+
|
|
+ ADD2 c08, t4, c08
|
|
+ unop
|
|
+ MUL a4, b1, t4
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b2, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b2, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c03, t3, c03
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c04, t4, c04
|
|
+ MUL a4, b2, t4
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD4 c05, t1, c05
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L47:
|
|
+ ADD2 c06, t2, c06
|
|
+ MUL a2, b1, t2
|
|
+ ADD4 c07, t3, c07
|
|
+ MUL a3, b1, t3
|
|
+
|
|
+ ADD2 c08, t4, c08
|
|
+ MUL a4, b1, t4
|
|
+ ADD1 c01, t1, c01
|
|
+ MUL a1, b2, t1
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ MUL a2, b2, t2
|
|
+ ADD1 c03, t3, c03
|
|
+ MUL a3, b2, t3
|
|
+
|
|
+ ADD3 c04, t4, c04
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL a4, b2, t4
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t1, c05
|
|
+ ADD2 c06, t2, c06
|
|
+ ADD4 c07, t3, c07
|
|
+ ADD2 c08, t4, c08
|
|
+
|
|
+ ADD c01, c06, c01
|
|
+ ADD c02, c05, c02
|
|
+ ADD c03, c08, c03
|
|
+ ADD c04, c07, c04
|
|
+
|
|
+$L48:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 2, TMP1
|
|
+#else
|
|
+ subl KK, 1, TMP1
|
|
+#endif
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -4 * SIZE(AO)
|
|
+ ldi BO, -2 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+ SUB a3, c03, c03
|
|
+ SUB a4, c04, c04
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+ SUB a3, c03, c03
|
|
+ SUB a4, c04, c04
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 6 * SIZE(AO)
|
|
+ LD a2, 7 * SIZE(AO)
|
|
+ LD a3, 4 * SIZE(AO)
|
|
+ LD a4, 5 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c04, t1
|
|
+ MUL a2, c03, t2
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c04, c04
|
|
+
|
|
+ ADD5 c03, t1, c03
|
|
+ ADD6 c04, t2, c04
|
|
+ MUL a3, c03, t1
|
|
+ MUL a3, c04, t2
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c02, t2, c02
|
|
+ MUL a4, c04, t1
|
|
+ MUL a4, c03, t2
|
|
+
|
|
+ ADD6 c01, t1, c01
|
|
+ ADD5 c02, t2, c02
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+
|
|
+ ADD5 c01, t1, c01
|
|
+ ADD6 c02, t2, c02
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+
|
|
+ ADD5 c01, t1, c01
|
|
+ ADD6 c02, t2, c02
|
|
+ MUL a3, c01, t1
|
|
+ MUL a3, c02, t2
|
|
+
|
|
+ SUB c03, t1, c03
|
|
+ SUB c04, t2, c04
|
|
+
|
|
+ MUL a4, c02, t1
|
|
+ MUL a4, c01, t2
|
|
+ ADD6 c03, t1, c03
|
|
+ ADD5 c04, t2, c04
|
|
+
|
|
+ LD a1, 6 * SIZE(AO)
|
|
+ LD a2, 7 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c04, t1
|
|
+ MUL a2, c03, t2
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c04, c04
|
|
+
|
|
+ ADD5 c03, t1, c03
|
|
+ ADD6 c04, t2, c04
|
|
+#endif
|
|
+
|
|
+#if defined(RN) || defined(RT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a2, c04, t3
|
|
+ MUL a2, c03, t4
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c04, c04
|
|
+
|
|
+ ADD5 c01, t1, c01
|
|
+ ADD6 c02, t2, c02
|
|
+ ADD5 c03, t3, c03
|
|
+ ADD6 c04, t4, c04
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c02, 1 * SIZE(BO)
|
|
+ ST c03, 2 * SIZE(BO)
|
|
+ ST c04, 3 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c03, 2 * SIZE(AO)
|
|
+ ST c04, 3 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -4 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c03, 2 * SIZE(C1)
|
|
+ ST c04, 3 * SIZE(C1)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, ZBASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L41
|
|
+ .align 4
|
|
+
|
|
+$L59:
|
|
+#ifdef LN
|
|
+ sll K, ZBASE_SHIFT, TMP1
|
|
+ addl B, TMP1, B
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ mov BO, B
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 8($sp)
|
|
+ fldd $f4, 16($sp)
|
|
+ fldd $f5, 24($sp)
|
|
+ fldd $f6, 32($sp)
|
|
+ fldd $f7, 40($sp)
|
|
+ fldd $f8, 48($sp)
|
|
+ fldd $f9, 56($sp)
|
|
+ clr $0
|
|
+ ldi $sp, STACKSIZE($sp)
|
|
+ ret
|
|
+ .ident VERSION
|
|
+ .end CNAME
|
|
diff --git a/kernel/sw_64/ztrsm_kernel_2x2_LT.S b/kernel/sw_64/ztrsm_kernel_2x2_LT.S
|
|
new file mode 100644
|
|
index 0000000..bb38b56
|
|
--- /dev/null
|
|
+++ b/kernel/sw_64/ztrsm_kernel_2x2_LT.S
|
|
@@ -0,0 +1,2624 @@
|
|
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
|
|
+#include "common.h"
|
|
+#include "version.h"
|
|
+
|
|
+#if !defined(EV4) && !defined(EV5) && !defined(SW6)
|
|
+#error "Architecture is not specified."
|
|
+#endif
|
|
+
|
|
+#ifdef SW6
|
|
+#define PREFETCHSIZE 56
|
|
+#define UNOP unop
|
|
+#endif
|
|
+
|
|
+#ifdef EV5
|
|
+#define PREFETCHSIZE 48
|
|
+#define UNOP
|
|
+#endif
|
|
+
|
|
+#ifdef EV4
|
|
+#define UNOP
|
|
+#endif
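
PREFETCHSIZE sets the software-prefetch distance, in FLOAT elements, used inside the main loops by the fillcs instructions (apparently the SW6 cache-prefetch hint), for example fillcs PREFETCHSIZE * SIZE(AO) on the packed A panel and the matching line for BO, plus the fillcs 4 * SIZE(C1) touches on the C tile. A rough C analogue of the same idea, with __builtin_prefetch standing in for fillcs and prefetch_panels being an illustrative name:

    #define PREFETCHSIZE 56   /* the distance chosen above for SW6, in elements */

    /* Touch the packed A and B panels a fixed distance ahead of the cursor. */
    static inline void prefetch_panels(const double *ao, const double *bo) {
        __builtin_prefetch(ao + PREFETCHSIZE, 0, 3);
        __builtin_prefetch(bo + PREFETCHSIZE, 0, 3);
    }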
|
|
+
|
|
+ .set noat
|
|
+ .set noreorder
|
|
+ .arch sw6a
|
|
+
|
|
+.text
|
|
+ .align 5
|
|
+ .globl CNAME
|
|
+ .ent CNAME
|
|
+
|
|
+#define STACKSIZE 88
|
|
+
|
|
+#define M $16
|
|
+#define N $17
|
|
+#define K $18
|
|
+#define A $21
|
|
+#define B $22
|
|
+#define C $20
|
|
+#define LDC $23
|
|
+
|
|
+#define C1 $19
|
|
+#define C2 $24
|
|
+
|
|
+#define AO $at
|
|
+#define BO $5
|
|
+#define I $6
|
|
+#define J $7
|
|
+#define L $8
|
|
+
|
|
+#define tmp $9
|
|
+
|
|
+#define a1 $f16
|
|
+#define a2 $f17
|
|
+#define a3 $f18
|
|
+#define a4 $f19
|
|
+
|
|
+#define b1 $f20
|
|
+#define b2 $f21
|
|
+#define b3 $f22
|
|
+#define b4 $f23
|
|
+
|
|
+#define t1 $f24
|
|
+#define t2 $f25
|
|
+#define t3 $f26
|
|
+#define t4 $f27
|
|
+
|
|
+#define a5 $f28
|
|
+#define a6 $f30
|
|
+#define b5 $f29
|
|
+
|
|
+#define alpha_i $f29
|
|
+#define alpha_r $f30
|
|
+
|
|
+#define c01 $f0
|
|
+#define c02 $f1
|
|
+#define c03 $f2
|
|
+#define c04 $f3
|
|
+
|
|
+#define c05 $f4
|
|
+#define c06 $f5
|
|
+#define c07 $f6
|
|
+#define c08 $f7
|
|
+
|
|
+#define c09 $f8
|
|
+#define c10 $f9
|
|
+#define c11 $f10
|
|
+#define c12 $f11
|
|
+
|
|
+#define c13 $f12
|
|
+#define c14 $f13
|
|
+#define c15 $f14
|
|
+#define c16 $f15
|
|
+
|
|
+#define TMP1 $0
|
|
+#define TMP2 $1
|
|
+#define KK $2
|
|
+#define AORIG $3
|
|
+#define OFFSET $4
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+#ifndef CONJ
|
|
+#define ADD1 ADD
|
|
+#define ADD2 SUB
|
|
+#define ADD3 ADD
|
|
+#define ADD4 ADD
|
|
+#define ADD5 SUB
|
|
+#define ADD6 ADD
|
|
+#else
|
|
+#define ADD1 ADD
|
|
+#define ADD2 ADD
|
|
+#define ADD3 SUB
|
|
+#define ADD4 ADD
|
|
+#define ADD5 ADD
|
|
+#define ADD6 SUB
|
|
+#endif
|
|
+#else
|
|
+#ifndef CONJ
|
|
+#define ADD1 ADD
|
|
+#define ADD2 SUB
|
|
+#define ADD3 ADD
|
|
+#define ADD4 ADD
|
|
+#define ADD5 SUB
|
|
+#define ADD6 ADD
|
|
+#else
|
|
+#define ADD1 ADD
|
|
+#define ADD2 ADD
|
|
+#define ADD3 ADD
|
|
+#define ADD4 SUB
|
|
+#define ADD5 ADD
|
|
+#define ADD6 SUB
|
|
+#endif
|
|
+#endif
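
The ADD1..ADD6 aliases above encode the sign pattern of the complex multiply-accumulate for the four variants (LN/LT versus RN/RT, with and without CONJ). In the inner loops each complex product is split across two accumulator pairs, for example c01/c06 and c02/c05 in the narrow tail at $L52, presumably to keep the floating-point dependency chains short, and the pairs are merged only once after the K loop (the ADD c01, c06, c01 lines). ADD5/ADD6 reappear in the solve phase, where a block is multiplied by the pre-inverted diagonal entry. A C sketch of the non-CONJ accumulation; zdot_split and all variable names are illustrative, not part of the kernel:

    /* Accumulate one complex dot product the way the kernel splits it:
     * two partial sums per component, merged after the loop.            */
    static void zdot_split(const double *a, const double *b, long k,
                           double *cr, double *ci) {
        double rr = 0.0, ii = 0.0, ir = 0.0, ri = 0.0;
        for (long p = 0; p < k; p++) {
            double ar = a[2*p], ai = a[2*p + 1];
            double br = b[2*p], bi = b[2*p + 1];
            rr += ar * br;      /* ADD1 */
            ii -= ai * bi;      /* ADD2: becomes ADD when CONJ is defined   */
            ir += ai * br;      /* ADD3: sign flips for LN/LT with CONJ     */
            ri += ar * bi;      /* ADD4: sign flips for RN/RT with CONJ     */
        }
        *cr = rr + ii;          /* the trailing ADD c01, c06, c01           */
        *ci = ir + ri;          /* the trailing ADD c02, c05, c02           */
    }

As far as the sign table suggests, the CONJ flips amount to conjugating the A operand in the LN/LT kernels and the B operand in the RN/RT kernels.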
|
|
+
|
|
+
|
|
+CNAME:
|
|
+ .frame $sp, STACKSIZE, $26, 0
|
|
+
|
|
+#ifdef PROFILE
|
|
+ ldgp $gp, 0($27)
|
|
+ ldi $at, _mcount
|
|
+ jsr $at, ($at), _mcount
|
|
+#endif
|
|
+
|
|
+#ifndef PROFILE
|
|
+ .prologue 0
|
|
+#else
|
|
+ .prologue 1
|
|
+#endif
|
|
+
|
|
+ ldi $sp, -STACKSIZE($sp)
|
|
+
|
|
+ ldl B, 0 + STACKSIZE($sp)
|
|
+ ldl C, 8 + STACKSIZE($sp)
|
|
+ ldl LDC, 16 + STACKSIZE($sp)
|
|
+ ldl OFFSET, 24 + STACKSIZE($sp)
|
|
+
|
|
+ sll LDC, ZBASE_SHIFT, LDC
|
|
+
|
|
+ fstd $f2, 0($sp)
|
|
+ fstd $f3, 8($sp)
|
|
+ fstd $f4, 16($sp)
|
|
+ fstd $f5, 24($sp)
|
|
+ fstd $f6, 32($sp)
|
|
+ fstd $f7, 40($sp)
|
|
+ fstd $f8, 48($sp)
|
|
+ fstd $f9, 56($sp)
|
|
+ stl tmp, 72($sp)
|
|
+
|
|
+ cmple M, 0, $0
|
|
+ cmple N, 0, $1
|
|
+ cmple K, 0, $2
|
|
+
|
|
+ or $0, $1, $0
|
|
+ or $0, $2, $0
|
|
+ bne $0, $L999
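
The three cmple lines OR-ed into $0 are the usual early exit: if any of M, N or K is non-positive there is nothing to solve and control jumps to the register-restore epilogue at $L999, which clears the return value. In C terms, with early_exit being an illustrative name only:

    /* True when the kernel should fall straight through to the epilogue. */
    static int early_exit(long m, long n, long k) {
        return (m <= 0) || (n <= 0) || (k <= 0);
    }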
|
|
+
|
|
+#ifdef LN
|
|
+ addl M, M, TMP2
|
|
+ mull TMP2, K, TMP1
|
|
+ SXADDQ TMP1, A, A
|
|
+ SXADDQ TMP2, C, C
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ negl OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ mull N, K, TMP1
|
|
+ addl TMP1, TMP1, TMP1
|
|
+ SXADDQ TMP1, B, B
|
|
+
|
|
+ mull N, LDC, TMP1
|
|
+ addl TMP1, C, C
|
|
+
|
|
+ subl N, OFFSET, KK
|
|
+#endif
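
The three prologue fragments above set the traversal direction: the LN and RT kernels sweep the panels from the end backwards, so A (respectively B) and C are first advanced past their last block, while RN starts from the beginning with a negated offset. Roughly the same bookkeeping in C, treating the packed panels as double pointers and LDC as the byte stride computed by the sll above; trsm_prologue and the flag arguments are illustrative names, not the kernel's interface:

    /* m, n, k are the matrix dimensions; ldc is the row stride of C in bytes. */
    static void trsm_prologue(double **a, double **b, char **c,
                              long m, long n, long k, long ldc,
                              long offset, long *kk,
                              int ln, int rn, int rt) {
        if (ln) { *a += 2 * m * k; *c += 2 * m * sizeof(double); }
        if (rn) { *kk = -offset; }
        if (rt) { *b += 2 * n * k; *c += n * ldc; *kk = n - offset; }
    }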
|
|
+
|
|
+ sra N, 1, J
|
|
+ ble J, $L30
|
|
+ .align 4
|
|
+
|
|
+$L01:
|
|
+#ifdef RT
|
|
+ sll K, ZBASE_SHIFT + 1, TMP1
|
|
+ subl B, TMP1, B
|
|
+
|
|
+ subl C, LDC, C2
|
|
+ subl C2, LDC, C1
|
|
+ subl C2, LDC, C
|
|
+#else
|
|
+ mov C, C1
|
|
+ addl C, LDC, C2
|
|
+ addl C2, LDC, C
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ addl M, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+ mov A, AORIG
|
|
+#else
|
|
+ mov A, AO
|
|
+#endif
|
|
+
|
|
+ sra M, 1, I
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+ fclr c01
|
|
+ fclr c05
|
|
+
|
|
+ ble I, $L20
|
|
+ .align 4
|
|
+
|
|
+$L11:
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c10
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c14
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c07
|
|
+
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+ fclr c11
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c15
|
|
+
|
|
+ fillcs 4 * SIZE(C1)
|
|
+ fclr c04
|
|
+ ldi L, -2(KK)
|
|
+ fclr c08
|
|
+
|
|
+ fillcs 4 * SIZE(C2)
|
|
+ fclr c12
|
|
+ fclr c16
|
|
+ ble KK, $L18
|
|
+ ble L, $L15
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, ZBASE_SHIFT + 1, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, ZBASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c10
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c14
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c07
|
|
+
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ fclr c11
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c15
|
|
+
|
|
+ fillcs 4 * SIZE(C1)
|
|
+ fclr c04
|
|
+ ldi L, -2(TMP1)
|
|
+ fclr c08
|
|
+
|
|
+ fillcs 4 * SIZE(C2)
|
|
+ fclr c12
|
|
+ fclr c16
|
|
+ ble TMP1, $L18
|
|
+ ble L, $L15
|
|
+#endif
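
In the #else branch above, the per-block pointers are derived from KK with shifts: ZBASE_SHIFT is log2 of the byte size of one complex element (4 in the double-precision build), and the extra + 1 reflects the 2-wide blocking, so AO and BO skip KK two-wide panels of packed data and TMP1 = K - KK is the number of inner iterations left for this block (the 1-wide paths later in the file use ZBASE_SHIFT + 0 instead). The same computation in C, under those assumptions; block_setup and its parameters are illustrative names:

    #define ZBASE_SHIFT 4   /* log2(16): one double-complex element in bytes */

    /* Position AO/BO at the KK-th 2-wide panel and return the remaining K. */
    static long block_setup(char **ao, char **bo, char *aorig, char *b,
                            long k, long kk) {
        long skip = kk << (ZBASE_SHIFT + 1);
        *ao = aorig + skip;
        *bo = b + skip;
        return k - kk;
    }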
|
|
+ .align 5
|
|
+
|
|
+$L12:
|
|
+/* 1 */
|
|
+ ADD1 c11, t1, b5
|
|
+ fmov b5, c11
|
|
+#ifndef EV4
|
|
+ fillcs PREFETCHSIZE * SIZE(AO)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+ MUL b1, a1, t1
|
|
+#ifndef EV4
|
|
+ fillcs PREFETCHSIZE * SIZE(BO)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD3 c12, t2, b5
|
|
+ fmov b5, c12
|
|
+ unop
|
|
+ MUL b1, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c16, t3, b5
|
|
+ fmov b5, c16
|
|
+ unop
|
|
+ MUL b2, a2, t3
|
|
+ LD a5, 0 * SIZE(AO)
|
|
+
|
|
+ ADD4 c15, t4, b5
|
|
+ fmov b5, c15
|
|
+ unop
|
|
+ MUL b2, a1, t4
|
|
+ LD b5, 0 * SIZE(BO)
|
|
+ FIMOVD b5, tmp
|
|
+/* 2 */
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ UNOP
|
|
+ MUL b1, a3, t1
|
|
+ UNOP
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ UNOP
|
|
+ MUL b1, a4, t2
|
|
+ UNOP
|
|
+
|
|
+ ADD2 c06, t3, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL b2, a4, t3
|
|
+ unop
|
|
+
|
|
+ ADD4 c05, t4, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL b4, a1, t4
|
|
+ unop
|
|
+
|
|
+/* 3 */
|
|
+ ADD1 c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL b3, a1, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ unop
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c08, t3, b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD4 c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL b2, a3, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+/* 4 */
|
|
+ ADD1 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ unop
|
|
+ MUL b3, a3, t1
|
|
+ LD a6, 2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD2 c14, t3, b5
|
|
+ fmov b5, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ ADD4 c07, t4, b5
|
|
+ fmov b5, c07
|
|
+ unop
|
|
+ MUL b4, a3, t4
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+
|
|
+/* 5 */
|
|
+ ADD1 c11, t1, b5
|
|
+ fmov b5, c11
|
|
+ unop
|
|
+ IFMOVD tmp, b5
|
|
+ MUL b5, a5, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD3 c12, t2, b5
|
|
+ fmov b5, c12
|
|
+ ldi L, -2(L)
|
|
+ IFMOVD tmp, b5
|
|
+ MUL b5, a2, t2
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD2 c16, t3, b5
|
|
+ fmov b5, c16
|
|
+ unop
|
|
+ MUL b2, a2, t3
|
|
+ unop
|
|
+
|
|
+ ADD4 c15, t4, b5
|
|
+ fmov b5, c15
|
|
+ unop
|
|
+ MUL b2, a5, t4
|
|
+ unop
|
|
+
|
|
+/* 6 */
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ IFMOVD tmp, b5
|
|
+ MUL b5, a6, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ IFMOVD tmp, b5
|
|
+ MUL b5, a4, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c06, t3, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL b2, a4, t3
|
|
+ unop
|
|
+
|
|
+ ADD4 c05, t4, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL b4, a5, t4
|
|
+ unop
|
|
+
|
|
+/* 7 */
|
|
+ ADD1 c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+ MUL b3, a5, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c08, t3, b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, -3 * SIZE(AO)
|
|
+
|
|
+ ADD4 c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL b2, a6, t4
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+/* 8 */
|
|
+ ADD1 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ unop
|
|
+ MUL b3, a6, t1
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD2 c14, t3, b5
|
|
+ fmov b5, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+
|
|
+ ADD4 c07, t4, b5
|
|
+ fmov b5, c07
|
|
+ MUL b4, a6, t4
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+ bgt L, $L12
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ ADD1 c11, t1, b5
|
|
+ fmov b5, c11
|
|
+ unop
|
|
+ MUL b1, a1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L17
|
|
+#else
|
|
+ blbs TMP1, $L17
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD3 c12, t2, b5
|
|
+ fmov b5, c12
|
|
+ MUL b1, a2, t2
|
|
+ ADD2 c16, t3, b5
|
|
+ fmov b5, c16
|
|
+ MUL b2, a2, t3
|
|
+
|
|
+ ADD4 c15, t4, b5
|
|
+ fmov b5, c15
|
|
+ MUL b2, a1, t4
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL b1, a3, t1
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL b1, a4, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD2 c06, t3, b5
|
|
+ fmov b5, c06
|
|
+ MUL b2, a4, t3
|
|
+ ADD4 c05, t4, b5
|
|
+ fmov b5, c05
|
|
+ MUL b4, a1, t4
|
|
+
|
|
+ ADD1 c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL b3, a1, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD3 c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ unop
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c08, t3, b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD4 c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL b2, a3, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD1 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ unop
|
|
+ MUL b3, a3, t1
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD3 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD2 c14, t3, b5
|
|
+ fmov b5, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+
|
|
+ ADD4 c07, t4, b5
|
|
+ fmov b5, c07
|
|
+ unop
|
|
+ MUL b4, a3, t4
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD1 c11, t1, b5
|
|
+ fmov b5, c11
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL b1, a1, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L17:
|
|
+ ADD3 c12, t2, b5
|
|
+ fmov b5, c12
|
|
+ MUL b1, a2, t2
|
|
+ ADD2 c16, t3, b5
|
|
+ fmov b5, c16
|
|
+ MUL b2, a2, t3
|
|
+
|
|
+ ADD4 c15, t4, b5
|
|
+ fmov b5, c15
|
|
+ MUL b2, a1, t4
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL b1, a3, t1
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL b1, a4, t2
|
|
+ ADD2 c06, t3, b5
|
|
+ fmov b5, c06
|
|
+ MUL b2, a4, t3
|
|
+
|
|
+ ADD4 c05, t4, b5
|
|
+ fmov b5, c05
|
|
+ MUL b4, a1, t4
|
|
+ ADD1 c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ MUL b3, a1, t1
|
|
+
|
|
+ ADD3 c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ MUL b3, a2, t2
|
|
+ ADD2 c08, t3, b5
|
|
+ fmov b5, c08
|
|
+ MUL b4, a2, t3
|
|
+
|
|
+ ADD4 c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ MUL b2, a3, t4
|
|
+ ADD1 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ MUL b3, a3, t1
|
|
+
|
|
+ ADD3 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ MUL b3, a4, t2
|
|
+ ADD2 c14, t3, b5
|
|
+ fmov b5, c14
|
|
+ MUL b4, a4, t3
|
|
+
|
|
+ ADD4 c07, t4, b5
|
|
+ fmov b5, c07
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL b4, a3, t4
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD1 c11, t1, b5
|
|
+ fmov b5, c11
|
|
+ ADD3 c12, t2, b5
|
|
+ fmov b5, c12
|
|
+ ADD2 c16, t3, b5
|
|
+ fmov b5, c16
|
|
+ ADD4 c15, t4, b5
|
|
+ fmov b5, c15
|
|
+
|
|
+ ADD c01, c06, b5
|
|
+ fmov b5, c01
|
|
+ ADD c02, c05, b5
|
|
+ fmov b5, c02
|
|
+ ADD c03, c08, b5
|
|
+ fmov b5, c03
|
|
+ ADD c04, c07, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ ADD c09, c14, b5
|
|
+ fmov b5, c09
|
|
+ ADD c10, c13, b5
|
|
+ fmov b5, c10
|
|
+ ADD c11, c16, b5
|
|
+ fmov b5, c11
|
|
+ ADD c12, c15, b5
|
|
+ fmov b5, c12
|
|
+ .align 4
|
|
+
|
|
+$L18:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 2, TMP1
|
|
+#else
|
|
+ subl KK, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -4 * SIZE(AO)
|
|
+ ldi BO, -4 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+ LD b4, 7 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a3, c09, b5
|
|
+ fmov b5, c09
|
|
+ SUB a4, c10, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ SUB b1, c03, b5
|
|
+ fmov b5, c03
|
|
+ SUB b2, c04, b5
|
|
+ fmov b5, c04
|
|
+ SUB b3, c11, b5
|
|
+ fmov b5, c11
|
|
+ SUB b4, c12, b5
|
|
+ fmov b5, c12
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ LD b1, 4 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(AO)
|
|
+ LD b4, 7 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a3, c03, b5
|
|
+ fmov b5, c03
|
|
+ SUB a4, c04, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ SUB b1, c09, b5
|
|
+ fmov b5, c09
|
|
+ SUB b2, c10, b5
|
|
+ fmov b5, c10
|
|
+ SUB b3, c11, b5
|
|
+ fmov b5, c11
|
|
+ SUB b4, c12, b5
|
|
+ fmov b5, c12
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 6 * SIZE(AO)
|
|
+ LD a2, 7 * SIZE(AO)
|
|
+ LD a3, 4 * SIZE(AO)
|
|
+ LD a4, 5 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c04, t1
|
|
+ MUL a2, c03, t2
|
|
+ MUL a2, c12, t3
|
|
+ MUL a2, c11, t4
|
|
+
|
|
+ MUL a1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a1, c04, b5
|
|
+ fmov b5, c04
|
|
+ MUL a1, c11, b5
|
|
+ fmov b5, c11
|
|
+ MUL a1, c12, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ ADD5 c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ ADD6 c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ ADD5 c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ ADD6 c12, t4, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ MUL a3, c03, t1
|
|
+ MUL a3, c04, t2
|
|
+ MUL a3, c11, t3
|
|
+ MUL a3, c12, t4
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ SUB c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ SUB c10, t4, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ MUL a4, c04, t1
|
|
+ MUL a4, c03, t2
|
|
+ MUL a4, c12, t3
|
|
+ MUL a4, c11, t4
|
|
+
|
|
+ ADD6 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD5 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD6 c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ ADD5 c10, t4, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a2, c10, t3
|
|
+ MUL a2, c09, t4
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, c10, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ ADD5 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD6 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD5 c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ ADD6 c10, t4, b5
|
|
+ fmov b5, c10
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a2, c10, t3
|
|
+ MUL a2, c09, t4
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, c10, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ ADD5 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD6 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD5 c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ ADD6 c10, t4, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ MUL a3, c01, t1
|
|
+ MUL a3, c02, t2
|
|
+ MUL a3, c09, t3
|
|
+ MUL a3, c10, t4
|
|
+
|
|
+ SUB c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ SUB c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ SUB c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ SUB c12, t4, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ MUL a4, c02, t1
|
|
+ MUL a4, c01, t2
|
|
+ MUL a4, c10, t3
|
|
+ MUL a4, c09, t4
|
|
+
|
|
+ ADD6 c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ ADD5 c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ ADD6 c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ ADD5 c12, t4, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ LD a1, 6 * SIZE(AO)
|
|
+ LD a2, 7 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c04, t1
|
|
+ MUL a2, c03, t2
|
|
+ MUL a2, c12, t3
|
|
+ MUL a2, c11, t4
|
|
+
|
|
+ MUL a1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a1, c04, b5
|
|
+ fmov b5, c04
|
|
+ MUL a1, c11, b5
|
|
+ fmov b5, c11
|
|
+ MUL a1, c12, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ ADD5 c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ ADD6 c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ ADD5 c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ ADD6 c12, t4, b5
|
|
+ fmov b5, c12
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a2, c04, t3
|
|
+ MUL a2, c03, t4
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a1, c04, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ ADD5 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD6 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD5 c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ ADD6 c04, t4, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ MUL a3, c01, t1
|
|
+ MUL a3, c02, t2
|
|
+ MUL a3, c03, t3
|
|
+ MUL a3, c04, t4
|
|
+
|
|
+ SUB c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ SUB c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ SUB c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ SUB c12, t4, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ MUL a4, c02, t1
|
|
+ MUL a4, c01, t2
|
|
+ MUL a4, c04, t3
|
|
+ MUL a4, c03, t4
|
|
+
|
|
+ ADD6 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ ADD5 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ ADD6 c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ ADD5 c12, t4, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ LD a1, 6 * SIZE(BO)
|
|
+ LD a2, 7 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c10, t1
|
|
+ MUL a2, c09, t2
|
|
+ MUL a2, c12, t3
|
|
+ MUL a2, c11, t4
|
|
+
|
|
+ MUL a1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, c10, b5
|
|
+ fmov b5, c10
|
|
+ MUL a1, c11, b5
|
|
+ fmov b5, c11
|
|
+ MUL a1, c12, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ ADD5 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ ADD6 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ ADD5 c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ ADD6 c12, t4, b5
|
|
+ fmov b5, c12
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 6 * SIZE(BO)
|
|
+ LD a2, 7 * SIZE(BO)
|
|
+ LD a3, 4 * SIZE(BO)
|
|
+ LD a4, 5 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c10, t1
|
|
+ MUL a2, c09, t2
|
|
+ MUL a2, c12, t3
|
|
+ MUL a2, c11, t4
|
|
+
|
|
+ MUL a1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, c10, b5
|
|
+ fmov b5, c10
|
|
+ MUL a1, c11, b5
|
|
+ fmov b5, c11
|
|
+ MUL a1, c12, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ ADD5 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ ADD6 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ ADD5 c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ ADD6 c12, t4, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ MUL a3, c09, t1
|
|
+ MUL a3, c10, t2
|
|
+ MUL a3, c11, t3
|
|
+ MUL a3, c12, t4
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ SUB c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ SUB c04, t4, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ MUL a4, c10, t1
|
|
+ MUL a4, c09, t2
|
|
+ MUL a4, c12, t3
|
|
+ MUL a4, c11, t4
|
|
+
|
|
+ ADD6 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD5 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD6 c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ ADD5 c04, t4, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a2, c04, t3
|
|
+ MUL a2, c03, t4
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a1, c04, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ ADD5 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD6 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD5 c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ ADD6 c04, t4, b5
|
|
+ fmov b5, c04
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c02, 1 * SIZE(BO)
|
|
+ ST c09, 2 * SIZE(BO)
|
|
+ ST c10, 3 * SIZE(BO)
|
|
+
|
|
+ ST c03, 4 * SIZE(BO)
|
|
+ ST c04, 5 * SIZE(BO)
|
|
+ ST c11, 6 * SIZE(BO)
|
|
+ ST c12, 7 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c03, 2 * SIZE(AO)
|
|
+ ST c04, 3 * SIZE(AO)
|
|
+
|
|
+ ST c09, 4 * SIZE(AO)
|
|
+ ST c10, 5 * SIZE(AO)
|
|
+ ST c11, 6 * SIZE(AO)
|
|
+ ST c12, 7 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -4 * SIZE(C1)
|
|
+ ldi C2, -4 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c03, 2 * SIZE(C1)
|
|
+ ST c04, 3 * SIZE(C1)
|
|
+
|
|
+ ST c09, 0 * SIZE(C2)
|
|
+ ST c10, 1 * SIZE(C2)
|
|
+ ST c11, 2 * SIZE(C2)
|
|
+ ST c12, 3 * SIZE(C2)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+ ldi C2, 4 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, ZBASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ addl BO, TMP1, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+ fclr c01
|
|
+ fclr c05
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L11
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ and M, 1, I
|
|
+ ble I, $L29
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c10
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c14
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+
|
|
+ ldi L, -2(KK)
|
|
+
|
|
+ ble KK, $L28
|
|
+ ble L, $L25
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, ZBASE_SHIFT + 0, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, ZBASE_SHIFT + 0, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, ZBASE_SHIFT + 1, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c10
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c14
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+
|
|
+ ble TMP1, $L28
|
|
+ ble L, $L25
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L22:
|
|
+ ADD1 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD4 c13, t3, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL a1, b2, t3
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+
|
|
+ ADD2 c14, t4, b5
|
|
+ fmov b5, c14
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD b2, -7 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ LD b3, -6 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL a1, b4, t3
|
|
+ LD a1, 2 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ MUL a2, b4, t4
|
|
+ LD b5, -5 * SIZE(BO)
|
|
+ FIMOVD b5, tmp
|
|
+
|
|
+ ADD1 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ unop
|
|
+ MUL a3, b1, t1
|
|
+ LD a2, 3 * SIZE(AO)
|
|
+
|
|
+ ADD3 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL a4, b1, t2
|
|
+ LD b1, -4 * SIZE(BO)
|
|
+
|
|
+ ADD4 c13, t3, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD2 c14, t4, b5
|
|
+ fmov b5, c14
|
|
+ MUL a4, b2, t4
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ldi L, -2(L)
|
|
+ MUL a3, b3, t1
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a4, b3, t2
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ IFMOVD tmp, b5
|
|
+ MUL a3, b5, t3
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ IFMOVD tmp, b5
|
|
+ MUL a4, b5, t4
|
|
+ LD a4, 1 * SIZE(AO)
|
|
+ bgt L, $L22
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ ADD1 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, b1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L27
|
|
+#else
|
|
+ blbs TMP1, $L27
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD3 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD4 c13, t3, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL a1, b2, t3
|
|
+ unop
|
|
+
|
|
+ ADD2 c14, t4, b5
|
|
+ fmov b5, c14
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL a1, b4, t3
|
|
+ LD a1, -2 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL a2, b4, t4
|
|
+ LD a2, -1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L27:
|
|
+ ADD3 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ MUL a2, b1, t2
|
|
+ ADD4 c13, t3, b5
|
|
+ fmov b5, c13
|
|
+ MUL a1, b2, t3
|
|
+
|
|
+ ADD2 c14, t4, b5
|
|
+ fmov b5, c14
|
|
+ MUL a2, b2, t4
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b3, t1
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, b3, t2
|
|
+ ADD4 c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, b4, t3
|
|
+
|
|
+ ADD2 c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a2, b4, t4
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD1 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ ADD3 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ ADD4 c13, t3, b5
|
|
+ fmov b5, c13
|
|
+ ADD2 c14, t4, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ ADD c01, c06, b5
|
|
+ fmov b5, c01
|
|
+ ADD c02, c05, b5
|
|
+ fmov b5, c02
|
|
+ ADD c09, c14, b5
|
|
+ fmov b5, c09
|
|
+ ADD c10, c13, b5
|
|
+ fmov b5, c10
|
|
+ .align 4
|
|
+
|
|
+$L28:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 1, TMP1
|
|
+#else
|
|
+ subl KK, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, ZBASE_SHIFT + 0, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -2 * SIZE(AO)
|
|
+ ldi BO, -4 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a3, c09, b5
|
|
+ fmov b5, c09
|
|
+ SUB a4, c10, b5
|
|
+ fmov b5, c10
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a3, c09, b5
|
|
+ fmov b5, c09
|
|
+ SUB a4, c10, b5
|
|
+ fmov b5, c10
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a2, c10, t3
|
|
+ MUL a2, c09, t4
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, c10, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ ADD5 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD6 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD5 c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ ADD6 c10, t4, b5
|
|
+ fmov b5, c10
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ ADD5 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD6 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ MUL a3, c01, t1
|
|
+ MUL a3, c02, t2
|
|
+ SUB c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ SUB c10, t2, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ MUL a4, c02, t1
|
|
+ MUL a4, c01, t2
|
|
+ ADD6 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ ADD5 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ LD a1, 6 * SIZE(BO)
|
|
+ LD a2, 7 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c10, t1
|
|
+ MUL a2, c09, t2
|
|
+ MUL a1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, c10, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ ADD5 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ ADD6 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 6 * SIZE(BO)
|
|
+ LD a2, 7 * SIZE(BO)
|
|
+ LD a3, 4 * SIZE(BO)
|
|
+ LD a4, 5 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c10, t1
|
|
+ MUL a2, c09, t2
|
|
+ MUL a1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, c10, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ ADD5 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ ADD6 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ MUL a3, c09, t1
|
|
+ MUL a3, c10, t2
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c02, t2, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ MUL a4, c10, t1
|
|
+ MUL a4, c09, t2
|
|
+ ADD6 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD5 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ ADD5 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD6 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c02, 1 * SIZE(BO)
|
|
+ ST c09, 2 * SIZE(BO)
|
|
+ ST c10, 3 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c09, 2 * SIZE(AO)
|
|
+ ST c10, 3 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -2 * SIZE(C1)
|
|
+ ldi C2, -2 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c09, 0 * SIZE(C2)
|
|
+ ST c10, 1 * SIZE(C2)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 2 * SIZE(C1)
|
|
+ ldi C2, 2 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, ZBASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, ZBASE_SHIFT + 0, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L29:
|
|
+#ifdef LN
|
|
+ sll K, ZBASE_SHIFT + 1, TMP1
|
|
+ addl B, TMP1, B
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ mov BO, B
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+ ldi J, -1(J)
|
|
+ bgt J, $L01
|
|
+ .align 4
|
|
+
|
|
+$L30:
|
|
+ and N, 1, J
|
|
+ ble J, $L999
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, ZBASE_SHIFT, TMP1
|
|
+ subl B, TMP1, B
|
|
+
|
|
+ subl C, LDC, C1
|
|
+ subl C, LDC, C
|
|
+#else
|
|
+ mov C, C1
|
|
+ addl C, LDC, C
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ addl M, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+ mov A, AORIG
|
|
+#else
|
|
+ mov A, AO
|
|
+#endif
|
|
+
|
|
+ sra M, 1, I
|
|
+ ble I, $L50
|
|
+ .align 4
|
|
+
|
|
+$L41:
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c05
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c06
|
|
+
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c07
|
|
+
|
|
+ ldi L, -2(KK)
|
|
+ fclr c04
|
|
+ fclr c08
|
|
+
|
|
+ ble KK, $L48
|
|
+ ble L, $L45
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, ZBASE_SHIFT + 1, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, ZBASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, ZBASE_SHIFT, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c05
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c06
|
|
+
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c07
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+ fclr c04
|
|
+ fclr c08
|
|
+
|
|
+ ble TMP1, $L48
|
|
+ ble L, $L45
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L42:
|
|
+ ADD4 c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD2 c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ ldi L, -2(L)
|
|
+ MUL a2, b1, t2
|
|
+ unop
|
|
+
|
|
+ ADD4 c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ unop
|
|
+ MUL a3, b1, t3
|
|
+ unop
|
|
+
|
|
+ ADD2 c08, t4, b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL a4, b1, t4
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b2, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ MUL a2, b2, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ unop
|
|
+ MUL a4, b2, t4
|
|
+ LD a5, 3 * SIZE(AO)
|
|
+
|
|
+ ADD4 c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ LD b2, -1 * SIZE(BO)
|
|
+
|
|
+ ADD2 c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ unop
|
|
+
|
|
+ ADD4 c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ unop
|
|
+ MUL a3, b3, t3
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+
|
|
+ ADD2 c08, t4, b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL a5, b3, t4
|
|
+ LD b3, 0 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b4, t1
|
|
+ LD a1, -4 * SIZE(AO)
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b4, t2
|
|
+ LD a2, -3 * SIZE(AO)
|
|
+
|
|
+ ADD1 c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+ MUL a3, b4, t3
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a5, b4, t4
|
|
+ LD b4, 1 * SIZE(BO)
|
|
+ bgt L, $L42
|
|
+ .align 4
|
|
+
|
|
+$L45:
|
|
+ ADD4 c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ MUL b1, a1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L47
|
|
+#else
|
|
+ blbs TMP1, $L47
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD2 c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ MUL a2, b1, t2
|
|
+ ADD4 c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ MUL a3, b1, t3
|
|
+
|
|
+ ADD2 c08, t4, b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL a4, b1, t4
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b2, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b2, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a4, b2, t4
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD4 c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L47:
|
|
+ ADD2 c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ MUL a2, b1, t2
|
|
+ ADD4 c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ MUL a3, b1, t3
|
|
+
|
|
+ ADD2 c08, t4, b5
|
|
+ fmov b5, c08
|
|
+ MUL a4, b1, t4
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b2, t1
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, b2, t2
|
|
+ ADD1 c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ MUL a3, b2, t3
|
|
+
|
|
+ ADD3 c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL a4, b2, t4
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ ADD2 c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ ADD4 c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ ADD2 c08, t4, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ ADD c01, c06, b5
|
|
+ fmov b5, c01
|
|
+ ADD c02, c05, b5
|
|
+ fmov b5, c02
|
|
+ ADD c03, c08, b5
|
|
+ fmov b5, c03
|
|
+ ADD c04, c07, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+$L48:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 2, TMP1
|
|
+#else
|
|
+ subl KK, 1, TMP1
|
|
+#endif
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -4 * SIZE(AO)
|
|
+ ldi BO, -2 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a3, c03, b5
|
|
+ fmov b5, c03
|
|
+ SUB a4, c04, b5
|
|
+ fmov b5, c04
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a3, c03, b5
|
|
+ fmov b5, c03
|
|
+ SUB a4, c04, b5
|
|
+ fmov b5, c04
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 6 * SIZE(AO)
|
|
+ LD a2, 7 * SIZE(AO)
|
|
+ LD a3, 4 * SIZE(AO)
|
|
+ LD a4, 5 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c04, t1
|
|
+ MUL a2, c03, t2
|
|
+ MUL a1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a1, c04, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ ADD5 c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ ADD6 c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ MUL a3, c03, t1
|
|
+ MUL a3, c04, t2
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a4, c04, t1
|
|
+ MUL a4, c03, t2
|
|
+
|
|
+ ADD6 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD5 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ ADD5 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD6 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ ADD5 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD6 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a3, c01, t1
|
|
+ MUL a3, c02, t2
|
|
+
|
|
+ SUB c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ SUB c04, t2, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ MUL a4, c02, t1
|
|
+ MUL a4, c01, t2
|
|
+ ADD6 c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ ADD5 c04, t2, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ LD a1, 6 * SIZE(AO)
|
|
+ LD a2, 7 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c04, t1
|
|
+ MUL a2, c03, t2
|
|
+ MUL a1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a1, c04, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ ADD5 c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ ADD6 c04, t2, b5
|
|
+ fmov b5, c04
|
|
+#endif
|
|
+
|
|
+#if defined(RN) || defined(RT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a2, c04, t3
|
|
+ MUL a2, c03, t4
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a1, c04, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ ADD5 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD6 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD5 c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ ADD6 c04, t4, b5
|
|
+ fmov b5, c04
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c02, 1 * SIZE(BO)
|
|
+ ST c03, 2 * SIZE(BO)
|
|
+ ST c04, 3 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c03, 2 * SIZE(AO)
|
|
+ ST c04, 3 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -4 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c03, 2 * SIZE(C1)
|
|
+ ST c04, 3 * SIZE(C1)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, ZBASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L41
|
|
+ .align 4
|
|
+
|
|
+$L50:
|
|
+ and M, 1, I
|
|
+ ble I, $L59
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c05
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c06
|
|
+
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+
|
|
+ ldi L, -2(KK)
|
|
+
|
|
+ ble KK, $L58
|
|
+ ble L, $L55
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, ZBASE_SHIFT, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, ZBASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, ZBASE_SHIFT, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c06
|
|
+
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+
|
|
+ ble TMP1, $L58
|
|
+ ble L, $L55
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L52:
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ ldi L, -2(L)
|
|
+ MUL a1, b2, t3
|
|
+ LD a1, -2 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD a2, -1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ LD b2, 3 * SIZE(BO)
|
|
+ MUL a3, b3, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a4, b3, t2
|
|
+ LD b3, 0 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL a3, b4, t3
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ MUL a4, b4, t4
|
|
+ LD b4, 1 * SIZE(BO)
|
|
+ unop
|
|
+
|
|
+ LD a4, 1 * SIZE(AO)
|
|
+ unop
|
|
+ unop
|
|
+ bgt L, $L52
|
|
+ .align 4
|
|
+
|
|
+$L55:
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L57
|
|
+#else
|
|
+ blbs TMP1, $L57
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ MUL a1, b2, t3
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ LD b2, -1 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ .align 4
|
|
+
|
|
+$L57:
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, b1, t2
|
|
+ ADD4 c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, b2, t3
|
|
+
|
|
+ ADD2 c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a2, b2, t4
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD4 c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ ADD2 c06, t4, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ ADD c01, c06, b5
|
|
+ fmov b5, c01
|
|
+ ADD c02, c05, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+$L58:
|
|
+#if defined(LN) || defined(RT)
|
|
+ subl KK, 1, TMP1
|
|
+
|
|
+ sll TMP1, ZBASE_SHIFT, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -2 * SIZE(AO)
|
|
+ ldi BO, -2 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ ADD5 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD6 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+#endif
|
|
+
|
|
+#if defined(RN) || defined(RT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ ADD5 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD6 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c02, 1 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -2 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 2 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, ZBASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, ZBASE_SHIFT, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L59:
|
|
+#ifdef LN
|
|
+ sll K, ZBASE_SHIFT, TMP1
|
|
+ addl B, TMP1, B
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ mov BO, B
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 8($sp)
|
|
+ fldd $f4, 16($sp)
|
|
+ fldd $f5, 24($sp)
|
|
+ fldd $f6, 32($sp)
|
|
+ fldd $f7, 40($sp)
|
|
+ fldd $f8, 48($sp)
|
|
+ fldd $f9, 56($sp)
|
|
+ ldl tmp, 72($sp)
|
|
+
|
|
+ clr $0
|
|
+ ldi $sp, STACKSIZE($sp)
|
|
+ ret
|
|
+ .ident VERSION
|
|
+ .end CNAME
|
|
diff --git a/kernel/sw_64/ztrsm_kernel_2x2_LT.S.bak b/kernel/sw_64/ztrsm_kernel_2x2_LT.S.bak
new file mode 100644
index 0000000..f4a2c13
--- /dev/null
+++ b/kernel/sw_64/ztrsm_kernel_2x2_LT.S.bak
@@ -0,0 +1,2222 @@
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#if !defined(SW2B)
+#error "Architecture is not specified."
+#endif
+
+#ifdef SW2B
+#define PREFETCHSIZE 56
+#define UNOP unop
+#endif
+
+
+ .set noat
+ .set noreorder
+ .arch ev6
+
+.text
+ .align 5
+ .globl CNAME
+ .ent CNAME
+
+#define STACKSIZE 80
+
|
|
+#define M $16
|
|
+#define N $17
|
|
+#define K $18
|
|
+#define A $21
|
|
+#define B $22
|
|
+#define C $20
|
|
+#define LDC $23
|
|
+
|
|
+#define C1 $19
|
|
+#define C2 $24
|
|
+
|
|
+#define AO $at
|
|
+#define BO $5
|
|
+#define I $6
|
|
+#define J $7
|
|
+#define L $8
|
|
+
|
|
+#define a1 $f16
|
|
+#define a2 $f17
|
|
+#define a3 $f18
|
|
+#define a4 $f19
|
|
+
|
|
+#define b1 $f20
|
|
+#define b2 $f21
|
|
+#define b3 $f22
|
|
+#define b4 $f23
|
|
+
|
|
+#define t1 $f24
|
|
+#define t2 $f25
|
|
+#define t3 $f26
|
|
+#define t4 $f27
|
|
+
|
|
+#define a5 $f28
|
|
+#define a6 $f30
|
|
+#define b5 $f29
|
|
+
|
|
+#define alpha_i $f29
|
|
+#define alpha_r $f30
|
|
+
|
|
+#define c01 $f0
|
|
+#define c02 $f1
|
|
+#define c03 $f2
|
|
+#define c04 $f3
|
|
+
|
|
+#define c05 $f4
|
|
+#define c06 $f5
|
|
+#define c07 $f6
|
|
+#define c08 $f7
|
|
+
|
|
+#define c09 $f8
|
|
+#define c10 $f9
|
|
+#define c11 $f10
|
|
+#define c12 $f11
|
|
+
|
|
+#define c13 $f12
|
|
+#define c14 $f13
|
|
+#define c15 $f14
|
|
+#define c16 $f15
|
|
+
|
|
+#define TMP1 $0
|
|
+#define TMP2 $1
|
|
+#define KK $2
|
|
+#define AORIG $3
|
|
+#define OFFSET $4
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+#ifndef CONJ
|
|
+#define ADD1 ADD
|
|
+#define ADD2 SUB
|
|
+#define ADD3 ADD
|
|
+#define ADD4 ADD
|
|
+#define ADD5 SUB
|
|
+#define ADD6 ADD
|
|
+#else
|
|
+#define ADD1 ADD
|
|
+#define ADD2 ADD
|
|
+#define ADD3 SUB
|
|
+#define ADD4 ADD
|
|
+#define ADD5 ADD
|
|
+#define ADD6 SUB
|
|
+#endif
|
|
+#else
|
|
+#ifndef CONJ
|
|
+#define ADD1 ADD
|
|
+#define ADD2 SUB
|
|
+#define ADD3 ADD
|
|
+#define ADD4 ADD
|
|
+#define ADD5 SUB
|
|
+#define ADD6 ADD
|
|
+#else
|
|
+#define ADD1 ADD
|
|
+#define ADD2 ADD
|
|
+#define ADD3 ADD
|
|
+#define ADD4 SUB
|
|
+#define ADD5 ADD
|
|
+#define ADD6 SUB
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+
|
|
+CNAME:
|
|
+ .frame $sp, STACKSIZE, $26, 0
|
|
+
|
|
+#ifdef PROFILE
|
|
+ ldgp $gp, 0($27)
|
|
+ ldi $at, _mcount
|
|
+ jsr $at, ($at), _mcount
|
|
+#endif
|
|
+
|
|
+#ifndef PROFILE
|
|
+ .prologue 0
|
|
+#else
|
|
+ .prologue 1
|
|
+#endif
|
|
+
|
|
+ ldi $sp, -STACKSIZE($sp)
|
|
+
|
|
+ ldl B, 0 + STACKSIZE($sp)
|
|
+ ldl C, 8 + STACKSIZE($sp)
|
|
+ ldl LDC, 16 + STACKSIZE($sp)
|
|
+ ldl OFFSET, 24 + STACKSIZE($sp)
|
|
+
|
|
+ sll LDC, ZBASE_SHIFT, LDC
|
|
+
|
|
+ fstd $f2, 0($sp)
|
|
+ fstd $f3, 8($sp)
|
|
+ fstd $f4, 16($sp)
|
|
+ fstd $f5, 24($sp)
|
|
+ fstd $f6, 32($sp)
|
|
+ fstd $f7, 40($sp)
|
|
+ fstd $f8, 48($sp)
|
|
+ fstd $f9, 56($sp)
|
|
+
|
|
+ cmple M, 0, $0
|
|
+ cmple N, 0, $1
|
|
+ cmple K, 0, $2
|
|
+
|
|
+ or $0, $1, $0
|
|
+ or $0, $2, $0
|
|
+ bne $0, $L999
|
|
+
|
|
+#ifdef LN
|
|
+ addl M, M, TMP2
|
|
+ mull TMP2, K, TMP1
|
|
+ SXADDQ TMP1, A, A
|
|
+ SXADDQ TMP2, C, C
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ negl OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ mull N, K, TMP1
|
|
+ addl TMP1, TMP1, TMP1
|
|
+ SXADDQ TMP1, B, B
|
|
+
|
|
+ mull N, LDC, TMP1
|
|
+ addl TMP1, C, C
|
|
+
|
|
+ subl N, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+ sra N, 1, J
|
|
+ ble J, $L30
|
|
+ .align 4
|
|
+
|
|
+$L01:
|
|
+#ifdef RT
|
|
+ sll K, ZBASE_SHIFT + 1, TMP1
|
|
+ subl B, TMP1, B
|
|
+
|
|
+ subl C, LDC, C2
|
|
+ subl C2, LDC, C1
|
|
+ subl C2, LDC, C
|
|
+#else
|
|
+ mov C, C1
|
|
+ addl C, LDC, C2
|
|
+ addl C2, LDC, C
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ addl M, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+ mov A, AORIG
|
|
+#else
|
|
+ mov A, AO
|
|
+#endif
|
|
+
|
|
+ sra M, 1, I
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+ fclr c01
|
|
+ fclr c05
|
|
+
|
|
+ ble I, $L20
|
|
+ .align 4
|
|
+
|
|
+$L11:
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c10
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c14
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c07
|
|
+
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+ fclr c11
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c15
|
|
+
|
|
+ fillcs 4 * SIZE(C1)
|
|
+ fclr c04
|
|
+ ldi L, -2(KK)
|
|
+ fclr c08
|
|
+
|
|
+ fillcs 4 * SIZE(C2)
|
|
+ fclr c12
|
|
+ fclr c16
|
|
+ ble KK, $L18
|
|
+ ble L, $L15
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, ZBASE_SHIFT + 1, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, ZBASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c10
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c14
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c07
|
|
+
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ fclr c11
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c15
|
|
+
|
|
+ fillcs 4 * SIZE(C1)
|
|
+ fclr c04
|
|
+ ldi L, -2(TMP1)
|
|
+ fclr c08
|
|
+
|
|
+ fillcs 4 * SIZE(C2)
|
|
+ fclr c12
|
|
+ fclr c16
|
|
+ ble TMP1, $L18
|
|
+ ble L, $L15
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L12:
|
|
+/* 1 */
|
|
+ ADD1 c11, t1, c11
|
|
+#ifndef EV4
|
|
+ fillcs PREFETCHSIZE * SIZE(AO)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+ MUL b1, a1, t1
|
|
+#ifndef EV4
|
|
+ fillcs PREFETCHSIZE * SIZE(BO)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD3 c12, t2, c12
|
|
+ unop
|
|
+ MUL b1, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c16, t3, c16
|
|
+ unop
|
|
+ MUL b2, a2, t3
|
|
+ LD a5, 0 * SIZE(AO)
|
|
+
|
|
+ ADD4 c15, t4, c15
|
|
+ unop
|
|
+ MUL b2, a1, t4
|
|
+ LD b5, 0 * SIZE(BO)
|
|
+
|
|
+/* 2 */
|
|
+ ADD1 c01, t1, c01
|
|
+ UNOP
|
|
+ MUL b1, a3, t1
|
|
+ UNOP
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ UNOP
|
|
+ MUL b1, a4, t2
|
|
+ UNOP
|
|
+
|
|
+ ADD2 c06, t3, c06
|
|
+ unop
|
|
+ MUL b2, a4, t3
|
|
+ unop
|
|
+
|
|
+ ADD4 c05, t4, c05
|
|
+ unop
|
|
+ MUL b4, a1, t4
|
|
+ unop
|
|
+
|
|
+/* 3 */
|
|
+ ADD1 c03, t1, c03
|
|
+ unop
|
|
+ MUL b3, a1, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c04, t2, c04
|
|
+ unop
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c08, t3, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD4 c13, t4, c13
|
|
+ unop
|
|
+ MUL b2, a3, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+/* 4 */
|
|
+ ADD1 c09, t1, c09
|
|
+ unop
|
|
+ MUL b3, a3, t1
|
|
+ LD a6, 2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c10, t2, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD2 c14, t3, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ ADD4 c07, t4, c07
|
|
+ unop
|
|
+ MUL b4, a3, t4
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+
|
|
+/* 5 */
|
|
+ ADD1 c11, t1, c11
|
|
+ unop
|
|
+ MUL b5, a5, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD3 c12, t2, c12
|
|
+ ldi L, -2(L)
|
|
+ MUL b5, a2, t2
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD2 c16, t3, c16
|
|
+ unop
|
|
+ MUL b2, a2, t3
|
|
+ unop
|
|
+
|
|
+ ADD4 c15, t4, c15
|
|
+ unop
|
|
+ MUL b2, a5, t4
|
|
+ unop
|
|
+
|
|
+/* 6 */
|
|
+ ADD1 c01, t1, c01
|
|
+ unop
|
|
+ MUL b5, a6, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL b5, a4, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c06, t3, c06
|
|
+ unop
|
|
+ MUL b2, a4, t3
|
|
+ unop
|
|
+
|
|
+ ADD4 c05, t4, c05
|
|
+ unop
|
|
+ MUL b4, a5, t4
|
|
+ unop
|
|
+
|
|
+/* 7 */
|
|
+ ADD1 c03, t1, c03
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+ MUL b3, a5, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c04, t2, c04
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c08, t3, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, -3 * SIZE(AO)
|
|
+
|
|
+ ADD4 c13, t4, c13
|
|
+ unop
|
|
+ MUL b2, a6, t4
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+/* 8 */
|
|
+ ADD1 c09, t1, c09
|
|
+ unop
|
|
+ MUL b3, a6, t1
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c10, t2, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD2 c14, t3, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+
|
|
+ ADD4 c07, t4, c07
|
|
+ MUL b4, a6, t4
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+ bgt L, $L12
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ ADD1 c11, t1, c11
|
|
+ unop
|
|
+ MUL b1, a1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L17
|
|
+#else
|
|
+ blbs TMP1, $L17
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD3 c12, t2, c12
|
|
+ MUL b1, a2, t2
|
|
+ ADD2 c16, t3, c16
|
|
+ MUL b2, a2, t3
|
|
+
|
|
+ ADD4 c15, t4, c15
|
|
+ MUL b2, a1, t4
|
|
+ ADD1 c01, t1, c01
|
|
+ MUL b1, a3, t1
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL b1, a4, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD2 c06, t3, c06
|
|
+ MUL b2, a4, t3
|
|
+ ADD4 c05, t4, c05
|
|
+ MUL b4, a1, t4
|
|
+
|
|
+ ADD1 c03, t1, c03
|
|
+ unop
|
|
+ MUL b3, a1, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD3 c04, t2, c04
|
|
+ unop
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c08, t3, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD4 c13, t4, c13
|
|
+ unop
|
|
+ MUL b2, a3, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD1 c09, t1, c09
|
|
+ unop
|
|
+ MUL b3, a3, t1
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD3 c10, t2, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD2 c14, t3, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+
|
|
+ ADD4 c07, t4, c07
|
|
+ unop
|
|
+ MUL b4, a3, t4
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD1 c11, t1, c11
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL b1, a1, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L17:
|
|
+ ADD3 c12, t2, c12
|
|
+ MUL b1, a2, t2
|
|
+ ADD2 c16, t3, c16
|
|
+ MUL b2, a2, t3
|
|
+
|
|
+ ADD4 c15, t4, c15
|
|
+ MUL b2, a1, t4
|
|
+ ADD1 c01, t1, c01
|
|
+ MUL b1, a3, t1
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ MUL b1, a4, t2
|
|
+ ADD2 c06, t3, c06
|
|
+ MUL b2, a4, t3
|
|
+
|
|
+ ADD4 c05, t4, c05
|
|
+ MUL b4, a1, t4
|
|
+ ADD1 c03, t1, c03
|
|
+ MUL b3, a1, t1
|
|
+
|
|
+ ADD3 c04, t2, c04
|
|
+ MUL b3, a2, t2
|
|
+ ADD2 c08, t3, c08
|
|
+ MUL b4, a2, t3
|
|
+
|
|
+ ADD4 c13, t4, c13
|
|
+ MUL b2, a3, t4
|
|
+ ADD1 c09, t1, c09
|
|
+ MUL b3, a3, t1
|
|
+
|
|
+ ADD3 c10, t2, c10
|
|
+ MUL b3, a4, t2
|
|
+ ADD2 c14, t3, c14
|
|
+ MUL b4, a4, t3
|
|
+
|
|
+ ADD4 c07, t4, c07
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL b4, a3, t4
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD1 c11, t1, c11
|
|
+ ADD3 c12, t2, c12
|
|
+ ADD2 c16, t3, c16
|
|
+ ADD4 c15, t4, c15
|
|
+
|
|
+ ADD c01, c06, c01
|
|
+ ADD c02, c05, c02
|
|
+ ADD c03, c08, c03
|
|
+ ADD c04, c07, c04
|
|
+
|
|
+ ADD c09, c14, c09
|
|
+ ADD c10, c13, c10
|
|
+ ADD c11, c16, c11
|
|
+ ADD c12, c15, c12
|
|
+ .align 4
|
|
+
|
|
+$L18:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 2, TMP1
|
|
+#else
|
|
+ subl KK, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -4 * SIZE(AO)
|
|
+ ldi BO, -4 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+ LD b4, 7 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+ SUB a3, c09, c09
|
|
+ SUB a4, c10, c10
|
|
+
|
|
+ SUB b1, c03, c03
|
|
+ SUB b2, c04, c04
|
|
+ SUB b3, c11, c11
|
|
+ SUB b4, c12, c12
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ LD b1, 4 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(AO)
|
|
+ LD b4, 7 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+ SUB a3, c03, c03
|
|
+ SUB a4, c04, c04
|
|
+
|
|
+ SUB b1, c09, c09
|
|
+ SUB b2, c10, c10
|
|
+ SUB b3, c11, c11
|
|
+ SUB b4, c12, c12
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 6 * SIZE(AO)
|
|
+ LD a2, 7 * SIZE(AO)
|
|
+ LD a3, 4 * SIZE(AO)
|
|
+ LD a4, 5 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c04, t1
|
|
+ MUL a2, c03, t2
|
|
+ MUL a2, c12, t3
|
|
+ MUL a2, c11, t4
|
|
+
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c04, c04
|
|
+ MUL a1, c11, c11
|
|
+ MUL a1, c12, c12
|
|
+
|
|
+ ADD5 c03, t1, c03
|
|
+ ADD6 c04, t2, c04
|
|
+ ADD5 c11, t3, c11
|
|
+ ADD6 c12, t4, c12
|
|
+
|
|
+ MUL a3, c03, t1
|
|
+ MUL a3, c04, t2
|
|
+ MUL a3, c11, t3
|
|
+ MUL a3, c12, t4
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c02, t2, c02
|
|
+ SUB c09, t3, c09
|
|
+ SUB c10, t4, c10
|
|
+
|
|
+ MUL a4, c04, t1
|
|
+ MUL a4, c03, t2
|
|
+ MUL a4, c12, t3
|
|
+ MUL a4, c11, t4
|
|
+
|
|
+ ADD6 c01, t1, c01
|
|
+ ADD5 c02, t2, c02
|
|
+ ADD6 c09, t3, c09
|
|
+ ADD5 c10, t4, c10
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a2, c10, t3
|
|
+ MUL a2, c09, t4
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c10, c10
|
|
+
|
|
+ ADD5 c01, t1, c01
|
|
+ ADD6 c02, t2, c02
|
|
+ ADD5 c09, t3, c09
|
|
+ ADD6 c10, t4, c10
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a2, c10, t3
|
|
+ MUL a2, c09, t4
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c10, c10
|
|
+
|
|
+ ADD5 c01, t1, c01
|
|
+ ADD6 c02, t2, c02
|
|
+ ADD5 c09, t3, c09
|
|
+ ADD6 c10, t4, c10
|
|
+
|
|
+ MUL a3, c01, t1
|
|
+ MUL a3, c02, t2
|
|
+ MUL a3, c09, t3
|
|
+ MUL a3, c10, t4
|
|
+
|
|
+ SUB c03, t1, c03
|
|
+ SUB c04, t2, c04
|
|
+ SUB c11, t3, c11
|
|
+ SUB c12, t4, c12
|
|
+
|
|
+ MUL a4, c02, t1
|
|
+ MUL a4, c01, t2
|
|
+ MUL a4, c10, t3
|
|
+ MUL a4, c09, t4
|
|
+
|
|
+ ADD6 c03, t1, c03
|
|
+ ADD5 c04, t2, c04
|
|
+ ADD6 c11, t3, c11
|
|
+ ADD5 c12, t4, c12
|
|
+
|
|
+ LD a1, 6 * SIZE(AO)
|
|
+ LD a2, 7 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c04, t1
|
|
+ MUL a2, c03, t2
|
|
+ MUL a2, c12, t3
|
|
+ MUL a2, c11, t4
|
|
+
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c04, c04
|
|
+ MUL a1, c11, c11
|
|
+ MUL a1, c12, c12
|
|
+
|
|
+ ADD5 c03, t1, c03
|
|
+ ADD6 c04, t2, c04
|
|
+ ADD5 c11, t3, c11
|
|
+ ADD6 c12, t4, c12
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a2, c04, t3
|
|
+ MUL a2, c03, t4
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c04, c04
|
|
+
|
|
+ ADD5 c01, t1, c01
|
|
+ ADD6 c02, t2, c02
|
|
+ ADD5 c03, t3, c03
|
|
+ ADD6 c04, t4, c04
|
|
+
|
|
+ MUL a3, c01, t1
|
|
+ MUL a3, c02, t2
|
|
+ MUL a3, c03, t3
|
|
+ MUL a3, c04, t4
|
|
+
|
|
+ SUB c09, t1, c09
|
|
+ SUB c10, t2, c10
|
|
+ SUB c11, t3, c11
|
|
+ SUB c12, t4, c12
|
|
+
|
|
+ MUL a4, c02, t1
|
|
+ MUL a4, c01, t2
|
|
+ MUL a4, c04, t3
|
|
+ MUL a4, c03, t4
|
|
+
|
|
+ ADD6 c09, t1, c09
|
|
+ ADD5 c10, t2, c10
|
|
+ ADD6 c11, t3, c11
|
|
+ ADD5 c12, t4, c12
|
|
+
|
|
+ LD a1, 6 * SIZE(BO)
|
|
+ LD a2, 7 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c10, t1
|
|
+ MUL a2, c09, t2
|
|
+ MUL a2, c12, t3
|
|
+ MUL a2, c11, t4
|
|
+
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c10, c10
|
|
+ MUL a1, c11, c11
|
|
+ MUL a1, c12, c12
|
|
+
|
|
+ ADD5 c09, t1, c09
|
|
+ ADD6 c10, t2, c10
|
|
+ ADD5 c11, t3, c11
|
|
+ ADD6 c12, t4, c12
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 6 * SIZE(BO)
|
|
+ LD a2, 7 * SIZE(BO)
|
|
+ LD a3, 4 * SIZE(BO)
|
|
+ LD a4, 5 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c10, t1
|
|
+ MUL a2, c09, t2
|
|
+ MUL a2, c12, t3
|
|
+ MUL a2, c11, t4
|
|
+
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c10, c10
|
|
+ MUL a1, c11, c11
|
|
+ MUL a1, c12, c12
|
|
+
|
|
+ ADD5 c09, t1, c09
|
|
+ ADD6 c10, t2, c10
|
|
+ ADD5 c11, t3, c11
|
|
+ ADD6 c12, t4, c12
|
|
+
|
|
+ MUL a3, c09, t1
|
|
+ MUL a3, c10, t2
|
|
+ MUL a3, c11, t3
|
|
+ MUL a3, c12, t4
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c02, t2, c02
|
|
+ SUB c03, t3, c03
|
|
+ SUB c04, t4, c04
|
|
+
|
|
+ MUL a4, c10, t1
|
|
+ MUL a4, c09, t2
|
|
+ MUL a4, c12, t3
|
|
+ MUL a4, c11, t4
|
|
+
|
|
+ ADD6 c01, t1, c01
|
|
+ ADD5 c02, t2, c02
|
|
+ ADD6 c03, t3, c03
|
|
+ ADD5 c04, t4, c04
|
|
+
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a2, c04, t3
|
|
+ MUL a2, c03, t4
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c04, c04
|
|
+
|
|
+ ADD5 c01, t1, c01
|
|
+ ADD6 c02, t2, c02
|
|
+ ADD5 c03, t3, c03
|
|
+ ADD6 c04, t4, c04
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c02, 1 * SIZE(BO)
|
|
+ ST c09, 2 * SIZE(BO)
|
|
+ ST c10, 3 * SIZE(BO)
|
|
+
|
|
+ ST c03, 4 * SIZE(BO)
|
|
+ ST c04, 5 * SIZE(BO)
|
|
+ ST c11, 6 * SIZE(BO)
|
|
+ ST c12, 7 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c03, 2 * SIZE(AO)
|
|
+ ST c04, 3 * SIZE(AO)
|
|
+
|
|
+ ST c09, 4 * SIZE(AO)
|
|
+ ST c10, 5 * SIZE(AO)
|
|
+ ST c11, 6 * SIZE(AO)
|
|
+ ST c12, 7 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -4 * SIZE(C1)
|
|
+ ldi C2, -4 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c03, 2 * SIZE(C1)
|
|
+ ST c04, 3 * SIZE(C1)
|
|
+
|
|
+ ST c09, 0 * SIZE(C2)
|
|
+ ST c10, 1 * SIZE(C2)
|
|
+ ST c11, 2 * SIZE(C2)
|
|
+ ST c12, 3 * SIZE(C2)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+ ldi C2, 4 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, ZBASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ addl BO, TMP1, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+ fclr c01
|
|
+ fclr c05
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L11
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ and M, 1, I
|
|
+ ble I, $L29
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c10
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c14
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+
|
|
+ ldi L, -2(KK)
|
|
+
|
|
+ ble KK, $L28
|
|
+ ble L, $L25
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, ZBASE_SHIFT + 0, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, ZBASE_SHIFT + 0, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, ZBASE_SHIFT + 1, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c10
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c14
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+
|
|
+ ble TMP1, $L28
|
|
+ ble L, $L25
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L22:
|
|
+ ADD1 c09, t1, c09
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c10, t2, c10
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD4 c13, t3, c13
|
|
+ unop
|
|
+ MUL a1, b2, t3
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+
|
|
+ ADD2 c14, t4, c14
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD b2, -7 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ LD b3, -6 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, c05
|
|
+ unop
|
|
+ MUL a1, b4, t3
|
|
+ LD a1, 2 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, c06
|
|
+ MUL a2, b4, t4
|
|
+ LD b5, -5 * SIZE(BO)
|
|
+
|
|
+ ADD1 c09, t1, c09
|
|
+ unop
|
|
+ MUL a3, b1, t1
|
|
+ LD a2, 3 * SIZE(AO)
|
|
+
|
|
+ ADD3 c10, t2, c10
|
|
+ unop
|
|
+ MUL a4, b1, t2
|
|
+ LD b1, -4 * SIZE(BO)
|
|
+
|
|
+ ADD4 c13, t3, c13
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD2 c14, t4, c14
|
|
+ MUL a4, b2, t4
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ ldi L, -2(L)
|
|
+ MUL a3, b3, t1
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL a4, b3, t2
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, c05
|
|
+ unop
|
|
+ MUL a3, b5, t3
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, c06
|
|
+ MUL a4, b5, t4
|
|
+ LD a4, 1 * SIZE(AO)
|
|
+ bgt L, $L22
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ ADD1 c09, t1, c09
|
|
+ MUL a1, b1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L27
|
|
+#else
|
|
+ blbs TMP1, $L27
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD3 c10, t2, c10
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD4 c13, t3, c13
|
|
+ unop
|
|
+ MUL a1, b2, t3
|
|
+ unop
|
|
+
|
|
+ ADD2 c14, t4, c14
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, c05
|
|
+ unop
|
|
+ MUL a1, b4, t3
|
|
+ LD a1, -2 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, c06
|
|
+ unop
|
|
+ MUL a2, b4, t4
|
|
+ LD a2, -1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c09, t1, c09
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L27:
|
|
+ ADD3 c10, t2, c10
|
|
+ MUL a2, b1, t2
|
|
+ ADD4 c13, t3, c13
|
|
+ MUL a1, b2, t3
|
|
+
|
|
+ ADD2 c14, t4, c14
|
|
+ MUL a2, b2, t4
|
|
+ ADD1 c01, t1, c01
|
|
+ MUL a1, b3, t1
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ MUL a2, b3, t2
|
|
+ ADD4 c05, t3, c05
|
|
+ MUL a1, b4, t3
|
|
+
|
|
+ ADD2 c06, t4, c06
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a2, b4, t4
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD1 c09, t1, c09
|
|
+ ADD3 c10, t2, c10
|
|
+ ADD4 c13, t3, c13
|
|
+ ADD2 c14, t4, c14
|
|
+
|
|
+ ADD c01, c06, c01
|
|
+ ADD c02, c05, c02
|
|
+ ADD c09, c14, c09
|
|
+ ADD c10, c13, c10
|
|
+ .align 4
|
|
+
|
|
+$L28:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 1, TMP1
|
|
+#else
|
|
+ subl KK, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, ZBASE_SHIFT + 0, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -2 * SIZE(AO)
|
|
+ ldi BO, -4 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+ SUB a3, c09, c09
|
|
+ SUB a4, c10, c10
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+ SUB a3, c09, c09
|
|
+ SUB a4, c10, c10
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a2, c10, t3
|
|
+ MUL a2, c09, t4
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c10, c10
|
|
+
|
|
+ ADD5 c01, t1, c01
|
|
+ ADD6 c02, t2, c02
|
|
+ ADD5 c09, t3, c09
|
|
+ ADD6 c10, t4, c10
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+
|
|
+ ADD5 c01, t1, c01
|
|
+ ADD6 c02, t2, c02
|
|
+
|
|
+ MUL a3, c01, t1
|
|
+ MUL a3, c02, t2
|
|
+ SUB c09, t1, c09
|
|
+ SUB c10, t2, c10
|
|
+
|
|
+ MUL a4, c02, t1
|
|
+ MUL a4, c01, t2
|
|
+ ADD6 c09, t1, c09
|
|
+ ADD5 c10, t2, c10
|
|
+
|
|
+ LD a1, 6 * SIZE(BO)
|
|
+ LD a2, 7 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c10, t1
|
|
+ MUL a2, c09, t2
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c10, c10
|
|
+
|
|
+ ADD5 c09, t1, c09
|
|
+ ADD6 c10, t2, c10
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 6 * SIZE(BO)
|
|
+ LD a2, 7 * SIZE(BO)
|
|
+ LD a3, 4 * SIZE(BO)
|
|
+ LD a4, 5 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c10, t1
|
|
+ MUL a2, c09, t2
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c10, c10
|
|
+
|
|
+ ADD5 c09, t1, c09
|
|
+ ADD6 c10, t2, c10
|
|
+
|
|
+ MUL a3, c09, t1
|
|
+ MUL a3, c10, t2
|
|
+ SUB c01, t1, c01
|
|
+ SUB c02, t2, c02
|
|
+
|
|
+ MUL a4, c10, t1
|
|
+ MUL a4, c09, t2
|
|
+ ADD6 c01, t1, c01
|
|
+ ADD5 c02, t2, c02
|
|
+
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+
|
|
+ ADD5 c01, t1, c01
|
|
+ ADD6 c02, t2, c02
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c02, 1 * SIZE(BO)
|
|
+ ST c09, 2 * SIZE(BO)
|
|
+ ST c10, 3 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c09, 2 * SIZE(AO)
|
|
+ ST c10, 3 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -2 * SIZE(C1)
|
|
+ ldi C2, -2 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c09, 0 * SIZE(C2)
|
|
+ ST c10, 1 * SIZE(C2)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 2 * SIZE(C1)
|
|
+ ldi C2, 2 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, ZBASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, ZBASE_SHIFT + 0, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L29:
|
|
+#ifdef LN
|
|
+ sll K, ZBASE_SHIFT + 1, TMP1
|
|
+ addl B, TMP1, B
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ mov BO, B
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+ ldi J, -1(J)
|
|
+ bgt J, $L01
|
|
+ .align 4
|
|
+
|
|
+$L30:
|
|
+ and N, 1, J
|
|
+ ble J, $L999
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, ZBASE_SHIFT, TMP1
|
|
+ subl B, TMP1, B
|
|
+
|
|
+ subl C, LDC, C1
|
|
+ subl C, LDC, C
|
|
+#else
|
|
+ mov C, C1
|
|
+ addl C, LDC, C
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ addl M, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+ mov A, AORIG
|
|
+#else
|
|
+ mov A, AO
|
|
+#endif
|
|
+
|
|
+ sra M, 1, I
|
|
+ ble I, $L50
|
|
+ .align 4
|
|
+
|
|
+$L41:
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c05
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c06
|
|
+
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c07
|
|
+
|
|
+ ldi L, -2(KK)
|
|
+ fclr c04
|
|
+ fclr c08
|
|
+
|
|
+ ble KK, $L48
|
|
+ ble L, $L45
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, ZBASE_SHIFT + 1, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, ZBASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, ZBASE_SHIFT, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c05
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c06
|
|
+
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c07
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+ fclr c04
|
|
+ fclr c08
|
|
+
|
|
+ ble TMP1, $L48
|
|
+ ble L, $L45
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L42:
|
|
+ ADD4 c05, t1, c05
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD2 c06, t2, c06
|
|
+ ldi L, -2(L)
|
|
+ MUL a2, b1, t2
|
|
+ unop
|
|
+
|
|
+ ADD4 c07, t3, c07
|
|
+ unop
|
|
+ MUL a3, b1, t3
|
|
+ unop
|
|
+
|
|
+ ADD2 c08, t4, c08
|
|
+ unop
|
|
+ MUL a4, b1, t4
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b2, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ MUL a2, b2, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c03, t3, c03
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c04, t4, c04
|
|
+ unop
|
|
+ MUL a4, b2, t4
|
|
+ LD a5, 3 * SIZE(AO)
|
|
+
|
|
+ ADD4 c05, t1, c05
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ LD b2, -1 * SIZE(BO)
|
|
+
|
|
+ ADD2 c06, t2, c06
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ unop
|
|
+
|
|
+ ADD4 c07, t3, c07
|
|
+ unop
|
|
+ MUL a3, b3, t3
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+
|
|
+ ADD2 c08, t4, c08
|
|
+ unop
|
|
+ MUL a5, b3, t4
|
|
+ LD b3, 0 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b4, t1
|
|
+ LD a1, -4 * SIZE(AO)
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b4, t2
|
|
+ LD a2, -3 * SIZE(AO)
|
|
+
|
|
+ ADD1 c03, t3, c03
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+ MUL a3, b4, t3
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c04, t4, c04
|
|
+ MUL a5, b4, t4
|
|
+ LD b4, 1 * SIZE(BO)
|
|
+ bgt L, $L42
|
|
+ .align 4
|
|
+
|
|
+$L45:
|
|
+ ADD4 c05, t1, c05
|
|
+ MUL b1, a1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L47
|
|
+#else
|
|
+ blbs TMP1, $L47
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD2 c06, t2, c06
|
|
+ MUL a2, b1, t2
|
|
+ ADD4 c07, t3, c07
|
|
+ MUL a3, b1, t3
|
|
+
|
|
+ ADD2 c08, t4, c08
|
|
+ unop
|
|
+ MUL a4, b1, t4
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b2, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b2, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c03, t3, c03
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c04, t4, c04
|
|
+ MUL a4, b2, t4
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD4 c05, t1, c05
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L47:
|
|
+ ADD2 c06, t2, c06
|
|
+ MUL a2, b1, t2
|
|
+ ADD4 c07, t3, c07
|
|
+ MUL a3, b1, t3
|
|
+
|
|
+ ADD2 c08, t4, c08
|
|
+ MUL a4, b1, t4
|
|
+ ADD1 c01, t1, c01
|
|
+ MUL a1, b2, t1
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ MUL a2, b2, t2
|
|
+ ADD1 c03, t3, c03
|
|
+ MUL a3, b2, t3
|
|
+
|
|
+ ADD3 c04, t4, c04
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL a4, b2, t4
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t1, c05
|
|
+ ADD2 c06, t2, c06
|
|
+ ADD4 c07, t3, c07
|
|
+ ADD2 c08, t4, c08
|
|
+
|
|
+ ADD c01, c06, c01
|
|
+ ADD c02, c05, c02
|
|
+ ADD c03, c08, c03
|
|
+ ADD c04, c07, c04
|
|
+
|
|
+$L48:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 2, TMP1
|
|
+#else
|
|
+ subl KK, 1, TMP1
|
|
+#endif
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -4 * SIZE(AO)
|
|
+ ldi BO, -2 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+ SUB a3, c03, c03
|
|
+ SUB a4, c04, c04
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+ SUB a3, c03, c03
|
|
+ SUB a4, c04, c04
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 6 * SIZE(AO)
|
|
+ LD a2, 7 * SIZE(AO)
|
|
+ LD a3, 4 * SIZE(AO)
|
|
+ LD a4, 5 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c04, t1
|
|
+ MUL a2, c03, t2
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c04, c04
|
|
+
|
|
+ ADD5 c03, t1, c03
|
|
+ ADD6 c04, t2, c04
|
|
+ MUL a3, c03, t1
|
|
+ MUL a3, c04, t2
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c02, t2, c02
|
|
+ MUL a4, c04, t1
|
|
+ MUL a4, c03, t2
|
|
+
|
|
+ ADD6 c01, t1, c01
|
|
+ ADD5 c02, t2, c02
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+
|
|
+ ADD5 c01, t1, c01
|
|
+ ADD6 c02, t2, c02
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+
|
|
+ ADD5 c01, t1, c01
|
|
+ ADD6 c02, t2, c02
|
|
+ MUL a3, c01, t1
|
|
+ MUL a3, c02, t2
|
|
+
|
|
+ SUB c03, t1, c03
|
|
+ SUB c04, t2, c04
|
|
+
|
|
+ MUL a4, c02, t1
|
|
+ MUL a4, c01, t2
|
|
+ ADD6 c03, t1, c03
|
|
+ ADD5 c04, t2, c04
|
|
+
|
|
+ LD a1, 6 * SIZE(AO)
|
|
+ LD a2, 7 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c04, t1
|
|
+ MUL a2, c03, t2
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c04, c04
|
|
+
|
|
+ ADD5 c03, t1, c03
|
|
+ ADD6 c04, t2, c04
|
|
+#endif
|
|
+
|
|
+#if defined(RN) || defined(RT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a2, c04, t3
|
|
+ MUL a2, c03, t4
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c04, c04
|
|
+
|
|
+ ADD5 c01, t1, c01
|
|
+ ADD6 c02, t2, c02
|
|
+ ADD5 c03, t3, c03
|
|
+ ADD6 c04, t4, c04
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c02, 1 * SIZE(BO)
|
|
+ ST c03, 2 * SIZE(BO)
|
|
+ ST c04, 3 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c03, 2 * SIZE(AO)
|
|
+ ST c04, 3 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -4 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c03, 2 * SIZE(C1)
|
|
+ ST c04, 3 * SIZE(C1)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, ZBASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L41
|
|
+ .align 4
|
|
+
|
|
+$L50:
|
|
+ and M, 1, I
|
|
+ ble I, $L59
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c05
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c06
|
|
+
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+
|
|
+ ldi L, -2(KK)
|
|
+
|
|
+ ble KK, $L58
|
|
+ ble L, $L55
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, ZBASE_SHIFT, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, ZBASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, ZBASE_SHIFT, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c06
|
|
+
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+
|
|
+ ble TMP1, $L58
|
|
+ ble L, $L55
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L52:
|
|
+ ADD1 c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, c05
|
|
+ ldi L, -2(L)
|
|
+ MUL a1, b2, t3
|
|
+ LD a1, -2 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, c06
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD a2, -1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ LD b2, 3 * SIZE(BO)
|
|
+ MUL a3, b3, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL a4, b3, t2
|
|
+ LD b3, 0 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, c05
|
|
+ unop
|
|
+ MUL a3, b4, t3
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, c06
|
|
+ MUL a4, b4, t4
|
|
+ LD b4, 1 * SIZE(BO)
|
|
+ unop
|
|
+
|
|
+ LD a4, 1 * SIZE(AO)
|
|
+ unop
|
|
+ unop
|
|
+ bgt L, $L52
|
|
+ .align 4
|
|
+
|
|
+$L55:
|
|
+ ADD1 c01, t1, c01
|
|
+ MUL a1, b1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L57
|
|
+#else
|
|
+ blbs TMP1, $L57
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, c05
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ MUL a1, b2, t3
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, c06
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ LD b2, -1 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ .align 4
|
|
+
|
|
+$L57:
|
|
+ ADD3 c02, t2, c02
|
|
+ MUL a2, b1, t2
|
|
+ ADD4 c05, t3, c05
|
|
+ MUL a1, b2, t3
|
|
+
|
|
+ ADD2 c06, t4, c06
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a2, b2, t4
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ ADD3 c02, t2, c02
|
|
+ ADD4 c05, t3, c05
|
|
+ ADD2 c06, t4, c06
|
|
+
|
|
+ ADD c01, c06, c01
|
|
+ ADD c02, c05, c02
|
|
+
|
|
+$L58:
|
|
+#if defined(LN) || defined(RT)
|
|
+ subl KK, 1, TMP1
|
|
+
|
|
+ sll TMP1, ZBASE_SHIFT, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -2 * SIZE(AO)
|
|
+ ldi BO, -2 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+
|
|
+ ADD5 c01, t1, c01
|
|
+ ADD6 c02, t2, c02
|
|
+#endif
|
|
+
|
|
+#if defined(RN) || defined(RT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+
|
|
+ ADD5 c01, t1, c01
|
|
+ ADD6 c02, t2, c02
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c02, 1 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -2 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 2 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, ZBASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, ZBASE_SHIFT, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L59:
|
|
+#ifdef LN
|
|
+ sll K, ZBASE_SHIFT, TMP1
|
|
+ addl B, TMP1, B
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ mov BO, B
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L999:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+ clr $0
+ ldi $sp, STACKSIZE($sp)
+ ret
+ .ident VERSION
+ .end CNAME
diff --git a/kernel/sw_64/ztrsm_kernel_2x2_RT.S b/kernel/sw_64/ztrsm_kernel_2x2_RT.S
new file mode 100644
index 0000000..97dbc16
--- /dev/null
+++ b/kernel/sw_64/ztrsm_kernel_2x2_RT.S
@@ -0,0 +1,2623 @@
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
|
|
+
|
|
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#if !defined(EV4) && !defined(EV5) && !defined(SW6)
+#error "Architecture is not specified."
+#endif
+
+#ifdef SW6
+#define PREFETCHSIZE 56
+#define UNOP unop
+#endif
+
+#ifdef EV5
+#define PREFETCHSIZE 48
+#define UNOP
+#endif
+
+#ifdef EV4
+#define UNOP
+#endif
+
+ .set noat
+ .set noreorder
+ .arch sw6a
+
+.text
+ .align 5
+ .globl CNAME
+ .ent CNAME
+
+#define STACKSIZE 88
+
|
|
+#define M $16
|
|
+#define N $17
|
|
+#define K $18
|
|
+#define A $21
|
|
+#define B $22
|
|
+#define C $20
|
|
+#define LDC $23
|
|
+
|
|
+#define C1 $19
|
|
+#define C2 $24
|
|
+
|
|
+#define AO $at
|
|
+#define BO $5
|
|
+#define I $6
|
|
+#define J $7
|
|
+#define L $8
|
|
+
|
|
+#define tmp $9
|
|
+
|
|
+#define a1 $f16
|
|
+#define a2 $f17
|
|
+#define a3 $f18
|
|
+#define a4 $f19
|
|
+
|
|
+#define b1 $f20
|
|
+#define b2 $f21
|
|
+#define b3 $f22
|
|
+#define b4 $f23
|
|
+
|
|
+#define t1 $f24
|
|
+#define t2 $f25
|
|
+#define t3 $f26
|
|
+#define t4 $f27
|
|
+
|
|
+#define a5 $f28
|
|
+#define a6 $f30
|
|
+#define b5 $f29
|
|
+
|
|
+#define alpha_i $f29
|
|
+#define alpha_r $f30
|
|
+
|
|
+#define c01 $f0
|
|
+#define c02 $f1
|
|
+#define c03 $f2
|
|
+#define c04 $f3
|
|
+
|
|
+#define c05 $f4
|
|
+#define c06 $f5
|
|
+#define c07 $f6
|
|
+#define c08 $f7
|
|
+
|
|
+#define c09 $f8
|
|
+#define c10 $f9
|
|
+#define c11 $f10
|
|
+#define c12 $f11
|
|
+
|
|
+#define c13 $f12
|
|
+#define c14 $f13
|
|
+#define c15 $f14
|
|
+#define c16 $f15
|
|
+
|
|
+#define TMP1 $0
|
|
+#define TMP2 $1
|
|
+#define KK $2
|
|
+#define AORIG $3
|
|
+#define OFFSET $4
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+#ifndef CONJ
|
|
+#define ADD1 ADD
|
|
+#define ADD2 SUB
|
|
+#define ADD3 ADD
|
|
+#define ADD4 ADD
|
|
+#define ADD5 SUB
|
|
+#define ADD6 ADD
|
|
+#else
|
|
+#define ADD1 ADD
|
|
+#define ADD2 ADD
|
|
+#define ADD3 SUB
|
|
+#define ADD4 ADD
|
|
+#define ADD5 ADD
|
|
+#define ADD6 SUB
|
|
+#endif
|
|
+#else
|
|
+#ifndef CONJ
|
|
+#define ADD1 ADD
|
|
+#define ADD2 SUB
|
|
+#define ADD3 ADD
|
|
+#define ADD4 ADD
|
|
+#define ADD5 SUB
|
|
+#define ADD6 ADD
|
|
+#else
|
|
+#define ADD1 ADD
|
|
+#define ADD2 ADD
|
|
+#define ADD3 ADD
|
|
+#define ADD4 SUB
|
|
+#define ADD5 ADD
|
|
+#define ADD6 SUB
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+
|
|
+CNAME:
|
|
+ .frame $sp, STACKSIZE, $26, 0
|
|
+
|
|
+#ifdef PROFILE
|
|
+ ldgp $gp, 0($27)
|
|
+ ldi $at, _mcount
|
|
+ jsr $at, ($at), _mcount
|
|
+#endif
|
|
+
|
|
+#ifndef PROFILE
|
|
+ .prologue 0
|
|
+#else
|
|
+ .prologue 1
|
|
+#endif
|
|
+
|
|
+ ldi $sp, -STACKSIZE($sp)
|
|
+
|
|
+ ldl B, 0 + STACKSIZE($sp)
|
|
+ ldl C, 8 + STACKSIZE($sp)
|
|
+ ldl LDC, 16 + STACKSIZE($sp)
|
|
+ ldl OFFSET, 24 + STACKSIZE($sp)
|
|
+
|
|
+ sll LDC, ZBASE_SHIFT, LDC
|
|
+
|
|
+ fstd $f2, 0($sp)
|
|
+ fstd $f3, 8($sp)
|
|
+ fstd $f4, 16($sp)
|
|
+ fstd $f5, 24($sp)
|
|
+ fstd $f6, 32($sp)
|
|
+ fstd $f7, 40($sp)
|
|
+ fstd $f8, 48($sp)
|
|
+ fstd $f9, 56($sp)
|
|
+ stl tmp, 72($sp)
|
|
+
|
|
+ cmple M, 0, $0
|
|
+ cmple N, 0, $1
|
|
+ cmple K, 0, $2
|
|
+
|
|
+ or $0, $1, $0
|
|
+ or $0, $2, $0
|
|
+ bne $0, $L999
|
|
+
|
|
+#ifdef LN
|
|
+ addl M, M, TMP2
|
|
+ mull TMP2, K, TMP1
|
|
+ SXADDQ TMP1, A, A
|
|
+ SXADDQ TMP2, C, C
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ negl OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ mull N, K, TMP1
|
|
+ addl TMP1, TMP1, TMP1
|
|
+ SXADDQ TMP1, B, B
|
|
+
|
|
+ mull N, LDC, TMP1
|
|
+ addl TMP1, C, C
|
|
+
|
|
+ subl N, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+ and N, 1, J
|
|
+ ble J, $L30
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, ZBASE_SHIFT, TMP1
|
|
+ subl B, TMP1, B
|
|
+
|
|
+ subl C, LDC, C1
|
|
+ subl C, LDC, C
|
|
+#else
|
|
+ mov C, C1
|
|
+ addl C, LDC, C
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ addl M, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+ mov A, AORIG
|
|
+#else
|
|
+ mov A, AO
|
|
+#endif
|
|
+
|
|
+ sra M, 1, I
|
|
+ ble I, $L50
|
|
+ .align 4
|
|
+
|
|
+$L41:
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c05
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c06
|
|
+
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c07
|
|
+
|
|
+ ldi L, -2(KK)
|
|
+ fclr c04
|
|
+ fclr c08
|
|
+
|
|
+ ble KK, $L48
|
|
+ ble L, $L45
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, ZBASE_SHIFT + 1, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, ZBASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, ZBASE_SHIFT, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c05
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c06
|
|
+
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c07
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+ fclr c04
|
|
+ fclr c08
|
|
+
|
|
+ ble TMP1, $L48
|
|
+ ble L, $L45
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L42:
|
|
+ ADD4 c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD2 c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ ldi L, -2(L)
|
|
+ MUL a2, b1, t2
|
|
+ unop
|
|
+
|
|
+ ADD4 c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ unop
|
|
+ MUL a3, b1, t3
|
|
+ unop
|
|
+
|
|
+ ADD2 c08, t4, b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL a4, b1, t4
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b2, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ MUL a2, b2, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ unop
|
|
+ MUL a4, b2, t4
|
|
+ LD a5, 3 * SIZE(AO)
|
|
+
|
|
+ ADD4 c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ LD b2, -1 * SIZE(BO)
|
|
+
|
|
+ ADD2 c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ unop
|
|
+
|
|
+ ADD4 c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ unop
|
|
+ MUL a3, b3, t3
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+
|
|
+ ADD2 c08, t4, b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL a5, b3, t4
|
|
+ LD b3, 0 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b4, t1
|
|
+ LD a1, -4 * SIZE(AO)
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b4, t2
|
|
+ LD a2, -3 * SIZE(AO)
|
|
+
|
|
+ ADD1 c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+ MUL a3, b4, t3
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a5, b4, t4
|
|
+ LD b4, 1 * SIZE(BO)
|
|
+ bgt L, $L42
|
|
+ .align 4
|
|
+
|
|
+$L45:
|
|
+ ADD4 c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ MUL b1, a1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L47
|
|
+#else
|
|
+ blbs TMP1, $L47
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD2 c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ MUL a2, b1, t2
|
|
+ ADD4 c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ MUL a3, b1, t3
|
|
+
|
|
+ ADD2 c08, t4, b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL a4, b1, t4
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b2, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b2, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ MUL a4, b2, t4
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD4 c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L47:
|
|
+ ADD2 c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ MUL a2, b1, t2
|
|
+ ADD4 c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ MUL a3, b1, t3
|
|
+
|
|
+ ADD2 c08, t4, b5
|
|
+ fmov b5, c08
|
|
+ MUL a4, b1, t4
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b2, t1
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, b2, t2
|
|
+ ADD1 c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ MUL a3, b2, t3
|
|
+
|
|
+ ADD3 c04, t4, b5
|
|
+ fmov b5, c04
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL a4, b2, t4
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t1, b5
|
|
+ fmov b5, c05
|
|
+ ADD2 c06, t2, b5
|
|
+ fmov b5, c06
|
|
+ ADD4 c07, t3, b5
|
|
+ fmov b5, c07
|
|
+ ADD2 c08, t4, b5
|
|
+ fmov b5, c08
|
|
+
|
|
+ ADD c01, c06, b5
|
|
+ fmov b5, c01
|
|
+ ADD c02, c05, b5
|
|
+ fmov b5, c02
|
|
+ ADD c03, c08, b5
|
|
+ fmov b5, c03
|
|
+ ADD c04, c07, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+$L48:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 2, TMP1
|
|
+#else
|
|
+ subl KK, 1, TMP1
|
|
+#endif
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -4 * SIZE(AO)
|
|
+ ldi BO, -2 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a3, c03, b5
|
|
+ fmov b5, c03
|
|
+ SUB a4, c04, b5
|
|
+ fmov b5, c04
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a3, c03, b5
|
|
+ fmov b5, c03
|
|
+ SUB a4, c04, b5
|
|
+ fmov b5, c04
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 6 * SIZE(AO)
|
|
+ LD a2, 7 * SIZE(AO)
|
|
+ LD a3, 4 * SIZE(AO)
|
|
+ LD a4, 5 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c04, t1
|
|
+ MUL a2, c03, t2
|
|
+ MUL a1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a1, c04, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ ADD5 c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ ADD6 c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ MUL a3, c03, t1
|
|
+ MUL a3, c04, t2
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a4, c04, t1
|
|
+ MUL a4, c03, t2
|
|
+
|
|
+ ADD6 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD5 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ ADD5 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD6 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ ADD5 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD6 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a3, c01, t1
|
|
+ MUL a3, c02, t2
|
|
+
|
|
+ SUB c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ SUB c04, t2, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ MUL a4, c02, t1
|
|
+ MUL a4, c01, t2
|
|
+ ADD6 c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ ADD5 c04, t2, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ LD a1, 6 * SIZE(AO)
|
|
+ LD a2, 7 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c04, t1
|
|
+ MUL a2, c03, t2
|
|
+ MUL a1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a1, c04, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ ADD5 c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ ADD6 c04, t2, b5
|
|
+ fmov b5, c04
|
|
+#endif
|
|
+
|
|
+#if defined(RN) || defined(RT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a2, c04, t3
|
|
+ MUL a2, c03, t4
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a1, c04, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ ADD5 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD6 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD5 c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ ADD6 c04, t4, b5
|
|
+ fmov b5, c04
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c02, 1 * SIZE(BO)
|
|
+ ST c03, 2 * SIZE(BO)
|
|
+ ST c04, 3 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c03, 2 * SIZE(AO)
|
|
+ ST c04, 3 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -4 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c03, 2 * SIZE(C1)
|
|
+ ST c04, 3 * SIZE(C1)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, ZBASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L41
|
|
+ .align 4
|
|
+
|
|
+$L50:
|
|
+ and M, 1, I
|
|
+ ble I, $L59
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c05
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c06
|
|
+
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+
|
|
+ ldi L, -2(KK)
|
|
+
|
|
+ ble KK, $L58
|
|
+ ble L, $L55
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, ZBASE_SHIFT, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, ZBASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, ZBASE_SHIFT, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c06
|
|
+
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+
|
|
+ ble TMP1, $L58
|
|
+ ble L, $L55
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L52:
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ ldi L, -2(L)
|
|
+ MUL a1, b2, t3
|
|
+ LD a1, -2 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD a2, -1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ LD b2, 3 * SIZE(BO)
|
|
+ MUL a3, b3, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a4, b3, t2
|
|
+ LD b3, 0 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL a3, b4, t3
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ MUL a4, b4, t4
|
|
+ LD b4, 1 * SIZE(BO)
|
|
+ unop
|
|
+
|
|
+ LD a4, 1 * SIZE(AO)
|
|
+ unop
|
|
+ unop
|
|
+ bgt L, $L52
|
|
+ .align 4
|
|
+
|
|
+$L55:
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L57
|
|
+#else
|
|
+ blbs TMP1, $L57
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ MUL a1, b2, t3
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ LD b2, -1 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ .align 4
|
|
+
|
|
+$L57:
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, b1, t2
|
|
+ ADD4 c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, b2, t3
|
|
+
|
|
+ ADD2 c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a2, b2, t4
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD4 c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ ADD2 c06, t4, b5
|
|
+ fmov b5, c06
|
|
+
|
|
+ ADD c01, c06, b5
|
|
+ fmov b5, c01
|
|
+ ADD c02, c05, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+$L58:
|
|
+#if defined(LN) || defined(RT)
|
|
+ subl KK, 1, TMP1
|
|
+
|
|
+ sll TMP1, ZBASE_SHIFT, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -2 * SIZE(AO)
|
|
+ ldi BO, -2 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ ADD5 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD6 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+#endif
|
|
+
|
|
+#if defined(RN) || defined(RT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ ADD5 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD6 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c02, 1 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -2 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 2 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, ZBASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, ZBASE_SHIFT, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
+
+$L59:
+#ifdef LN
+ sll K, ZBASE_SHIFT, TMP1
+ addl B, TMP1, B
+#endif
+
+#if defined(LT) || defined(RN)
+ mov BO, B
+#endif
+
+#ifdef RN
+ addl KK, 1, KK
+#endif
+
+#ifdef RT
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L30:
+ sra N, 1, J
+ ble J, $L999
+ .align 4
+
+$L01:
+#ifdef RT
+ sll K, ZBASE_SHIFT + 1, TMP1
+ subl B, TMP1, B
+
+ subl C, LDC, C2
+ subl C2, LDC, C1
+ subl C2, LDC, C
+#else
+ mov C, C1
+ addl C, LDC, C2
+ addl C2, LDC, C
+#endif
+
|
|
+#ifdef LN
|
|
+ addl M, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+ mov A, AORIG
|
|
+#else
|
|
+ mov A, AO
|
|
+#endif
|
|
+
|
|
+ sra M, 1, I
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+ fclr c01
|
|
+ fclr c05
|
|
+
|
|
+ ble I, $L20
|
|
+ .align 4
|
|
+
|
|
+$L11:
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c10
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c14
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c07
|
|
+
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+ fclr c11
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c15
|
|
+
|
|
+ fillcs 4 * SIZE(C1)
|
|
+ fclr c04
|
|
+ ldi L, -2(KK)
|
|
+ fclr c08
|
|
+
|
|
+ fillcs 4 * SIZE(C2)
|
|
+ fclr c12
|
|
+ fclr c16
|
|
+ ble KK, $L18
|
|
+ ble L, $L15
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, ZBASE_SHIFT + 1, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, ZBASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c10
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c14
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c07
|
|
+
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ fclr c11
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c15
|
|
+
|
|
+ fillcs 4 * SIZE(C1)
|
|
+ fclr c04
|
|
+ ldi L, -2(TMP1)
|
|
+ fclr c08
|
|
+
|
|
+ fillcs 4 * SIZE(C2)
|
|
+ fclr c12
|
|
+ fclr c16
|
|
+ ble TMP1, $L18
|
|
+ ble L, $L15
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L12:
|
|
+/* 1 */
|
|
+ ADD1 c11, t1, b5
|
|
+ fmov b5, c11
|
|
+#ifndef EV4
|
|
+ fillcs PREFETCHSIZE * SIZE(AO)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+ MUL b1, a1, t1
|
|
+#ifndef EV4
|
|
+ fillcs PREFETCHSIZE * SIZE(BO)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD3 c12, t2, b5
|
|
+ fmov b5, c12
|
|
+ unop
|
|
+ MUL b1, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c16, t3, b5
|
|
+ fmov b5, c16
|
|
+ unop
|
|
+ MUL b2, a2, t3
|
|
+ LD a5, 0 * SIZE(AO)
|
|
+
|
|
+ ADD4 c15, t4, b5
|
|
+ fmov b5, c15
|
|
+ unop
|
|
+ MUL b2, a1, t4
|
|
+ LD b5, 0 * SIZE(BO)
|
|
+ FIMOVD b5, tmp
|
|
+/* 2 */
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ UNOP
|
|
+ MUL b1, a3, t1
|
|
+ UNOP
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ UNOP
|
|
+ MUL b1, a4, t2
|
|
+ UNOP
|
|
+
|
|
+ ADD2 c06, t3, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL b2, a4, t3
|
|
+ unop
|
|
+
|
|
+ ADD4 c05, t4, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL b4, a1, t4
|
|
+ unop
|
|
+
|
|
+/* 3 */
|
|
+ ADD1 c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL b3, a1, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ unop
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c08, t3, b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD4 c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL b2, a3, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+/* 4 */
|
|
+ ADD1 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ unop
|
|
+ MUL b3, a3, t1
|
|
+ LD a6, 2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD2 c14, t3, b5
|
|
+ fmov b5, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ ADD4 c07, t4, b5
|
|
+ fmov b5, c07
|
|
+ unop
|
|
+ MUL b4, a3, t4
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+
|
|
+/* 5 */
|
|
+ ADD1 c11, t1, b5
|
|
+ fmov b5, c11
|
|
+ unop
|
|
+ IFMOVD tmp, b5
|
|
+ MUL b5, a5, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD3 c12, t2, b5
|
|
+ fmov b5, c12
|
|
+ ldi L, -2(L)
|
|
+ IFMOVD tmp, b5
|
|
+ MUL b5, a2, t2
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD2 c16, t3, b5
|
|
+ fmov b5, c16
|
|
+ unop
|
|
+ MUL b2, a2, t3
|
|
+ unop
|
|
+
|
|
+ ADD4 c15, t4, b5
|
|
+ fmov b5, c15
|
|
+ unop
|
|
+ MUL b2, a5, t4
|
|
+ unop
|
|
+
|
|
+/* 6 */
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ IFMOVD tmp, b5
|
|
+ MUL b5, a6, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ IFMOVD tmp, b5
|
|
+ MUL b5, a4, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c06, t3, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL b2, a4, t3
|
|
+ unop
|
|
+
|
|
+ ADD4 c05, t4, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL b4, a5, t4
|
|
+ unop
|
|
+
|
|
+/* 7 */
|
|
+ ADD1 c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+ MUL b3, a5, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c08, t3, b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, -3 * SIZE(AO)
|
|
+
|
|
+ ADD4 c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL b2, a6, t4
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+/* 8 */
|
|
+ ADD1 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ unop
|
|
+ MUL b3, a6, t1
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD2 c14, t3, b5
|
|
+ fmov b5, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+
|
|
+ ADD4 c07, t4, b5
|
|
+ fmov b5, c07
|
|
+ MUL b4, a6, t4
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+ bgt L, $L12
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ ADD1 c11, t1, b5
|
|
+ fmov b5, c11
|
|
+ unop
|
|
+ MUL b1, a1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L17
|
|
+#else
|
|
+ blbs TMP1, $L17
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD3 c12, t2, b5
|
|
+ fmov b5, c12
|
|
+ MUL b1, a2, t2
|
|
+ ADD2 c16, t3, b5
|
|
+ fmov b5, c16
|
|
+ MUL b2, a2, t3
|
|
+
|
|
+ ADD4 c15, t4, b5
|
|
+ fmov b5, c15
|
|
+ MUL b2, a1, t4
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL b1, a3, t1
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL b1, a4, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD2 c06, t3, b5
|
|
+ fmov b5, c06
|
|
+ MUL b2, a4, t3
|
|
+ ADD4 c05, t4, b5
|
|
+ fmov b5, c05
|
|
+ MUL b4, a1, t4
|
|
+
|
|
+ ADD1 c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ unop
|
|
+ MUL b3, a1, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD3 c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ unop
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c08, t3, b5
|
|
+ fmov b5, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD4 c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL b2, a3, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD1 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ unop
|
|
+ MUL b3, a3, t1
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD3 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD2 c14, t3, b5
|
|
+ fmov b5, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+
|
|
+ ADD4 c07, t4, b5
|
|
+ fmov b5, c07
|
|
+ unop
|
|
+ MUL b4, a3, t4
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD1 c11, t1, b5
|
|
+ fmov b5, c11
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL b1, a1, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L17:
|
|
+ ADD3 c12, t2, b5
|
|
+ fmov b5, c12
|
|
+ MUL b1, a2, t2
|
|
+ ADD2 c16, t3, b5
|
|
+ fmov b5, c16
|
|
+ MUL b2, a2, t3
|
|
+
|
|
+ ADD4 c15, t4, b5
|
|
+ fmov b5, c15
|
|
+ MUL b2, a1, t4
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL b1, a3, t1
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL b1, a4, t2
|
|
+ ADD2 c06, t3, b5
|
|
+ fmov b5, c06
|
|
+ MUL b2, a4, t3
|
|
+
|
|
+ ADD4 c05, t4, b5
|
|
+ fmov b5, c05
|
|
+ MUL b4, a1, t4
|
|
+ ADD1 c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ MUL b3, a1, t1
|
|
+
|
|
+ ADD3 c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ MUL b3, a2, t2
|
|
+ ADD2 c08, t3, b5
|
|
+ fmov b5, c08
|
|
+ MUL b4, a2, t3
|
|
+
|
|
+ ADD4 c13, t4, b5
|
|
+ fmov b5, c13
|
|
+ MUL b2, a3, t4
|
|
+ ADD1 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ MUL b3, a3, t1
|
|
+
|
|
+ ADD3 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ MUL b3, a4, t2
|
|
+ ADD2 c14, t3, b5
|
|
+ fmov b5, c14
|
|
+ MUL b4, a4, t3
|
|
+
|
|
+ ADD4 c07, t4, b5
|
|
+ fmov b5, c07
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL b4, a3, t4
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD1 c11, t1, b5
|
|
+ fmov b5, c11
|
|
+ ADD3 c12, t2, b5
|
|
+ fmov b5, c12
|
|
+ ADD2 c16, t3, b5
|
|
+ fmov b5, c16
|
|
+ ADD4 c15, t4, b5
|
|
+ fmov b5, c15
|
|
+
|
|
+ ADD c01, c06, b5
|
|
+ fmov b5, c01
|
|
+ ADD c02, c05, b5
|
|
+ fmov b5, c02
|
|
+ ADD c03, c08, b5
|
|
+ fmov b5, c03
|
|
+ ADD c04, c07, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ ADD c09, c14, b5
|
|
+ fmov b5, c09
|
|
+ ADD c10, c13, b5
|
|
+ fmov b5, c10
|
|
+ ADD c11, c16, b5
|
|
+ fmov b5, c11
|
|
+ ADD c12, c15, b5
|
|
+ fmov b5, c12
|
|
+ .align 4
|
|
+
|
|
+$L18:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 2, TMP1
|
|
+#else
|
|
+ subl KK, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -4 * SIZE(AO)
|
|
+ ldi BO, -4 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+ LD b4, 7 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a3, c09, b5
|
|
+ fmov b5, c09
|
|
+ SUB a4, c10, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ SUB b1, c03, b5
|
|
+ fmov b5, c03
|
|
+ SUB b2, c04, b5
|
|
+ fmov b5, c04
|
|
+ SUB b3, c11, b5
|
|
+ fmov b5, c11
|
|
+ SUB b4, c12, b5
|
|
+ fmov b5, c12
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ LD b1, 4 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(AO)
|
|
+ LD b4, 7 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a3, c03, b5
|
|
+ fmov b5, c03
|
|
+ SUB a4, c04, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ SUB b1, c09, b5
|
|
+ fmov b5, c09
|
|
+ SUB b2, c10, b5
|
|
+ fmov b5, c10
|
|
+ SUB b3, c11, b5
|
|
+ fmov b5, c11
|
|
+ SUB b4, c12, b5
|
|
+ fmov b5, c12
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 6 * SIZE(AO)
|
|
+ LD a2, 7 * SIZE(AO)
|
|
+ LD a3, 4 * SIZE(AO)
|
|
+ LD a4, 5 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c04, t1
|
|
+ MUL a2, c03, t2
|
|
+ MUL a2, c12, t3
|
|
+ MUL a2, c11, t4
|
|
+
|
|
+ MUL a1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a1, c04, b5
|
|
+ fmov b5, c04
|
|
+ MUL a1, c11, b5
|
|
+ fmov b5, c11
|
|
+ MUL a1, c12, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ ADD5 c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ ADD6 c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ ADD5 c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ ADD6 c12, t4, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ MUL a3, c03, t1
|
|
+ MUL a3, c04, t2
|
|
+ MUL a3, c11, t3
|
|
+ MUL a3, c12, t4
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ SUB c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ SUB c10, t4, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ MUL a4, c04, t1
|
|
+ MUL a4, c03, t2
|
|
+ MUL a4, c12, t3
|
|
+ MUL a4, c11, t4
|
|
+
|
|
+ ADD6 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD5 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD6 c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ ADD5 c10, t4, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a2, c10, t3
|
|
+ MUL a2, c09, t4
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, c10, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ ADD5 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD6 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD5 c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ ADD6 c10, t4, b5
|
|
+ fmov b5, c10
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a2, c10, t3
|
|
+ MUL a2, c09, t4
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, c10, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ ADD5 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD6 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD5 c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ ADD6 c10, t4, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ MUL a3, c01, t1
|
|
+ MUL a3, c02, t2
|
|
+ MUL a3, c09, t3
|
|
+ MUL a3, c10, t4
|
|
+
|
|
+ SUB c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ SUB c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ SUB c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ SUB c12, t4, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ MUL a4, c02, t1
|
|
+ MUL a4, c01, t2
|
|
+ MUL a4, c10, t3
|
|
+ MUL a4, c09, t4
|
|
+
|
|
+ ADD6 c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ ADD5 c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ ADD6 c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ ADD5 c12, t4, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ LD a1, 6 * SIZE(AO)
|
|
+ LD a2, 7 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c04, t1
|
|
+ MUL a2, c03, t2
|
|
+ MUL a2, c12, t3
|
|
+ MUL a2, c11, t4
|
|
+
|
|
+ MUL a1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a1, c04, b5
|
|
+ fmov b5, c04
|
|
+ MUL a1, c11, b5
|
|
+ fmov b5, c11
|
|
+ MUL a1, c12, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ ADD5 c03, t1, b5
|
|
+ fmov b5, c03
|
|
+ ADD6 c04, t2, b5
|
|
+ fmov b5, c04
|
|
+ ADD5 c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ ADD6 c12, t4, b5
|
|
+ fmov b5, c12
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a2, c04, t3
|
|
+ MUL a2, c03, t4
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a1, c04, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ ADD5 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD6 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD5 c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ ADD6 c04, t4, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ MUL a3, c01, t1
|
|
+ MUL a3, c02, t2
|
|
+ MUL a3, c03, t3
|
|
+ MUL a3, c04, t4
|
|
+
|
|
+ SUB c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ SUB c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ SUB c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ SUB c12, t4, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ MUL a4, c02, t1
|
|
+ MUL a4, c01, t2
|
|
+ MUL a4, c04, t3
|
|
+ MUL a4, c03, t4
|
|
+
|
|
+ ADD6 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ ADD5 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ ADD6 c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ ADD5 c12, t4, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ LD a1, 6 * SIZE(BO)
|
|
+ LD a2, 7 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c10, t1
|
|
+ MUL a2, c09, t2
|
|
+ MUL a2, c12, t3
|
|
+ MUL a2, c11, t4
|
|
+
|
|
+ MUL a1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, c10, b5
|
|
+ fmov b5, c10
|
|
+ MUL a1, c11, b5
|
|
+ fmov b5, c11
|
|
+ MUL a1, c12, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ ADD5 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ ADD6 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ ADD5 c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ ADD6 c12, t4, b5
|
|
+ fmov b5, c12
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 6 * SIZE(BO)
|
|
+ LD a2, 7 * SIZE(BO)
|
|
+ LD a3, 4 * SIZE(BO)
|
|
+ LD a4, 5 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c10, t1
|
|
+ MUL a2, c09, t2
|
|
+ MUL a2, c12, t3
|
|
+ MUL a2, c11, t4
|
|
+
|
|
+ MUL a1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, c10, b5
|
|
+ fmov b5, c10
|
|
+ MUL a1, c11, b5
|
|
+ fmov b5, c11
|
|
+ MUL a1, c12, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ ADD5 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ ADD6 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ ADD5 c11, t3, b5
|
|
+ fmov b5, c11
|
|
+ ADD6 c12, t4, b5
|
|
+ fmov b5, c12
|
|
+
|
|
+ MUL a3, c09, t1
|
|
+ MUL a3, c10, t2
|
|
+ MUL a3, c11, t3
|
|
+ MUL a3, c12, t4
|
|
+
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ SUB c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ SUB c04, t4, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ MUL a4, c10, t1
|
|
+ MUL a4, c09, t2
|
|
+ MUL a4, c12, t3
|
|
+ MUL a4, c11, t4
|
|
+
|
|
+ ADD6 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD5 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD6 c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ ADD5 c04, t4, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a2, c04, t3
|
|
+ MUL a2, c03, t4
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a1, c03, b5
|
|
+ fmov b5, c03
|
|
+ MUL a1, c04, b5
|
|
+ fmov b5, c04
|
|
+
|
|
+ ADD5 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD6 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD5 c03, t3, b5
|
|
+ fmov b5, c03
|
|
+ ADD6 c04, t4, b5
|
|
+ fmov b5, c04
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c02, 1 * SIZE(BO)
|
|
+ ST c09, 2 * SIZE(BO)
|
|
+ ST c10, 3 * SIZE(BO)
|
|
+
|
|
+ ST c03, 4 * SIZE(BO)
|
|
+ ST c04, 5 * SIZE(BO)
|
|
+ ST c11, 6 * SIZE(BO)
|
|
+ ST c12, 7 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c03, 2 * SIZE(AO)
|
|
+ ST c04, 3 * SIZE(AO)
|
|
+
|
|
+ ST c09, 4 * SIZE(AO)
|
|
+ ST c10, 5 * SIZE(AO)
|
|
+ ST c11, 6 * SIZE(AO)
|
|
+ ST c12, 7 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -4 * SIZE(C1)
|
|
+ ldi C2, -4 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c03, 2 * SIZE(C1)
|
|
+ ST c04, 3 * SIZE(C1)
|
|
+
|
|
+ ST c09, 0 * SIZE(C2)
|
|
+ ST c10, 1 * SIZE(C2)
|
|
+ ST c11, 2 * SIZE(C2)
|
|
+ ST c12, 3 * SIZE(C2)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+ ldi C2, 4 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, ZBASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ addl BO, TMP1, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+ fclr c01
|
|
+ fclr c05
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L11
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ and M, 1, I
|
|
+ ble I, $L29
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c10
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c14
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+
|
|
+ ldi L, -2(KK)
|
|
+
|
|
+ ble KK, $L28
|
|
+ ble L, $L25
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, ZBASE_SHIFT + 0, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, ZBASE_SHIFT + 0, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, ZBASE_SHIFT + 1, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c10
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c14
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+
|
|
+ ble TMP1, $L28
|
|
+ ble L, $L25
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L22:
|
|
+ ADD1 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD4 c13, t3, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL a1, b2, t3
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+
|
|
+ ADD2 c14, t4, b5
|
|
+ fmov b5, c14
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD b2, -7 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ LD b3, -6 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL a1, b4, t3
|
|
+ LD a1, 2 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ MUL a2, b4, t4
|
|
+ LD b5, -5 * SIZE(BO)
|
|
+ FIMOVD b5, tmp
|
|
+
|
|
+ ADD1 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ unop
|
|
+ MUL a3, b1, t1
|
|
+ LD a2, 3 * SIZE(AO)
|
|
+
|
|
+ ADD3 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL a4, b1, t2
|
|
+ LD b1, -4 * SIZE(BO)
|
|
+
|
|
+ ADD4 c13, t3, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD2 c14, t4, b5
|
|
+ fmov b5, c14
|
|
+ MUL a4, b2, t4
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ldi L, -2(L)
|
|
+ MUL a3, b3, t1
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a4, b3, t2
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ IFMOVD tmp, b5
|
|
+ MUL a3, b5, t3
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ IFMOVD tmp, b5
|
|
+ MUL a4, b5, t4
|
|
+ LD a4, 1 * SIZE(AO)
|
|
+ bgt L, $L22
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ ADD1 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, b1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L27
|
|
+#else
|
|
+ blbs TMP1, $L27
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD3 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD4 c13, t3, b5
|
|
+ fmov b5, c13
|
|
+ unop
|
|
+ MUL a1, b2, t3
|
|
+ unop
|
|
+
|
|
+ ADD2 c14, t4, b5
|
|
+ fmov b5, c14
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ unop
|
|
+ MUL a1, b4, t3
|
|
+ LD a1, -2 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ unop
|
|
+ MUL a2, b4, t4
|
|
+ LD a2, -1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L27:
|
|
+ ADD3 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ MUL a2, b1, t2
|
|
+ ADD4 c13, t3, b5
|
|
+ fmov b5, c13
|
|
+ MUL a1, b2, t3
|
|
+
|
|
+ ADD2 c14, t4, b5
|
|
+ fmov b5, c14
|
|
+ MUL a2, b2, t4
|
|
+ ADD1 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, b3, t1
|
|
+
|
|
+ ADD3 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ MUL a2, b3, t2
|
|
+ ADD4 c05, t3, b5
|
|
+ fmov b5, c05
|
|
+ MUL a1, b4, t3
|
|
+
|
|
+ ADD2 c06, t4, b5
|
|
+ fmov b5, c06
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a2, b4, t4
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD1 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ ADD3 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+ ADD4 c13, t3, b5
|
|
+ fmov b5, c13
|
|
+ ADD2 c14, t4, b5
|
|
+ fmov b5, c14
|
|
+
|
|
+ ADD c01, c06, b5
|
|
+ fmov b5, c01
|
|
+ ADD c02, c05, b5
|
|
+ fmov b5, c02
|
|
+ ADD c09, c14, b5
|
|
+ fmov b5, c09
|
|
+ ADD c10, c13, b5
|
|
+ fmov b5, c10
|
|
+ .align 4
|
|
+
|
|
+$L28:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 1, TMP1
|
|
+#else
|
|
+ subl KK, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, ZBASE_SHIFT + 0, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -2 * SIZE(AO)
|
|
+ ldi BO, -4 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a3, c09, b5
|
|
+ fmov b5, c09
|
|
+ SUB a4, c10, b5
|
|
+ fmov b5, c10
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ SUB a2, c02, b5
|
|
+ fmov b5, c02
|
|
+ SUB a3, c09, b5
|
|
+ fmov b5, c09
|
|
+ SUB a4, c10, b5
|
|
+ fmov b5, c10
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a2, c10, t3
|
|
+ MUL a2, c09, t4
|
|
+
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+ MUL a1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, c10, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ ADD5 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD6 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+ ADD5 c09, t3, b5
|
|
+ fmov b5, c09
|
|
+ ADD6 c10, t4, b5
|
|
+ fmov b5, c10
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ ADD5 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD6 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ MUL a3, c01, t1
|
|
+ MUL a3, c02, t2
|
|
+ SUB c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ SUB c10, t2, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ MUL a4, c02, t1
|
|
+ MUL a4, c01, t2
|
|
+ ADD6 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ ADD5 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ LD a1, 6 * SIZE(BO)
|
|
+ LD a2, 7 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c10, t1
|
|
+ MUL a2, c09, t2
|
|
+ MUL a1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, c10, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ ADD5 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ ADD6 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 6 * SIZE(BO)
|
|
+ LD a2, 7 * SIZE(BO)
|
|
+ LD a3, 4 * SIZE(BO)
|
|
+ LD a4, 5 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c10, t1
|
|
+ MUL a2, c09, t2
|
|
+ MUL a1, c09, b5
|
|
+ fmov b5, c09
|
|
+ MUL a1, c10, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ ADD5 c09, t1, b5
|
|
+ fmov b5, c09
|
|
+ ADD6 c10, t2, b5
|
|
+ fmov b5, c10
|
|
+
|
|
+ MUL a3, c09, t1
|
|
+ MUL a3, c10, t2
|
|
+ SUB c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ SUB c02, t2, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ MUL a4, c10, t1
|
|
+ MUL a4, c09, t2
|
|
+ ADD6 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD5 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a1, c01, b5
|
|
+ fmov b5, c01
|
|
+ MUL a1, c02, b5
|
|
+ fmov b5, c02
|
|
+
|
|
+ ADD5 c01, t1, b5
|
|
+ fmov b5, c01
|
|
+ ADD6 c02, t2, b5
|
|
+ fmov b5, c02
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c02, 1 * SIZE(BO)
|
|
+ ST c09, 2 * SIZE(BO)
|
|
+ ST c10, 3 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c09, 2 * SIZE(AO)
|
|
+ ST c10, 3 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -2 * SIZE(C1)
|
|
+ ldi C2, -2 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c09, 0 * SIZE(C2)
|
|
+ ST c10, 1 * SIZE(C2)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 2 * SIZE(C1)
|
|
+ ldi C2, 2 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, ZBASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, ZBASE_SHIFT + 0, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L29:
|
|
+#ifdef LN
|
|
+ sll K, ZBASE_SHIFT + 1, TMP1
|
|
+ addl B, TMP1, B
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ mov BO, B
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+ ldi J, -1(J)
|
|
+ bgt J, $L01
|
|
+ .align 4
|
|
+
|
|
+$L999:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+ ldl tmp, 72($sp)
+ clr $0
+ ldi $sp, STACKSIZE($sp)
+ ret
+ .ident VERSION
+ .end CNAME
diff --git a/kernel/sw_64/ztrsm_kernel_2x2_RT.S.bak b/kernel/sw_64/ztrsm_kernel_2x2_RT.S.bak
new file mode 100644
index 0000000..4d4f59d
--- /dev/null
+++ b/kernel/sw_64/ztrsm_kernel_2x2_RT.S.bak
@@ -0,0 +1,2223 @@
+/*********************************************************************/
|
|
+/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
+/* All rights reserved. */
|
|
+/* */
|
|
+/* Redistribution and use in source and binary forms, with or */
|
|
+/* without modification, are permitted provided that the following */
|
|
+/* conditions are met: */
|
|
+/* */
|
|
+/* 1. Redistributions of source code must retain the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer. */
|
|
+/* */
|
|
+/* 2. Redistributions in binary form must reproduce the above */
|
|
+/* copyright notice, this list of conditions and the following */
|
|
+/* disclaimer in the documentation and/or other materials */
|
|
+/* provided with the distribution. */
|
|
+/* */
|
|
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
+/* POSSIBILITY OF SUCH DAMAGE. */
|
|
+/* */
|
|
+/* The views and conclusions contained in the software and */
|
|
+/* documentation are those of the authors and should not be */
|
|
+/* interpreted as representing official policies, either expressed */
|
|
+/* or implied, of The University of Texas at Austin. */
|
|
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+
+#if !defined(SW2B)
+#error "Architecture is not specified."
+#endif
+
+#ifdef SW2B
+#define PREFETCHSIZE 56
+#define UNOP unop
+#endif
+
+
+ .set noat
+ .set noreorder
+ .arch ev6
+
+.text
+ .align 5
+ .globl CNAME
+ .ent CNAME
+
+#define STACKSIZE 80
+
+#define M $16
+#define N $17
+#define K $18
+#define A $21
+#define B $22
+#define C $20
+#define LDC $23
+
+#define C1 $19
+#define C2 $24
+
+#define AO $at
+#define BO $5
+#define I $6
+#define J $7
+#define L $8
+
+#define a1 $f16
+#define a2 $f17
+#define a3 $f18
+#define a4 $f19
+
+#define b1 $f20
+#define b2 $f21
+#define b3 $f22
+#define b4 $f23
+
+#define t1 $f24
+#define t2 $f25
+#define t3 $f26
+#define t4 $f27
+
+#define a5 $f28
+#define a6 $f30
+#define b5 $f29
+
+#define alpha_i $f29
+#define alpha_r $f30
+
+#define c01 $f0
+#define c02 $f1
+#define c03 $f2
+#define c04 $f3
+
+#define c05 $f4
+#define c06 $f5
+#define c07 $f6
+#define c08 $f7
+
+#define c09 $f8
+#define c10 $f9
+#define c11 $f10
+#define c12 $f11
+
+#define c13 $f12
+#define c14 $f13
+#define c15 $f14
+#define c16 $f15
+
+#define TMP1 $0
+#define TMP2 $1
+#define KK $2
+#define AORIG $3
+#define OFFSET $4
+
+#if defined(LN) || defined(LT)
|
|
+#ifndef CONJ
|
|
+#define ADD1 ADD
|
|
+#define ADD2 SUB
|
|
+#define ADD3 ADD
|
|
+#define ADD4 ADD
|
|
+#define ADD5 SUB
|
|
+#define ADD6 ADD
|
|
+#else
|
|
+#define ADD1 ADD
|
|
+#define ADD2 ADD
|
|
+#define ADD3 SUB
|
|
+#define ADD4 ADD
|
|
+#define ADD5 ADD
|
|
+#define ADD6 SUB
|
|
+#endif
|
|
+#else
|
|
+#ifndef CONJ
|
|
+#define ADD1 ADD
|
|
+#define ADD2 SUB
|
|
+#define ADD3 ADD
|
|
+#define ADD4 ADD
|
|
+#define ADD5 SUB
|
|
+#define ADD6 ADD
|
|
+#else
|
|
+#define ADD1 ADD
|
|
+#define ADD2 ADD
|
|
+#define ADD3 ADD
|
|
+#define ADD4 SUB
|
|
+#define ADD5 ADD
|
|
+#define ADD6 SUB
|
|
+#endif
|
|
+#endif
|
|
+
|
|
+
|
|
+CNAME:
|
|
+ .frame $sp, STACKSIZE, $26, 0
|
|
+
|
|
+#ifdef PROFILE
|
|
+ ldgp $gp, 0($27)
|
|
+ ldi $at, _mcount
|
|
+ jsr $at, ($at), _mcount
|
|
+#endif
|
|
+
|
|
+#ifndef PROFILE
|
|
+ .prologue 0
|
|
+#else
|
|
+ .prologue 1
|
|
+#endif
|
|
+
|
|
+ ldi $sp, -STACKSIZE($sp)
|
|
+
|
|
+ ldl B, 0 + STACKSIZE($sp)
|
|
+ ldl C, 8 + STACKSIZE($sp)
|
|
+ ldl LDC, 16 + STACKSIZE($sp)
|
|
+ ldl OFFSET, 24 + STACKSIZE($sp)
|
|
+
|
|
+ sll LDC, ZBASE_SHIFT, LDC
|
|
+
|
|
+ fstd $f2, 0($sp)
|
|
+ fstd $f3, 8($sp)
|
|
+ fstd $f4, 16($sp)
|
|
+ fstd $f5, 24($sp)
|
|
+ fstd $f6, 32($sp)
|
|
+ fstd $f7, 40($sp)
|
|
+ fstd $f8, 48($sp)
|
|
+ fstd $f9, 56($sp)
|
|
+
|
|
+ cmple M, 0, $0
|
|
+ cmple N, 0, $1
|
|
+ cmple K, 0, $2
|
|
+
|
|
+ or $0, $1, $0
|
|
+ or $0, $2, $0
|
|
+ bne $0, $L999
|
|
+
|
|
+#ifdef LN
|
|
+ addl M, M, TMP2
|
|
+ mull TMP2, K, TMP1
|
|
+ SXADDQ TMP1, A, A
|
|
+ SXADDQ TMP2, C, C
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ negl OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ mull N, K, TMP1
|
|
+ addl TMP1, TMP1, TMP1
|
|
+ SXADDQ TMP1, B, B
|
|
+
|
|
+ mull N, LDC, TMP1
|
|
+ addl TMP1, C, C
|
|
+
|
|
+ subl N, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+ and N, 1, J
|
|
+ ble J, $L30
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, ZBASE_SHIFT, TMP1
|
|
+ subl B, TMP1, B
|
|
+
|
|
+ subl C, LDC, C1
|
|
+ subl C, LDC, C
|
|
+#else
|
|
+ mov C, C1
|
|
+ addl C, LDC, C
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ addl M, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+ mov A, AORIG
|
|
+#else
|
|
+ mov A, AO
|
|
+#endif
|
|
+
|
|
+ sra M, 1, I
|
|
+ ble I, $L50
|
|
+ .align 4
|
|
+
|
|
+$L41:
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c05
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c06
|
|
+
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c07
|
|
+
|
|
+ ldi L, -2(KK)
|
|
+ fclr c04
|
|
+ fclr c08
|
|
+
|
|
+ ble KK, $L48
|
|
+ ble L, $L45
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, ZBASE_SHIFT + 1, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, ZBASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, ZBASE_SHIFT, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c05
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c06
|
|
+
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c07
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+ fclr c04
|
|
+ fclr c08
|
|
+
|
|
+ ble TMP1, $L48
|
|
+ ble L, $L45
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L42:
|
|
+ ADD4 c05, t1, c05
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD2 c06, t2, c06
|
|
+ ldi L, -2(L)
|
|
+ MUL a2, b1, t2
|
|
+ unop
|
|
+
|
|
+ ADD4 c07, t3, c07
|
|
+ unop
|
|
+ MUL a3, b1, t3
|
|
+ unop
|
|
+
|
|
+ ADD2 c08, t4, c08
|
|
+ unop
|
|
+ MUL a4, b1, t4
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b2, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ MUL a2, b2, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c03, t3, c03
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c04, t4, c04
|
|
+ unop
|
|
+ MUL a4, b2, t4
|
|
+ LD a5, 3 * SIZE(AO)
|
|
+
|
|
+ ADD4 c05, t1, c05
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ LD b2, -1 * SIZE(BO)
|
|
+
|
|
+ ADD2 c06, t2, c06
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ unop
|
|
+
|
|
+ ADD4 c07, t3, c07
|
|
+ unop
|
|
+ MUL a3, b3, t3
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+
|
|
+ ADD2 c08, t4, c08
|
|
+ unop
|
|
+ MUL a5, b3, t4
|
|
+ LD b3, 0 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b4, t1
|
|
+ LD a1, -4 * SIZE(AO)
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b4, t2
|
|
+ LD a2, -3 * SIZE(AO)
|
|
+
|
|
+ ADD1 c03, t3, c03
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+ MUL a3, b4, t3
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c04, t4, c04
|
|
+ MUL a5, b4, t4
|
|
+ LD b4, 1 * SIZE(BO)
|
|
+ bgt L, $L42
|
|
+ .align 4
|
|
+
|
|
+$L45:
|
|
+ ADD4 c05, t1, c05
|
|
+ MUL b1, a1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L47
|
|
+#else
|
|
+ blbs TMP1, $L47
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD2 c06, t2, c06
|
|
+ MUL a2, b1, t2
|
|
+ ADD4 c07, t3, c07
|
|
+ MUL a3, b1, t3
|
|
+
|
|
+ ADD2 c08, t4, c08
|
|
+ unop
|
|
+ MUL a4, b1, t4
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b2, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b2, t2
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c03, t3, c03
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c04, t4, c04
|
|
+ MUL a4, b2, t4
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD4 c05, t1, c05
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L47:
|
|
+ ADD2 c06, t2, c06
|
|
+ MUL a2, b1, t2
|
|
+ ADD4 c07, t3, c07
|
|
+ MUL a3, b1, t3
|
|
+
|
|
+ ADD2 c08, t4, c08
|
|
+ MUL a4, b1, t4
|
|
+ ADD1 c01, t1, c01
|
|
+ MUL a1, b2, t1
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ MUL a2, b2, t2
|
|
+ ADD1 c03, t3, c03
|
|
+ MUL a3, b2, t3
|
|
+
|
|
+ ADD3 c04, t4, c04
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL a4, b2, t4
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t1, c05
|
|
+ ADD2 c06, t2, c06
|
|
+ ADD4 c07, t3, c07
|
|
+ ADD2 c08, t4, c08
|
|
+
|
|
+ ADD c01, c06, c01
|
|
+ ADD c02, c05, c02
|
|
+ ADD c03, c08, c03
|
|
+ ADD c04, c07, c04
|
|
+
|
|
+$L48:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 2, TMP1
|
|
+#else
|
|
+ subl KK, 1, TMP1
|
|
+#endif
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -4 * SIZE(AO)
|
|
+ ldi BO, -2 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+ SUB a3, c03, c03
|
|
+ SUB a4, c04, c04
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+ SUB a3, c03, c03
|
|
+ SUB a4, c04, c04
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 6 * SIZE(AO)
|
|
+ LD a2, 7 * SIZE(AO)
|
|
+ LD a3, 4 * SIZE(AO)
|
|
+ LD a4, 5 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c04, t1
|
|
+ MUL a2, c03, t2
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c04, c04
|
|
+
|
|
+ ADD5 c03, t1, c03
|
|
+ ADD6 c04, t2, c04
|
|
+ MUL a3, c03, t1
|
|
+ MUL a3, c04, t2
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c02, t2, c02
|
|
+ MUL a4, c04, t1
|
|
+ MUL a4, c03, t2
|
|
+
|
|
+ ADD6 c01, t1, c01
|
|
+ ADD5 c02, t2, c02
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+
|
|
+ ADD5 c01, t1, c01
|
|
+ ADD6 c02, t2, c02
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+
|
|
+ ADD5 c01, t1, c01
|
|
+ ADD6 c02, t2, c02
|
|
+ MUL a3, c01, t1
|
|
+ MUL a3, c02, t2
|
|
+
|
|
+ SUB c03, t1, c03
|
|
+ SUB c04, t2, c04
|
|
+
|
|
+ MUL a4, c02, t1
|
|
+ MUL a4, c01, t2
|
|
+ ADD6 c03, t1, c03
|
|
+ ADD5 c04, t2, c04
|
|
+
|
|
+ LD a1, 6 * SIZE(AO)
|
|
+ LD a2, 7 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c04, t1
|
|
+ MUL a2, c03, t2
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c04, c04
|
|
+
|
|
+ ADD5 c03, t1, c03
|
|
+ ADD6 c04, t2, c04
|
|
+#endif
|
|
+
|
|
+#if defined(RN) || defined(RT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a2, c04, t3
|
|
+ MUL a2, c03, t4
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c04, c04
|
|
+
|
|
+ ADD5 c01, t1, c01
|
|
+ ADD6 c02, t2, c02
|
|
+ ADD5 c03, t3, c03
|
|
+ ADD6 c04, t4, c04
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c02, 1 * SIZE(BO)
|
|
+ ST c03, 2 * SIZE(BO)
|
|
+ ST c04, 3 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c03, 2 * SIZE(AO)
|
|
+ ST c04, 3 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -4 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c03, 2 * SIZE(C1)
|
|
+ ST c04, 3 * SIZE(C1)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, ZBASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L41
|
|
+ .align 4
|
|
+
|
|
+$L50:
|
|
+ and M, 1, I
|
|
+ ble I, $L59
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c05
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c06
|
|
+
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ ldi BO, 2 * SIZE(B)
|
|
+
|
|
+ ldi L, -2(KK)
|
|
+
|
|
+ ble KK, $L58
|
|
+ ble L, $L55
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, ZBASE_SHIFT, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, ZBASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, ZBASE_SHIFT, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr t1
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr t2
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr t4
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c01
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c05
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c02
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c06
|
|
+
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+
|
|
+ ble TMP1, $L58
|
|
+ ble L, $L55
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L52:
|
|
+ ADD1 c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 2 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, c05
|
|
+ ldi L, -2(L)
|
|
+ MUL a1, b2, t3
|
|
+ LD a1, -2 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, c06
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD a2, -1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ LD b2, 3 * SIZE(BO)
|
|
+ MUL a3, b3, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL a4, b3, t2
|
|
+ LD b3, 0 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, c05
|
|
+ unop
|
|
+ MUL a3, b4, t3
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, c06
|
|
+ MUL a4, b4, t4
|
|
+ LD b4, 1 * SIZE(BO)
|
|
+ unop
|
|
+
|
|
+ LD a4, 1 * SIZE(AO)
|
|
+ unop
|
|
+ unop
|
|
+ bgt L, $L52
|
|
+ .align 4
|
|
+
|
|
+$L55:
|
|
+ ADD1 c01, t1, c01
|
|
+ MUL a1, b1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L57
|
|
+#else
|
|
+ blbs TMP1, $L57
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, c05
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+ MUL a1, b2, t3
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, c06
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ LD b2, -1 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ .align 4
|
|
+
|
|
+$L57:
|
|
+ ADD3 c02, t2, c02
|
|
+ MUL a2, b1, t2
|
|
+ ADD4 c05, t3, c05
|
|
+ MUL a1, b2, t3
|
|
+
|
|
+ ADD2 c06, t4, c06
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a2, b2, t4
|
|
+ ldi BO, 2 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ ADD3 c02, t2, c02
|
|
+ ADD4 c05, t3, c05
|
|
+ ADD2 c06, t4, c06
|
|
+
|
|
+ ADD c01, c06, c01
|
|
+ ADD c02, c05, c02
|
|
+
|
|
+$L58:
|
|
+#if defined(LN) || defined(RT)
|
|
+ subl KK, 1, TMP1
|
|
+
|
|
+ sll TMP1, ZBASE_SHIFT, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -2 * SIZE(AO)
|
|
+ ldi BO, -2 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+
|
|
+ ADD5 c01, t1, c01
|
|
+ ADD6 c02, t2, c02
|
|
+#endif
|
|
+
|
|
+#if defined(RN) || defined(RT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+
|
|
+ ADD5 c01, t1, c01
|
|
+ ADD6 c02, t2, c02
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c02, 1 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -2 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 2 * SIZE(C1)
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, ZBASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, ZBASE_SHIFT, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L59:
|
|
+#ifdef LN
|
|
+ sll K, ZBASE_SHIFT, TMP1
|
|
+ addl B, TMP1, B
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ mov BO, B
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L30:
|
|
+ sra N, 1, J
|
|
+ ble J, $L999
|
|
+ .align 4
|
|
+
|
|
+$L01:
|
|
+#ifdef RT
|
|
+ sll K, ZBASE_SHIFT + 1, TMP1
|
|
+ subl B, TMP1, B
|
|
+
|
|
+ subl C, LDC, C2
|
|
+ subl C2, LDC, C1
|
|
+ subl C2, LDC, C
|
|
+#else
|
|
+ mov C, C1
|
|
+ addl C, LDC, C2
|
|
+ addl C2, LDC, C
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ addl M, OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ mov OFFSET, KK
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(RT)
|
|
+ mov A, AORIG
|
|
+#else
|
|
+ mov A, AO
|
|
+#endif
|
|
+
|
|
+ sra M, 1, I
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+ fclr c01
|
|
+ fclr c05
|
|
+
|
|
+ ble I, $L20
|
|
+ .align 4
|
|
+
|
|
+$L11:
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c10
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c14
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ fclr c07
|
|
+
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+ fclr c11
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c15
|
|
+
|
|
+ fillcs 4 * SIZE(C1)
|
|
+ fclr c04
|
|
+ ldi L, -2(KK)
|
|
+ fclr c08
|
|
+
|
|
+ fillcs 4 * SIZE(C2)
|
|
+ fclr c12
|
|
+ fclr c16
|
|
+ ble KK, $L18
|
|
+ ble L, $L15
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, ZBASE_SHIFT + 1, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, ZBASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c10
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c14
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ fclr c03
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ fclr c07
|
|
+
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ fclr c11
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ fclr c15
|
|
+
|
|
+ fillcs 4 * SIZE(C1)
|
|
+ fclr c04
|
|
+ ldi L, -2(TMP1)
|
|
+ fclr c08
|
|
+
|
|
+ fillcs 4 * SIZE(C2)
|
|
+ fclr c12
|
|
+ fclr c16
|
|
+ ble TMP1, $L18
|
|
+ ble L, $L15
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L12:
|
|
+/* 1 */
|
|
+ ADD1 c11, t1, c11
|
|
+#ifndef EV4
|
|
+ fillcs PREFETCHSIZE * SIZE(AO)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+ MUL b1, a1, t1
|
|
+#ifndef EV4
|
|
+ fillcs PREFETCHSIZE * SIZE(BO)
|
|
+#else
|
|
+ unop
|
|
+#endif
|
|
+
|
|
+ ADD3 c12, t2, c12
|
|
+ unop
|
|
+ MUL b1, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c16, t3, c16
|
|
+ unop
|
|
+ MUL b2, a2, t3
|
|
+ LD a5, 0 * SIZE(AO)
|
|
+
|
|
+ ADD4 c15, t4, c15
|
|
+ unop
|
|
+ MUL b2, a1, t4
|
|
+ LD b5, 0 * SIZE(BO)
|
|
+
|
|
+/* 2 */
|
|
+ ADD1 c01, t1, c01
|
|
+ UNOP
|
|
+ MUL b1, a3, t1
|
|
+ UNOP
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ UNOP
|
|
+ MUL b1, a4, t2
|
|
+ UNOP
|
|
+
|
|
+ ADD2 c06, t3, c06
|
|
+ unop
|
|
+ MUL b2, a4, t3
|
|
+ unop
|
|
+
|
|
+ ADD4 c05, t4, c05
|
|
+ unop
|
|
+ MUL b4, a1, t4
|
|
+ unop
|
|
+
|
|
+/* 3 */
|
|
+ ADD1 c03, t1, c03
|
|
+ unop
|
|
+ MUL b3, a1, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c04, t2, c04
|
|
+ unop
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c08, t3, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD4 c13, t4, c13
|
|
+ unop
|
|
+ MUL b2, a3, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+/* 4 */
|
|
+ ADD1 c09, t1, c09
|
|
+ unop
|
|
+ MUL b3, a3, t1
|
|
+ LD a6, 2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c10, t2, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD2 c14, t3, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ ADD4 c07, t4, c07
|
|
+ unop
|
|
+ MUL b4, a3, t4
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+
|
|
+/* 5 */
|
|
+ ADD1 c11, t1, c11
|
|
+ unop
|
|
+ MUL b5, a5, t1
|
|
+ LD a1, 4 * SIZE(AO)
|
|
+
|
|
+ ADD3 c12, t2, c12
|
|
+ ldi L, -2(L)
|
|
+ MUL b5, a2, t2
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+
|
|
+ ADD2 c16, t3, c16
|
|
+ unop
|
|
+ MUL b2, a2, t3
|
|
+ unop
|
|
+
|
|
+ ADD4 c15, t4, c15
|
|
+ unop
|
|
+ MUL b2, a5, t4
|
|
+ unop
|
|
+
|
|
+/* 6 */
|
|
+ ADD1 c01, t1, c01
|
|
+ unop
|
|
+ MUL b5, a6, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL b5, a4, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c06, t3, c06
|
|
+ unop
|
|
+ MUL b2, a4, t3
|
|
+ unop
|
|
+
|
|
+ ADD4 c05, t4, c05
|
|
+ unop
|
|
+ MUL b4, a5, t4
|
|
+ unop
|
|
+
|
|
+/* 7 */
|
|
+ ADD1 c03, t1, c03
|
|
+ ldi AO, 8 * SIZE(AO)
|
|
+ MUL b3, a5, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c04, t2, c04
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c08, t3, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, -3 * SIZE(AO)
|
|
+
|
|
+ ADD4 c13, t4, c13
|
|
+ unop
|
|
+ MUL b2, a6, t4
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+/* 8 */
|
|
+ ADD1 c09, t1, c09
|
|
+ unop
|
|
+ MUL b3, a6, t1
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c10, t2, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD2 c14, t3, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+
|
|
+ ADD4 c07, t4, c07
|
|
+ MUL b4, a6, t4
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+ bgt L, $L12
|
|
+ .align 4
|
|
+
|
|
+$L15:
|
|
+ ADD1 c11, t1, c11
|
|
+ unop
|
|
+ MUL b1, a1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L17
|
|
+#else
|
|
+ blbs TMP1, $L17
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD3 c12, t2, c12
|
|
+ MUL b1, a2, t2
|
|
+ ADD2 c16, t3, c16
|
|
+ MUL b2, a2, t3
|
|
+
|
|
+ ADD4 c15, t4, c15
|
|
+ MUL b2, a1, t4
|
|
+ ADD1 c01, t1, c01
|
|
+ MUL b1, a3, t1
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL b1, a4, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD2 c06, t3, c06
|
|
+ MUL b2, a4, t3
|
|
+ ADD4 c05, t4, c05
|
|
+ MUL b4, a1, t4
|
|
+
|
|
+ ADD1 c03, t1, c03
|
|
+ unop
|
|
+ MUL b3, a1, t1
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+
|
|
+ ADD3 c04, t2, c04
|
|
+ unop
|
|
+ MUL b3, a2, t2
|
|
+ unop
|
|
+
|
|
+ ADD2 c08, t3, c08
|
|
+ unop
|
|
+ MUL b4, a2, t3
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ ADD4 c13, t4, c13
|
|
+ unop
|
|
+ MUL b2, a3, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD1 c09, t1, c09
|
|
+ unop
|
|
+ MUL b3, a3, t1
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD3 c10, t2, c10
|
|
+ unop
|
|
+ MUL b3, a4, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD2 c14, t3, c14
|
|
+ unop
|
|
+ MUL b4, a4, t3
|
|
+ LD a4, -1 * SIZE(AO)
|
|
+
|
|
+ ADD4 c07, t4, c07
|
|
+ unop
|
|
+ MUL b4, a3, t4
|
|
+ LD a3, -2 * SIZE(AO)
|
|
+
|
|
+ ADD1 c11, t1, c11
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL b1, a1, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L17:
|
|
+ ADD3 c12, t2, c12
|
|
+ MUL b1, a2, t2
|
|
+ ADD2 c16, t3, c16
|
|
+ MUL b2, a2, t3
|
|
+
|
|
+ ADD4 c15, t4, c15
|
|
+ MUL b2, a1, t4
|
|
+ ADD1 c01, t1, c01
|
|
+ MUL b1, a3, t1
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ MUL b1, a4, t2
|
|
+ ADD2 c06, t3, c06
|
|
+ MUL b2, a4, t3
|
|
+
|
|
+ ADD4 c05, t4, c05
|
|
+ MUL b4, a1, t4
|
|
+ ADD1 c03, t1, c03
|
|
+ MUL b3, a1, t1
|
|
+
|
|
+ ADD3 c04, t2, c04
|
|
+ MUL b3, a2, t2
|
|
+ ADD2 c08, t3, c08
|
|
+ MUL b4, a2, t3
|
|
+
|
|
+ ADD4 c13, t4, c13
|
|
+ MUL b2, a3, t4
|
|
+ ADD1 c09, t1, c09
|
|
+ MUL b3, a3, t1
|
|
+
|
|
+ ADD3 c10, t2, c10
|
|
+ MUL b3, a4, t2
|
|
+ ADD2 c14, t3, c14
|
|
+ MUL b4, a4, t3
|
|
+
|
|
+ ADD4 c07, t4, c07
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+ MUL b4, a3, t4
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD1 c11, t1, c11
|
|
+ ADD3 c12, t2, c12
|
|
+ ADD2 c16, t3, c16
|
|
+ ADD4 c15, t4, c15
|
|
+
|
|
+ ADD c01, c06, c01
|
|
+ ADD c02, c05, c02
|
|
+ ADD c03, c08, c03
|
|
+ ADD c04, c07, c04
|
|
+
|
|
+ ADD c09, c14, c09
|
|
+ ADD c10, c13, c10
|
|
+ ADD c11, c16, c11
|
|
+ ADD c12, c15, c12
|
|
+ .align 4
|
|
+
|
|
+$L18:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 2, TMP1
|
|
+#else
|
|
+ subl KK, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -4 * SIZE(AO)
|
|
+ ldi BO, -4 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ LD b1, 4 * SIZE(BO)
|
|
+ LD b2, 5 * SIZE(BO)
|
|
+ LD b3, 6 * SIZE(BO)
|
|
+ LD b4, 7 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+ SUB a3, c09, c09
|
|
+ SUB a4, c10, c10
|
|
+
|
|
+ SUB b1, c03, c03
|
|
+ SUB b2, c04, c04
|
|
+ SUB b3, c11, c11
|
|
+ SUB b4, c12, c12
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ LD b1, 4 * SIZE(AO)
|
|
+ LD b2, 5 * SIZE(AO)
|
|
+ LD b3, 6 * SIZE(AO)
|
|
+ LD b4, 7 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+ SUB a3, c03, c03
|
|
+ SUB a4, c04, c04
|
|
+
|
|
+ SUB b1, c09, c09
|
|
+ SUB b2, c10, c10
|
|
+ SUB b3, c11, c11
|
|
+ SUB b4, c12, c12
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ LD a1, 6 * SIZE(AO)
|
|
+ LD a2, 7 * SIZE(AO)
|
|
+ LD a3, 4 * SIZE(AO)
|
|
+ LD a4, 5 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c04, t1
|
|
+ MUL a2, c03, t2
|
|
+ MUL a2, c12, t3
|
|
+ MUL a2, c11, t4
|
|
+
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c04, c04
|
|
+ MUL a1, c11, c11
|
|
+ MUL a1, c12, c12
|
|
+
|
|
+ ADD5 c03, t1, c03
|
|
+ ADD6 c04, t2, c04
|
|
+ ADD5 c11, t3, c11
|
|
+ ADD6 c12, t4, c12
|
|
+
|
|
+ MUL a3, c03, t1
|
|
+ MUL a3, c04, t2
|
|
+ MUL a3, c11, t3
|
|
+ MUL a3, c12, t4
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c02, t2, c02
|
|
+ SUB c09, t3, c09
|
|
+ SUB c10, t4, c10
|
|
+
|
|
+ MUL a4, c04, t1
|
|
+ MUL a4, c03, t2
|
|
+ MUL a4, c12, t3
|
|
+ MUL a4, c11, t4
|
|
+
|
|
+ ADD6 c01, t1, c01
|
|
+ ADD5 c02, t2, c02
|
|
+ ADD6 c09, t3, c09
|
|
+ ADD5 c10, t4, c10
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a2, c10, t3
|
|
+ MUL a2, c09, t4
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c10, c10
|
|
+
|
|
+ ADD5 c01, t1, c01
|
|
+ ADD6 c02, t2, c02
|
|
+ ADD5 c09, t3, c09
|
|
+ ADD6 c10, t4, c10
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a2, c10, t3
|
|
+ MUL a2, c09, t4
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c10, c10
|
|
+
|
|
+ ADD5 c01, t1, c01
|
|
+ ADD6 c02, t2, c02
|
|
+ ADD5 c09, t3, c09
|
|
+ ADD6 c10, t4, c10
|
|
+
|
|
+ MUL a3, c01, t1
|
|
+ MUL a3, c02, t2
|
|
+ MUL a3, c09, t3
|
|
+ MUL a3, c10, t4
|
|
+
|
|
+ SUB c03, t1, c03
|
|
+ SUB c04, t2, c04
|
|
+ SUB c11, t3, c11
|
|
+ SUB c12, t4, c12
|
|
+
|
|
+ MUL a4, c02, t1
|
|
+ MUL a4, c01, t2
|
|
+ MUL a4, c10, t3
|
|
+ MUL a4, c09, t4
|
|
+
|
|
+ ADD6 c03, t1, c03
|
|
+ ADD5 c04, t2, c04
|
|
+ ADD6 c11, t3, c11
|
|
+ ADD5 c12, t4, c12
|
|
+
|
|
+ LD a1, 6 * SIZE(AO)
|
|
+ LD a2, 7 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c04, t1
|
|
+ MUL a2, c03, t2
|
|
+ MUL a2, c12, t3
|
|
+ MUL a2, c11, t4
|
|
+
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c04, c04
|
|
+ MUL a1, c11, c11
|
|
+ MUL a1, c12, c12
|
|
+
|
|
+ ADD5 c03, t1, c03
|
|
+ ADD6 c04, t2, c04
|
|
+ ADD5 c11, t3, c11
|
|
+ ADD6 c12, t4, c12
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a2, c04, t3
|
|
+ MUL a2, c03, t4
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c04, c04
|
|
+
|
|
+ ADD5 c01, t1, c01
|
|
+ ADD6 c02, t2, c02
|
|
+ ADD5 c03, t3, c03
|
|
+ ADD6 c04, t4, c04
|
|
+
|
|
+ MUL a3, c01, t1
|
|
+ MUL a3, c02, t2
|
|
+ MUL a3, c03, t3
|
|
+ MUL a3, c04, t4
|
|
+
|
|
+ SUB c09, t1, c09
|
|
+ SUB c10, t2, c10
|
|
+ SUB c11, t3, c11
|
|
+ SUB c12, t4, c12
|
|
+
|
|
+ MUL a4, c02, t1
|
|
+ MUL a4, c01, t2
|
|
+ MUL a4, c04, t3
|
|
+ MUL a4, c03, t4
|
|
+
|
|
+ ADD6 c09, t1, c09
|
|
+ ADD5 c10, t2, c10
|
|
+ ADD6 c11, t3, c11
|
|
+ ADD5 c12, t4, c12
|
|
+
|
|
+ LD a1, 6 * SIZE(BO)
|
|
+ LD a2, 7 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c10, t1
|
|
+ MUL a2, c09, t2
|
|
+ MUL a2, c12, t3
|
|
+ MUL a2, c11, t4
|
|
+
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c10, c10
|
|
+ MUL a1, c11, c11
|
|
+ MUL a1, c12, c12
|
|
+
|
|
+ ADD5 c09, t1, c09
|
|
+ ADD6 c10, t2, c10
|
|
+ ADD5 c11, t3, c11
|
|
+ ADD6 c12, t4, c12
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 6 * SIZE(BO)
|
|
+ LD a2, 7 * SIZE(BO)
|
|
+ LD a3, 4 * SIZE(BO)
|
|
+ LD a4, 5 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c10, t1
|
|
+ MUL a2, c09, t2
|
|
+ MUL a2, c12, t3
|
|
+ MUL a2, c11, t4
|
|
+
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c10, c10
|
|
+ MUL a1, c11, c11
|
|
+ MUL a1, c12, c12
|
|
+
|
|
+ ADD5 c09, t1, c09
|
|
+ ADD6 c10, t2, c10
|
|
+ ADD5 c11, t3, c11
|
|
+ ADD6 c12, t4, c12
|
|
+
|
|
+ MUL a3, c09, t1
|
|
+ MUL a3, c10, t2
|
|
+ MUL a3, c11, t3
|
|
+ MUL a3, c12, t4
|
|
+
|
|
+ SUB c01, t1, c01
|
|
+ SUB c02, t2, c02
|
|
+ SUB c03, t3, c03
|
|
+ SUB c04, t4, c04
|
|
+
|
|
+ MUL a4, c10, t1
|
|
+ MUL a4, c09, t2
|
|
+ MUL a4, c12, t3
|
|
+ MUL a4, c11, t4
|
|
+
|
|
+ ADD6 c01, t1, c01
|
|
+ ADD5 c02, t2, c02
|
|
+ ADD6 c03, t3, c03
|
|
+ ADD5 c04, t4, c04
|
|
+
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a2, c04, t3
|
|
+ MUL a2, c03, t4
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c03, c03
|
|
+ MUL a1, c04, c04
|
|
+
|
|
+ ADD5 c01, t1, c01
|
|
+ ADD6 c02, t2, c02
|
|
+ ADD5 c03, t3, c03
|
|
+ ADD6 c04, t4, c04
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c02, 1 * SIZE(BO)
|
|
+ ST c09, 2 * SIZE(BO)
|
|
+ ST c10, 3 * SIZE(BO)
|
|
+
|
|
+ ST c03, 4 * SIZE(BO)
|
|
+ ST c04, 5 * SIZE(BO)
|
|
+ ST c11, 6 * SIZE(BO)
|
|
+ ST c12, 7 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c03, 2 * SIZE(AO)
|
|
+ ST c04, 3 * SIZE(AO)
|
|
+
|
|
+ ST c09, 4 * SIZE(AO)
|
|
+ ST c10, 5 * SIZE(AO)
|
|
+ ST c11, 6 * SIZE(AO)
|
|
+ ST c12, 7 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -4 * SIZE(C1)
|
|
+ ldi C2, -4 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c03, 2 * SIZE(C1)
|
|
+ ST c04, 3 * SIZE(C1)
|
|
+
|
|
+ ST c09, 0 * SIZE(C2)
|
|
+ ST c10, 1 * SIZE(C2)
|
|
+ ST c11, 2 * SIZE(C2)
|
|
+ ST c12, 3 * SIZE(C2)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 4 * SIZE(C1)
|
|
+ ldi C2, 4 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ fclr t1
|
|
+ fclr t2
|
|
+ fclr t3
|
|
+ fclr t4
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, ZBASE_SHIFT + 1, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP1
|
|
+ addl AO, TMP1, AO
|
|
+ addl BO, TMP1, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+ fclr c01
|
|
+ fclr c05
|
|
+
|
|
+ ldi I, -1(I)
|
|
+ bgt I, $L11
|
|
+ .align 4
|
|
+
|
|
+$L20:
|
|
+ and M, 1, I
|
|
+ ble I, $L29
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(B)
|
|
+ fclr c10
|
|
+ LD b2, 1 * SIZE(B)
|
|
+ fclr c14
|
|
+
|
|
+ LD b3, 2 * SIZE(B)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ LD b4, 3 * SIZE(B)
|
|
+ ldi BO, 4 * SIZE(B)
|
|
+
|
|
+ ldi L, -2(KK)
|
|
+
|
|
+ ble KK, $L28
|
|
+ ble L, $L25
|
|
+#else
|
|
+#ifdef LN
|
|
+ sll K, ZBASE_SHIFT + 0, TMP1
|
|
+ subl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+ sll KK, ZBASE_SHIFT + 0, TMP1
|
|
+ addl AORIG, TMP1, AO
|
|
+ sll KK, ZBASE_SHIFT + 1, TMP1
|
|
+ addl B, TMP1, BO
|
|
+
|
|
+ subl K, KK, TMP1
|
|
+
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ fclr c09
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ fclr c13
|
|
+
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ fclr c02
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+ fclr c06
|
|
+
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+ fclr c10
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+ fclr c14
|
|
+
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ldi L, -2(TMP1)
|
|
+
|
|
+ ble TMP1, $L28
|
|
+ ble L, $L25
|
|
+#endif
|
|
+ .align 5
|
|
+
|
|
+$L22:
|
|
+ ADD1 c09, t1, c09
|
|
+ unop
|
|
+ MUL a1, b1, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c10, t2, c10
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD4 c13, t3, c13
|
|
+ unop
|
|
+ MUL a1, b2, t3
|
|
+ ldi BO, 8 * SIZE(BO)
|
|
+
|
|
+ ADD2 c14, t4, c14
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD b2, -7 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ unop
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ LD b3, -6 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, c05
|
|
+ unop
|
|
+ MUL a1, b4, t3
|
|
+ LD a1, 2 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, c06
|
|
+ MUL a2, b4, t4
|
|
+ LD b5, -5 * SIZE(BO)
|
|
+
|
|
+ ADD1 c09, t1, c09
|
|
+ unop
|
|
+ MUL a3, b1, t1
|
|
+ LD a2, 3 * SIZE(AO)
|
|
+
|
|
+ ADD3 c10, t2, c10
|
|
+ unop
|
|
+ MUL a4, b1, t2
|
|
+ LD b1, -4 * SIZE(BO)
|
|
+
|
|
+ ADD4 c13, t3, c13
|
|
+ unop
|
|
+ MUL a3, b2, t3
|
|
+ ldi AO, 4 * SIZE(AO)
|
|
+
|
|
+ ADD2 c14, t4, c14
|
|
+ MUL a4, b2, t4
|
|
+ LD b2, -3 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ ldi L, -2(L)
|
|
+ MUL a3, b3, t1
|
|
+ LD b4, -1 * SIZE(BO)
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL a4, b3, t2
|
|
+ LD b3, -2 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, c05
|
|
+ unop
|
|
+ MUL a3, b5, t3
|
|
+ LD a3, 0 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, c06
|
|
+ MUL a4, b5, t4
|
|
+ LD a4, 1 * SIZE(AO)
|
|
+ bgt L, $L22
|
|
+ .align 4
|
|
+
|
|
+$L25:
|
|
+ ADD1 c09, t1, c09
|
|
+ MUL a1, b1, t1
|
|
+#if defined(LT) || defined(RN)
|
|
+ blbs KK, $L27
|
|
+#else
|
|
+ blbs TMP1, $L27
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+ ADD3 c10, t2, c10
|
|
+ unop
|
|
+ MUL a2, b1, t2
|
|
+ LD b1, 0 * SIZE(BO)
|
|
+
|
|
+ ADD4 c13, t3, c13
|
|
+ unop
|
|
+ MUL a1, b2, t3
|
|
+ unop
|
|
+
|
|
+ ADD2 c14, t4, c14
|
|
+ unop
|
|
+ MUL a2, b2, t4
|
|
+ LD b2, 1 * SIZE(BO)
|
|
+
|
|
+ ADD1 c01, t1, c01
|
|
+ unop
|
|
+ MUL a1, b3, t1
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ unop
|
|
+ MUL a2, b3, t2
|
|
+ LD b3, 2 * SIZE(BO)
|
|
+
|
|
+ ADD4 c05, t3, c05
|
|
+ unop
|
|
+ MUL a1, b4, t3
|
|
+ LD a1, -2 * SIZE(AO)
|
|
+
|
|
+ ADD2 c06, t4, c06
|
|
+ unop
|
|
+ MUL a2, b4, t4
|
|
+ LD a2, -1 * SIZE(AO)
|
|
+
|
|
+ ADD1 c09, t1, c09
|
|
+ LD b4, 3 * SIZE(BO)
|
|
+ MUL a1, b1, t1
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+ .align 4
|
|
+
|
|
+$L27:
|
|
+ ADD3 c10, t2, c10
|
|
+ MUL a2, b1, t2
|
|
+ ADD4 c13, t3, c13
|
|
+ MUL a1, b2, t3
|
|
+
|
|
+ ADD2 c14, t4, c14
|
|
+ MUL a2, b2, t4
|
|
+ ADD1 c01, t1, c01
|
|
+ MUL a1, b3, t1
|
|
+
|
|
+ ADD3 c02, t2, c02
|
|
+ MUL a2, b3, t2
|
|
+ ADD4 c05, t3, c05
|
|
+ MUL a1, b4, t3
|
|
+
|
|
+ ADD2 c06, t4, c06
|
|
+ ldi AO, 2 * SIZE(AO)
|
|
+ MUL a2, b4, t4
|
|
+ ldi BO, 4 * SIZE(BO)
|
|
+
|
|
+ ADD1 c09, t1, c09
|
|
+ ADD3 c10, t2, c10
|
|
+ ADD4 c13, t3, c13
|
|
+ ADD2 c14, t4, c14
|
|
+
|
|
+ ADD c01, c06, c01
|
|
+ ADD c02, c05, c02
|
|
+ ADD c09, c14, c09
|
|
+ ADD c10, c13, c10
|
|
+ .align 4
|
|
+
|
|
+$L28:
|
|
+#if defined(LN) || defined(RT)
|
|
+#ifdef LN
|
|
+ subl KK, 1, TMP1
|
|
+#else
|
|
+ subl KK, 2, TMP1
|
|
+#endif
|
|
+ sll TMP1, ZBASE_SHIFT + 0, TMP2
|
|
+ addl AORIG, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl B, TMP2, BO
|
|
+#else
|
|
+ ldi AO, -2 * SIZE(AO)
|
|
+ ldi BO, -4 * SIZE(BO)
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+ SUB a3, c09, c09
|
|
+ SUB a4, c10, c10
|
|
+#else
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+ LD a3, 2 * SIZE(AO)
|
|
+ LD a4, 3 * SIZE(AO)
|
|
+
|
|
+ SUB a1, c01, c01
|
|
+ SUB a2, c02, c02
|
|
+ SUB a3, c09, c09
|
|
+ SUB a4, c10, c10
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ LD a1, 0 * SIZE(AO)
|
|
+ LD a2, 1 * SIZE(AO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a2, c10, t3
|
|
+ MUL a2, c09, t4
|
|
+
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c10, c10
|
|
+
|
|
+ ADD5 c01, t1, c01
|
|
+ ADD6 c02, t2, c02
|
|
+ ADD5 c09, t3, c09
|
|
+ ADD6 c10, t4, c10
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+ LD a3, 2 * SIZE(BO)
|
|
+ LD a4, 3 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+
|
|
+ ADD5 c01, t1, c01
|
|
+ ADD6 c02, t2, c02
|
|
+
|
|
+ MUL a3, c01, t1
|
|
+ MUL a3, c02, t2
|
|
+ SUB c09, t1, c09
|
|
+ SUB c10, t2, c10
|
|
+
|
|
+ MUL a4, c02, t1
|
|
+ MUL a4, c01, t2
|
|
+ ADD6 c09, t1, c09
|
|
+ ADD5 c10, t2, c10
|
|
+
|
|
+ LD a1, 6 * SIZE(BO)
|
|
+ LD a2, 7 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c10, t1
|
|
+ MUL a2, c09, t2
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c10, c10
|
|
+
|
|
+ ADD5 c09, t1, c09
|
|
+ ADD6 c10, t2, c10
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ LD a1, 6 * SIZE(BO)
|
|
+ LD a2, 7 * SIZE(BO)
|
|
+ LD a3, 4 * SIZE(BO)
|
|
+ LD a4, 5 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c10, t1
|
|
+ MUL a2, c09, t2
|
|
+ MUL a1, c09, c09
|
|
+ MUL a1, c10, c10
|
|
+
|
|
+ ADD5 c09, t1, c09
|
|
+ ADD6 c10, t2, c10
|
|
+
|
|
+ MUL a3, c09, t1
|
|
+ MUL a3, c10, t2
|
|
+ SUB c01, t1, c01
|
|
+ SUB c02, t2, c02
|
|
+
|
|
+ MUL a4, c10, t1
|
|
+ MUL a4, c09, t2
|
|
+ ADD6 c01, t1, c01
|
|
+ ADD5 c02, t2, c02
|
|
+
|
|
+ LD a1, 0 * SIZE(BO)
|
|
+ LD a2, 1 * SIZE(BO)
|
|
+
|
|
+ MUL a2, c02, t1
|
|
+ MUL a2, c01, t2
|
|
+ MUL a1, c01, c01
|
|
+ MUL a1, c02, c02
|
|
+
|
|
+ ADD5 c01, t1, c01
|
|
+ ADD6 c02, t2, c02
|
|
+#endif
|
|
+
|
|
+#if defined(LN) || defined(LT)
|
|
+ ST c01, 0 * SIZE(BO)
|
|
+ ST c02, 1 * SIZE(BO)
|
|
+ ST c09, 2 * SIZE(BO)
|
|
+ ST c10, 3 * SIZE(BO)
|
|
+#else
|
|
+ ST c01, 0 * SIZE(AO)
|
|
+ ST c02, 1 * SIZE(AO)
|
|
+ ST c09, 2 * SIZE(AO)
|
|
+ ST c10, 3 * SIZE(AO)
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ ldi C1, -2 * SIZE(C1)
|
|
+ ldi C2, -2 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+ ST c01, 0 * SIZE(C1)
|
|
+ ST c02, 1 * SIZE(C1)
|
|
+ ST c09, 0 * SIZE(C2)
|
|
+ ST c10, 1 * SIZE(C2)
|
|
+
|
|
+#ifndef LN
|
|
+ ldi C1, 2 * SIZE(C1)
|
|
+ ldi C2, 2 * SIZE(C2)
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ sll K, ZBASE_SHIFT, TMP1
|
|
+ addl AORIG, TMP1, AORIG
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ subl K, KK, TMP1
|
|
+ sll TMP1, ZBASE_SHIFT + 0, TMP2
|
|
+ addl AO, TMP2, AO
|
|
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
|
|
+ addl BO, TMP2, BO
|
|
+#endif
|
|
+
|
|
+#ifdef LT
|
|
+ addl KK, 1, KK
|
|
+#endif
|
|
+
|
|
+#ifdef LN
|
|
+ subl KK, 1, KK
|
|
+#endif
|
|
+ .align 4
|
|
+
|
|
+$L29:
|
|
+#ifdef LN
|
|
+ sll K, ZBASE_SHIFT + 1, TMP1
|
|
+ addl B, TMP1, B
|
|
+#endif
|
|
+
|
|
+#if defined(LT) || defined(RN)
|
|
+ mov BO, B
|
|
+#endif
|
|
+
|
|
+#ifdef RN
|
|
+ addl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+#ifdef RT
|
|
+ subl KK, 2, KK
|
|
+#endif
|
|
+
|
|
+ ldi J, -1(J)
|
|
+ bgt J, $L01
|
|
+ .align 4
|
|
+
|
|
+$L999:
|
|
+ fldd $f2, 0($sp)
|
|
+ fldd $f3, 8($sp)
|
|
+ fldd $f4, 16($sp)
|
|
+ fldd $f5, 24($sp)
|
|
+ fldd $f6, 32($sp)
|
|
+ fldd $f7, 40($sp)
|
|
+ fldd $f8, 48($sp)
|
|
+ fldd $f9, 56($sp)
|
|
+ clr $0
|
|
+ ldi $sp, STACKSIZE($sp)
|
|
+ ret
|
|
+ .ident VERSION
|
|
+ .end CNAME
|
|
diff --git a/lapack/laswp/sw_64/Makefile b/lapack/laswp/sw_64/Makefile
|
|
new file mode 100644
|
|
index 0000000..af1f019
|
|
--- /dev/null
|
|
+++ b/lapack/laswp/sw_64/Makefile
|
|
@@ -0,0 +1,8 @@
|
|
+TOPDIR = ../../..
|
|
+include ../../../Makefile.system
|
|
+
|
|
+LASWP = ../generic/laswp_k_1.c
|
|
+ZLASWP = ../generic/zlaswp_k_1.c
|
|
+
|
|
+include ../generic/Makefile
|
|
+
|
|
diff --git a/param.h b/param.h
|
|
index ee4640f..1a5f361 100644
|
|
--- a/param.h
|
|
+++ b/param.h
|
|
@@ -2128,7 +2128,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
#endif
|
|
|
|
-#if defined(EV4) || defined(EV5) || defined(EV6)
|
|
+#if defined(EV4) || defined(EV5) || defined(SW6)
|
|
|
|
#ifdef EV4
|
|
#define SNUMOPT 1
|
|
@@ -2140,7 +2140,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
#define GEMM_DEFAULT_OFFSET_A 512
|
|
#define GEMM_DEFAULT_OFFSET_B 512
|
|
-#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL
|
|
+#define GEMM_DEFAULT_ALIGN 0x0ffffUL
|
|
+//#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL
|
|
|
|
#define SGEMM_DEFAULT_UNROLL_M 4
|
|
#define SGEMM_DEFAULT_UNROLL_N 4
|
|
@@ -2185,7 +2186,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
#define ZGEMM_DEFAULT_Q 64
|
|
#endif
|
|
|
|
-#ifdef EV6
|
|
+#ifdef SW6
|
|
#define SGEMM_DEFAULT_P 256
|
|
#define SGEMM_DEFAULT_Q 512
|
|
|
|
--
|
|
2.31.1
|
|
|