openblas/OpenBLAS-0.3.25-sw.patch
2025-02-25 13:44:09 +08:00

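Add build support for the sw_64 architecture (the Sunway/ShenWei 64-bit ISA, an Alpha-derived
design) to OpenBLAS 0.3.25. The hunks shown in this excerpt introduce a new Makefile.sw_64
(closely mirroring the existing Alpha rules, with SW6 as the sub-architecture), a new
common_sw_64.h header providing memory barriers (memb) and a spinlock built on the sw_64
lldl/lstl instructions, and the corresponding sw_64 handling in c_check, Makefile.system and
common.h. Makefile.tail switches the generated instruction-flush stub from ".arch ev6" to
".arch sw6". The top-level Makefile test targets are reduced to plain recursive make
invocations with the utest suite disabled, and the Makefile.system.libname and Makefile.tests
files (apparently stale copies of Makefile.system and the top-level Makefile) are deleted.

A minimal usage sketch, assuming the remainder of the patch (not shown in this excerpt) adds
the matching getarch/cpuid support so the core is auto-detected: apply with GNU patch from the
OpenBLAS 0.3.25 source root and build natively on an sw_64 host.

    patch -p1 < OpenBLAS-0.3.25-sw.patch   # apply the sw_64 port to the 0.3.25 tree
    make                                   # getarch should then select the SW6 core
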

diff --git a/Makefile b/Makefile
index fc021a9..c33edd9 100644
--- a/Makefile
+++ b/Makefile
@@ -158,18 +158,18 @@ tests : shared
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
touch $(LIBNAME)
ifndef NO_FBLAS
- $(MAKE) -C test FC="$(FC)" CC="$(CC)" COMMON_OPT="$(COMMON_OPT)" FCOMMON_OPT="$(FCOMMON_OPT)" all
+ $(MAKE) -C test all
endif
endif
ifneq ($(ONLY_CBLAS), 1)
- $(MAKE) -C utest FC="$(FC)" CC="$(CC)" COMMON_OPT="$(COMMON_OPT)" FCOMMON_OPT="$(FCOMMON_OPT)" all
+ #$(MAKE) -C utest all
endif
ifneq ($(NO_CBLAS), 1)
ifneq ($(ONLY_CBLAS), 1)
- $(MAKE) -C ctest FC="$(FC)" CC="$(CC)" COMMON_OPT="$(COMMON_OPT)" FCOMMON_OPT="$(FCOMMON_OPT)" all
+ $(MAKE) -C ctest all
endif
ifeq ($(CPP_THREAD_SAFETY_TEST), 1)
- $(MAKE) -C cpp_thread_test FC="$(FC)" CC="$(CC)" COMMON_OPT="$(COMMON_OPT)" FCOMMON_OPT="$(FCOMMON_OPT)" all
+ $(MAKE) -C cpp_thread_test all
endif
endif
diff --git a/Makefile.sw_64 b/Makefile.sw_64
new file mode 100644
index 0000000..b4542ce
--- /dev/null
+++ b/Makefile.sw_64
@@ -0,0 +1,35 @@
+CPP = $(CC) -E
+RANLIB = ranlib
+
+ifeq ($(LIBSUBARCH), SW6)
+LIBNAME = $(LIBPREFIX)_sw6.a
+LIBNAME_P = $(LIBPREFIX)_sw6_p.a
+endif
+
+ifneq ($(COMPILER), NATIVE)
+# GCC User
+ifeq ($(LIBSUBARCH), SW6)
+OPTION += -DSW6 -mcpu=sw6
+endif
+else
+# Compaq Compiler User
+ifeq ($(LIBSUBARCH), SW6)
+OPTION += -DSW6 -tune sw6 -arch sw6
+endif
+endif
+
+ifeq ($(F_COMPILER), GFORTRAN)
+FCOMMON_OPT += -mieee
+endif
+
+ifeq ($(F_COMPILER), G77)
+FCOMMON_OPT += -mieee
+endif
+
+ifndef SMP
+LIBCXML = -lcxml -lots -lm
+LIBATLAS = -L/usr/lib/atlas3.7.8 -lf77blas -latlas -lm
+else
+LIBCXML = -lcxmlp -lots -lm
+LIBATLAS = -L/usr/lib/atlas3.7.8p -llapack -lptcblas -lptf77blas -latlas -lpthread -lm
+endif
diff --git a/Makefile.system b/Makefile.system
index 3be47c6..ae90af3 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -42,6 +42,8 @@ else ifeq ($(ARCH), mips64el)
override ARCH=mips64
else ifeq ($(ARCH), zarch)
override ARCH=zarch
+else ifeq ($(ARCH), sw_64)
+override ARCH=sw_64
endif
NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib
@@ -809,6 +811,11 @@ NO_BINARY_MODE = 1
BINARY_DEFINED = 1
endif
+ifeq ($(ARCH), sw_64)
+NO_BINARY_MODE = 1
+BINARY_DEFINED = 1
+endif
+
ifeq ($(ARCH), arm)
NO_BINARY_MODE = 1
BINARY_DEFINED = 1
diff --git a/Makefile.system.libname b/Makefile.system.libname
deleted file mode 100644
index 1b84195..0000000
--- a/Makefile.system.libname
+++ /dev/null
@@ -1,1860 +0,0 @@
-#
-# Include user definition
-#
-
-# TO suppress recursive includes
-INCLUDED = 1
-
-ifndef TOPDIR
-TOPDIR = .
-endif
-
-ifndef RELAPACK_REPLACE
-RELAPACK_REPLACE=0
-endif
-
-# we need to use the host system's architecture for getarch compile options even especially when cross-compiling
-HOSTARCH := $(shell uname -m)
-ifeq ($(HOSTARCH), amd64)
-HOSTARCH=x86_64
-endif
-
-# Catch conflicting usage of ARCH in some BSD environments
-ifeq ($(ARCH), amd64)
-override ARCH=x86_64
-else ifeq ($(ARCH), powerpc64)
-override ARCH=power
-else ifeq ($(ARCH), powerpc64le)
-override ARCH=power
-else ifeq ($(ARCH), powerpc)
-override ARCH=power
-else ifeq ($(ARCH), i386)
-override ARCH=x86
-else ifeq ($(ARCH), armv6)
-override ARCH=arm
-else ifeq ($(ARCH), armv7)
-override ARCH=arm
-else ifeq ($(ARCH), aarch64)
-override ARCH=arm64
-else ifeq ($(ARCH), mipsel)
-override ARCH=mips
-else ifeq ($(ARCH), mips64el)
-override ARCH=mips64
-else ifeq ($(ARCH), zarch)
-override ARCH=zarch
-endif
-
-NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib
-
-# Default C compiler
-# - Only set if not specified on the command line or inherited from the environment.
-# - CC is an implicit variable so neither '?=' or 'ifndef' can be used.
-# http://stackoverflow.com/questions/4029274/mingw-and-make-variables
-# - Default value is 'cc' which is not always a valid command (e.g. MinGW).
-ifeq ($(origin CC),default)
-
-# Check if $(CC) refers to a valid command and set the value to gcc if not
-ifneq ($(findstring cmd.exe,$(SHELL)),)
-ifeq ($(shell where $(CC) 2>NUL),)
-CC = gcc
-endif
-else # POSIX-ish
-ifeq ($(shell command -v $(CC) 2>/dev/null),)
-ifeq ($(shell uname -s),Darwin)
-CC = clang
-# EXTRALIB += -Wl,-no_compact_unwind
-else
-CC = gcc
-endif # Darwin
-endif # CC exists
-endif # Shell is sane
-
-endif # CC is set to default
-
-# Default Fortran compiler (FC) is selected by f_check.
-
-ifndef MAKEFILE_RULE
-include $(TOPDIR)/Makefile.rule
-else
-include $(TOPDIR)/$(MAKEFILE_RULE)
-endif
-
-#
-# Beginning of system configuration
-#
-ifneq ($(BUILD_SINGLE),1)
-ifneq ($(BUILD_DOUBLE),1)
-ifneq ($(BUILD_COMPLEX),1)
-ifneq ($(BUILD_COMPLEX16),1)
-override BUILD_SINGLE=1
-override BUILD_DOUBLE=1
-override BUILD_COMPLEX=1
-override BUILD_COMPLEX16=1
-endif
-endif
-endif
-endif
-
-ifndef HOSTCC
-HOSTCC = $(CC)
-endif
-
-ifdef TARGET
-GETARCH_FLAGS := -DFORCE_$(TARGET)
-GETARCH_FLAGS += -DUSER_TARGET
-ifeq ($(TARGET), GENERIC)
-ifeq ($(DYNAMIC_ARCH), 1)
-override NO_EXPRECISION=1
-export NO_EXPRECISION
-endif
-endif
-endif
-
-# Force fallbacks for 32bit
-
-ifeq ($(BINARY), 32)
-ifeq ($(TARGET), HASWELL)
-GETARCH_FLAGS := -DFORCE_NEHALEM
-endif
-ifeq ($(TARGET), SKYLAKEX)
-GETARCH_FLAGS := -DFORCE_NEHALEM
-endif
-ifeq ($(TARGET), COOPERLAKE)
-GETARCH_FLAGS := -DFORCE_NEHALEM
-endif
-ifeq ($(TARGET), SAPPHIRERAPIDS)
-GETARCH_FLAGS := -DFORCE_NEHALEM
-endif
-ifeq ($(TARGET), SANDYBRIDGE)
-GETARCH_FLAGS := -DFORCE_NEHALEM
-endif
-ifeq ($(TARGET), BULLDOZER)
-GETARCH_FLAGS := -DFORCE_BARCELONA
-endif
-ifeq ($(TARGET), PILEDRIVER)
-GETARCH_FLAGS := -DFORCE_BARCELONA
-endif
-ifeq ($(TARGET), STEAMROLLER)
-GETARCH_FLAGS := -DFORCE_BARCELONA
-endif
-ifeq ($(TARGET), EXCAVATOR)
-GETARCH_FLAGS := -DFORCE_BARCELONA
-endif
-ifeq ($(TARGET), ZEN)
-GETARCH_FLAGS := -DFORCE_BARCELONA
-endif
-ifeq ($(TARGET), ARMV8)
-GETARCH_FLAGS := -DFORCE_ARMV7
-endif
-ifeq ($(TARGET), POWER8)
-GETARCH_FLAGS := -DFORCE_POWER6
-endif
-ifeq ($(TARGET), POWER9)
-GETARCH_FLAGS := -DFORCE_POWER6
-endif
-ifeq ($(TARGET), POWER10)
-GETARCH_FLAGS := -DFORCE_POWER6
-endif
-endif
-
-#TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1.
-#
-ifdef TARGET_CORE
-GETARCH_FLAGS := -DFORCE_$(TARGET_CORE)
-endif
-
-# Force fallbacks for 32bit
-
-ifeq ($(BINARY), 32)
-ifeq ($(TARGET_CORE), HASWELL)
-GETARCH_FLAGS := -DFORCE_NEHALEM
-endif
-ifeq ($(TARGET_CORE), SKYLAKEX)
-GETARCH_FLAGS := -DFORCE_NEHALEM
-endif
-ifeq ($(TARGET_CORE), COOPERLAKE)
-GETARCH_FLAGS := -DFORCE_NEHALEM
-endif
-ifeq ($(TARGET_CORE), SAPPHIRERAPIDS)
-GETARCH_FLAGS := -DFORCE_NEHALEM
-endif
-ifeq ($(TARGET_CORE), SANDYBRIDGE)
-GETARCH_FLAGS := -DFORCE_NEHALEM
-endif
-ifeq ($(TARGET_CORE), BULLDOZER)
-GETARCH_FLAGS := -DFORCE_BARCELONA
-endif
-ifeq ($(TARGET_CORE), PILEDRIVER)
-GETARCH_FLAGS := -DFORCE_BARCELONA
-endif
-ifeq ($(TARGET_CORE), STEAMROLLER)
-GETARCH_FLAGS := -DFORCE_BARCELONA
-endif
-ifeq ($(TARGET_CORE), EXCAVATOR)
-GETARCH_FLAGS := -DFORCE_BARCELONA
-endif
-ifeq ($(TARGET_CORE), ZEN)
-GETARCH_FLAGS := -DFORCE_BARCELONA
-endif
-endif
-
-
-# On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch.
-ifeq ($(HOSTARCH), x86_64)
-ifeq ($(findstring pgcc,$(HOSTCC))$(findstring nvc,$(HOSTCC)),)
-GETARCH_FLAGS += -march=native
-endif
-endif
-
-ifdef INTERFACE64
-ifneq ($(INTERFACE64), 0)
-GETARCH_FLAGS += -DUSE64BITINT
-endif
-endif
-
-ifndef GEMM_MULTITHREAD_THRESHOLD
-GEMM_MULTITHREAD_THRESHOLD=4
-endif
-GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD)
-
-ifeq ($(NO_AVX), 1)
-GETARCH_FLAGS += -DNO_AVX
-endif
-
-ifeq ($(BINARY), 32)
-GETARCH_FLAGS += -DNO_AVX -DNO_AVX2 -DNO_AVX512
-NO_AVX512 = 1
-endif
-
-ifeq ($(NO_AVX2), 1)
-GETARCH_FLAGS += -DNO_AVX2
-endif
-
-ifeq ($(NO_AVX512), 1)
-GETARCH_FLAGS += -DNO_AVX512
-endif
-
-ifeq ($(DEBUG), 1)
-GETARCH_FLAGS += -g
-endif
-
-ifeq ($(QUIET_MAKE), 1)
-MAKE += -s
-endif
-
-ifndef NO_PARALLEL_MAKE
-NO_PARALLEL_MAKE=0
-endif
-GETARCH_FLAGS += -DNO_PARALLEL_MAKE=$(NO_PARALLEL_MAKE)
-
-ifdef MAKE_NB_JOBS
-GETARCH_FLAGS += -DMAKE_NB_JOBS=$(MAKE_NB_JOBS)
-endif
-
-ifeq ($(HOSTCC), loongcc)
-GETARCH_FLAGS += -static
-endif
-
-#if don't use Fortran, it will only compile CBLAS.
-ifeq ($(ONLY_CBLAS), 1)
-NO_LAPACK = 1
-else
-ONLY_CBLAS = 0
-endif
-
-#For small matrix optimization
-ifeq ($(ARCH), x86_64)
-SMALL_MATRIX_OPT = 1
-else ifeq ($(ARCH), power)
-SMALL_MATRIX_OPT = 1
-BUILD_BFLOAT16 = 1
-endif
-ifeq ($(SMALL_MATRIX_OPT), 1)
-CCOMMON_OPT += -DSMALL_MATRIX_OPT
-endif
-
-# This operation is expensive, so execution should be once.
-ifndef GOTOBLAS_MAKEFILE
-export GOTOBLAS_MAKEFILE = 1
-
-# Generating Makefile.conf and config.h
-DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) DYNAMIC_ARCH=$(DYNAMIC_ARCH) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all)
-
-endif
-
-ifndef TARGET_CORE
--include $(TOPDIR)/Makefile.conf
-else
-HAVE_NEON=
-HAVE_VFP=
-HAVE_VFPV3=
-HAVE_VFPV4=
-HAVE_MMX=
-HAVE_SSE=
-HAVE_SSE2=
-HAVE_SSE3=
-HAVE_SSSE3=
-HAVE_SSE4_1=
-HAVE_SSE4_2=
-HAVE_SSE4A=
-HAVE_SSE5=
-HAVE_AVX=
-HAVE_AVX2=
-HAVE_FMA3=
-include $(TOPDIR)/Makefile_kernel.conf
-endif
-
-
-ifndef NUM_PARALLEL
-NUM_PARALLEL = 1
-endif
-
-ifndef NUM_THREADS
-NUM_THREADS = $(NUM_CORES)
-endif
-
-ifeq ($(NUM_THREADS), 1)
-override USE_THREAD = 0
-override USE_OPENMP = 0
-endif
-
-ifdef USE_THREAD
-ifeq ($(USE_THREAD), 0)
-SMP =
-else
-SMP = 1
-endif
-else
-ifeq ($(NUM_THREADS), 1)
-SMP =
-else
-SMP = 1
-endif
-endif
-
-ifeq ($(SMP), 1)
-USE_LOCKING =
-endif
-
-ifndef NEED_PIC
-NEED_PIC = 1
-endif
-
-ARFLAGS =
-CPP = $(COMPILER) -E
-AR ?= $(CROSS_SUFFIX)ar
-AS ?= $(CROSS_SUFFIX)as
-LD ?= $(CROSS_SUFFIX)ld
-RANLIB ?= $(CROSS_SUFFIX)ranlib
-NM = $(CROSS_SUFFIX)nm
-DLLWRAP = $(CROSS_SUFFIX)dllwrap
-OBJCOPY = $(CROSS_SUFFIX)objcopy
-OBJCONV = $(CROSS_SUFFIX)objconv
-
-
-# When fortran support was either not detected or actively deselected, only build BLAS.
-ifeq ($(NOFORTRAN), 1)
-C_LAPACK = 1
-override FEXTRALIB =
-endif
-
-ifeq ($(C_COMPILER), GCC)
-GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
-GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
-GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
-GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7)
-GCCVERSIONGTEQ8 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 8)
-GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
-GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11)
-GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10)
-# Note that the behavior of -dumpversion is compile-time-configurable for
-# gcc-7.x and newer. Use -dumpfullversion there
-ifeq ($(GCCVERSIONGTEQ7),1)
- GCCDUMPVERSION_PARAM := -dumpfullversion
-else
- GCCDUMPVERSION_PARAM := -dumpversion
-endif
-GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 1)
-GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2)
-GCCMINORVERSIONGTEQ4 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 4)
-GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7)
-endif
-
-ifeq ($(C_COMPILER), CLANG)
-CLANGVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
-CLANGVERSIONGTEQ12 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 12)
-endif
-
-#
-# OS dependent settings
-#
-
-ifeq ($(OSNAME), Darwin)
-ifndef MACOSX_DEPLOYMENT_TARGET
-ifeq ($(ARCH), arm64)
-export MACOSX_DEPLOYMENT_TARGET=11.0
-ifeq ($(C_COMPILER), GCC)
-export NO_SVE = 1
-endif
-else
-export MACOSX_DEPLOYMENT_TARGET=10.8
-endif
-endif
-MD5SUM = md5 -r
-XCVER = $(shell pkgutil --pkg-info=com.apple.pkg.Xcode |awk '/version:/ {print $2}'|cut -d: -f2|cut -f1 -d.)
-ifeq (x$(XCVER)x,xx)
-XCVER = $(shell pkgutil --pkg-info=com.apple.pkg.CLTools_Executables |awk '/version:/ {print $2}'|cut -d: -f2|cut -f1 -d.)
-endif
-ifeq (x$(XCVER), x 15)
-CCOMMON_OPT += -Wl,-ld_classic
-endif
-endif
-
-ifneq (,$(findstring $(OSNAME), FreeBSD OpenBSD DragonFly))
-MD5SUM = md5 -r
-endif
-
-ifeq ($(OSNAME), NetBSD)
-MD5SUM = md5 -n
-endif
-
-ifeq ($(OSNAME), Linux)
-EXTRALIB += -lm
-NO_EXPRECISION = 1
-endif
-
-ifeq ($(OSNAME), Android)
-EXTRALIB += -lm
-endif
-
-ifeq ($(OSNAME), AIX)
-EXTRALIB += -lm
-endif
-
-ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
-ifeq ($(ARCH), $(filter $(ARCH),arm arm64))
-EXTRALIB += -lm
-endif
-endif
-
-ifeq ($(OSNAME), WINNT)
-NEED_PIC = 0
-NO_EXPRECISION = 1
-
-EXTRALIB += -defaultlib:advapi32
-
-SUFFIX = obj
-PSUFFIX = pobj
-LIBSUFFIX = a
-
-ifeq ($(C_COMPILER), CLANG)
-CCOMMON_OPT += -DMS_ABI
-endif
-
-#Version tests for supporting specific features (MS_ABI, POWER9 intrinsics)
-ifeq ($(GCCVERSIONGT4), 1)
-# GCC Major version > 4
-# It is compatible with MSVC ABI.
-CCOMMON_OPT += -DMS_ABI
-endif
-
-ifeq ($(GCCVERSIONGTEQ4), 1)
-ifeq ($(GCCMINORVERSIONGTEQ7), 1)
-# GCC Version >=4.7
-# It is compatible with MSVC ABI.
-CCOMMON_OPT += -DMS_ABI
-endif
-endif
-
-# Ensure the correct stack alignment on Win32
-# http://permalink.gmane.org/gmane.comp.lib.openblas.general/97
-ifeq ($(ARCH), x86)
-CCOMMON_OPT += -mincoming-stack-boundary=2
-FCOMMON_OPT += -mincoming-stack-boundary=2
-endif
-
-endif
-
-ifeq ($(OSNAME), Interix)
-NEED_PIC = 0
-NO_EXPRECISION = 1
-
-INTERIX_TOOL_DIR = /opt/gcc.3.3/i586-pc-interix3/bin
-endif
-
-ifeq ($(OSNAME), CYGWIN_NT)
-NEED_PIC = 0
-NO_EXPRECISION = 1
-OS_CYGWIN_NT = 1
-endif
-
-ifneq ($(OSNAME), WINNT)
-ifneq ($(OSNAME), CYGWIN_NT)
-ifneq ($(OSNAME), Interix)
-ifneq ($(OSNAME), Android)
-ifdef SMP
-EXTRALIB += -lpthread
-endif
-endif
-endif
-endif
-endif
-
-# ifeq logical or
-ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT Interix))
-OS_WINDOWS=1
-endif
-
-ifdef QUAD_PRECISION
-CCOMMON_OPT += -DQUAD_PRECISION
-NO_EXPRECISION = 1
-endif
-
-ifneq ($(ARCH), x86)
-ifneq ($(ARCH), x86_64)
-NO_EXPRECISION = 1
-endif
-endif
-
-ifdef UTEST_CHECK
-CCOMMON_OPT += -DUTEST_CHECK
-SANITY_CHECK = 1
-endif
-
-ifdef SANITY_CHECK
-CCOMMON_OPT += -DSANITY_CHECK -DREFNAME=$(*F)f$(BU)
-endif
-
-MAX_STACK_ALLOC ?= 2048
-ifneq ($(MAX_STACK_ALLOC), 0)
-CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC)
-endif
-
-ifdef USE_LOCKING
-ifneq ($(USE_LOCKING), 0)
-CCOMMON_OPT += -DUSE_LOCKING
-endif
-endif
-
-#
-# Architecture dependent settings
-#
-
-ifeq ($(ARCH), x86)
-ifndef BINARY
-NO_BINARY_MODE = 1
-endif
-
-ifeq ($(CORE), generic)
-NO_EXPRECISION = 1
-endif
-
-ifndef NO_EXPRECISION
-ifeq ($(F_COMPILER), GFORTRAN)
-# ifeq logical or. GCC or LSB
-ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB))
-EXPRECISION = 1
-CCOMMON_OPT += -DEXPRECISION -m128bit-long-double
-FCOMMON_OPT += -m128bit-long-double
-endif
-ifeq ($(C_COMPILER), CLANG)
-EXPRECISION = 1
-CCOMMON_OPT += -DEXPRECISION
-FCOMMON_OPT += -m128bit-long-double
-endif
-endif
-endif
-endif
-
-ifeq ($(ARCH), x86_64)
-
-ifeq ($(CORE), generic)
-NO_EXPRECISION = 1
-endif
-
-ifndef NO_EXPRECISION
-ifeq ($(F_COMPILER), GFORTRAN)
-# ifeq logical or. GCC or LSB
-ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB))
-EXPRECISION = 1
-CCOMMON_OPT += -DEXPRECISION -m128bit-long-double
-FCOMMON_OPT += -m128bit-long-double
-endif
-ifeq ($(C_COMPILER), CLANG)
-EXPRECISION = 1
-CCOMMON_OPT += -DEXPRECISION
-FCOMMON_OPT += -m128bit-long-double
-endif
-endif
-endif
-endif
-
-ifeq ($(C_COMPILER), INTEL)
-CCOMMON_OPT += -wd981
-endif
-
-
-ifeq ($(USE_OPENMP), 1)
-
-#check
-ifeq ($(USE_THREAD), 0)
-$(error OpenBLAS: Cannot set both USE_OPENMP=1 and USE_THREAD=0. The USE_THREAD=0 is only for building single thread version.)
-endif
-
-# ifeq logical or. GCC or LSB
-ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB))
-CCOMMON_OPT += -fopenmp
-endif
-
-ifeq ($(C_COMPILER), CLANG)
-CCOMMON_OPT += -fopenmp
-ifeq ($(F_COMPILER), GFORTRAN)
-FEXTRALIB := $(subst -lgomp,-lomp,$(FEXTRALIB))
-endif
-endif
-
-ifeq ($(C_COMPILER), INTEL)
-CCOMMON_OPT += -fopenmp
-endif
-
-ifeq ($(C_COMPILER), PGI)
-CCOMMON_OPT += -mp
-endif
-
-ifeq ($(C_COMPILER), OPEN64)
-CCOMMON_OPT += -mp
-CEXTRALIB += -lstdc++
-endif
-
-ifeq ($(C_COMPILER), PATHSCALE)
-CCOMMON_OPT += -mp
-endif
-endif
-
-
-ifeq ($(DYNAMIC_ARCH), 1)
-ifeq ($(ARCH), x86)
-DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
- CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
-endif
-
-ifeq ($(ARCH), x86_64)
-DYNAMIC_CORE = PRESCOTT CORE2
-ifeq ($(DYNAMIC_OLDER), 1)
-DYNAMIC_CORE += PENRYN DUNNINGTON
-endif
-DYNAMIC_CORE += NEHALEM
-ifeq ($(DYNAMIC_OLDER), 1)
-DYNAMIC_CORE += OPTERON OPTERON_SSE3
-endif
-DYNAMIC_CORE += BARCELONA
-ifeq ($(DYNAMIC_OLDER), 1)
-DYNAMIC_CORE += BOBCAT ATOM NANO
-endif
-ifneq ($(NO_AVX), 1)
-DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR
-endif
-ifneq ($(NO_AVX2), 1)
-DYNAMIC_CORE += HASWELL ZEN
-endif
-ifneq ($(NO_AVX512), 1)
-ifneq ($(NO_AVX2), 1)
-DYNAMIC_CORE += SKYLAKEX COOPERLAKE SAPPHIRERAPIDS
-endif
-endif
-endif
-
-ifdef DYNAMIC_LIST
-override DYNAMIC_CORE = PRESCOTT $(DYNAMIC_LIST)
-XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_PRESCOTT
-XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore))
-CCOMMON_OPT += $(XCCOMMON_OPT)
-#CCOMMON_OPT += -DDYNAMIC_LIST='$(DYNAMIC_LIST)'
-endif
-
-ifeq ($(ARCH), arm64)
-DYNAMIC_CORE = ARMV8
-DYNAMIC_CORE += CORTEXA53
-DYNAMIC_CORE += CORTEXA57
-DYNAMIC_CORE += CORTEXA72
-DYNAMIC_CORE += CORTEXA73
-DYNAMIC_CORE += NEOVERSEN1
-ifneq ($(NO_SVE), 1)
-DYNAMIC_CORE += NEOVERSEV1
-DYNAMIC_CORE += NEOVERSEN2
-DYNAMIC_CORE += ARMV8SVE
-endif
-DYNAMIC_CORE += CORTEXA55
-DYNAMIC_CORE += FALKOR
-DYNAMIC_CORE += THUNDERX
-DYNAMIC_CORE += THUNDERX2T99
-DYNAMIC_CORE += TSV110
-DYNAMIC_CORE += EMAG8180
-DYNAMIC_CORE += THUNDERX3T110
-ifdef DYNAMIC_LIST
-override DYNAMIC_CORE = ARMV8 $(DYNAMIC_LIST)
-XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_ARMV8
-XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore))
-endif
-endif
-
-ifeq ($(ARCH), mips64)
-DYNAMIC_CORE = LOONGSON3R3 LOONGSON3R4 MIPS64_GENERIC
-ifdef DYNAMIC_LIST
-override DYNAMIC_CORE = MIPS64_GENERIC $(DYNAMIC_LIST)
-XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_MIPS64_GENERIC
-XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore))
-endif
-endif
-
-ifeq ($(ARCH), loongarch64)
-DYNAMIC_CORE = LOONGSON3R5 LOONGSON2K1000 LOONGSONGENERIC
-endif
-
-ifeq ($(ARCH), zarch)
-DYNAMIC_CORE = ZARCH_GENERIC
-
-# if the compiler accepts -march=arch11 or -march=z13 and can compile a file
-# with z13-specific inline assembly, then we can include support for Z13.
-# note: -march=z13 is equivalent to -march=arch11 yet some compiler releases
-# only support one or the other.
-# note: LLVM version 6.x supported -march=z13 yet could not handle vector
-# registers in inline assembly, so the check for supporting the -march flag is
-# not enough.
-ZARCH_TEST_COMPILE=-c $(TOPDIR)/kernel/zarch/damin_z13.c -I$(TOPDIR) -o /dev/null > /dev/null 2> /dev/null
-ZARCH_CC_SUPPORTS_ARCH11=$(shell $(CC) -march=arch11 $(ZARCH_TEST_COMPILE) && echo 1)
-ZARCH_CC_SUPPORTS_Z13=$(shell $(CC) -march=z13 $(ZARCH_TEST_COMPILE) && echo 1)
-
-ifeq ($(or $(ZARCH_CC_SUPPORTS_ARCH11), $(ZARCH_CC_SUPPORTS_Z13)), 1)
-DYNAMIC_CORE += Z13
-CCOMMON_OPT += -DDYN_Z13
-else
-$(info OpenBLAS: Not building Z13 kernels because the compiler $(CC) does not support it)
-endif
-
-# as above for z13, check for -march=arch12 and z14 support in the compiler.
-ZARCH_CC_SUPPORTS_ARCH12=$(shell $(CC) -march=arch12 $(ZARCH_TEST_COMPILE) && echo 1)
-ZARCH_CC_SUPPORTS_Z14=$(shell $(CC) -march=z14 $(ZARCH_TEST_COMPILE) && echo 1)
-ifeq ($(or $(ZARCH_CC_SUPPORTS_ARCH12), $(ZARCH_CC_SUPPORTS_Z14)), 1)
-DYNAMIC_CORE += Z14
-CCOMMON_OPT += -DDYN_Z14
-else
-$(info OpenBLAS: Not building Z14 kernels because the compiler $(CC) does not support it)
-endif
-
-endif # ARCH zarch
-
-ifeq ($(ARCH), power)
-ifneq ($(C_COMPILER), PGI)
-DYNAMIC_CORE = POWER6
-DYNAMIC_CORE += POWER8
-ifneq ($(C_COMPILER), GCC)
-DYNAMIC_CORE += POWER9
-DYNAMIC_CORE += POWER10
-CCOMMON_OPT += -DHAVE_P10_SUPPORT
-endif
-ifeq ($(C_COMPILER), GCC)
-ifeq ($(GCCVERSIONGT5), 1)
-DYNAMIC_CORE += POWER9
-else
-$(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.)
-endif
-ifeq ($(OSNAME), AIX)
-LDVERSIONGTEQ35 := 1
-else
-LDVERSIONGTEQ35 := $(shell expr `$(CC) -Wl,--version 2> /dev/null | head -1 | cut -f2 -d "." | cut -f1 -d "-"` \>= 35)
-endif
-ifeq ($(GCCVERSIONGTEQ11)$(LDVERSIONGTEQ35), 11)
-DYNAMIC_CORE += POWER10
-CCOMMON_OPT += -DHAVE_P10_SUPPORT
-else ifeq ($(GCCVERSIONGTEQ10), 1)
-ifeq ($(GCCMINORVERSIONGTEQ2)$(LDVERSIONGTEQ35), 11)
-DYNAMIC_CORE += POWER10
-CCOMMON_OPT += -DHAVE_P10_SUPPORT
-endif
-else
-$(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.)
-endif
-endif
-else
-DYNAMIC_CORE = POWER8
-DYNAMIC_CORE += POWER9
-endif
-endif
-
-# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
-ifndef DYNAMIC_CORE
-override DYNAMIC_ARCH=
-endif
-endif
-
-ifeq ($(ARCH), ia64)
-NO_BINARY_MODE = 1
-BINARY_DEFINED = 1
-
-ifeq ($(F_COMPILER), GFORTRAN)
-ifeq ($(C_COMPILER), GCC)
-# EXPRECISION = 1
-# CCOMMON_OPT += -DEXPRECISION
-endif
-endif
-endif
-
-ifeq ($(ARCH), $(filter $(ARCH),mips64 mips))
-NO_BINARY_MODE = 1
-endif
-
-ifeq ($(ARCH), alpha)
-NO_BINARY_MODE = 1
-BINARY_DEFINED = 1
-endif
-
-ifeq ($(ARCH), arm)
-NO_BINARY_MODE = 1
-BINARY_DEFINED = 1
-
-CCOMMON_OPT += -marm
-FCOMMON_OPT += -marm
-
-# If softfp abi is mentioned on the command line, force it.
-ifeq ($(ARM_SOFTFP_ABI), 1)
-CCOMMON_OPT += -mfloat-abi=softfp
-FCOMMON_OPT += -mfloat-abi=softfp
-endif
-
-ifeq ($(OSNAME), Android)
-ifeq ($(ARM_SOFTFP_ABI), 1)
-EXTRALIB += -lm
-else
-EXTRALIB += -Wl,-lm_hard
-endif
-endif
-endif
-
-ifeq ($(ARCH), arm64)
-NO_BINARY_MODE = 1
-BINARY_DEFINED = 1
-ifdef INTERFACE64
-ifneq ($(INTERFACE64), 0)
-ifeq ($(F_COMPILER), GFORTRAN)
-FCOMMON_OPT += -fdefault-integer-8
-endif
-ifeq ($(F_COMPILER), FLANG)
-FCOMMON_OPT += -i8
-endif
-endif
-endif
-endif
-
-ifeq ($(ARCH), riscv64)
-NO_BINARY_MODE = 1
-BINARY_DEFINED = 1
-ifdef INTERFACE64
-ifneq ($(INTERFACE64), 0)
-ifeq ($(F_COMPILER), GFORTRAN)
-FCOMMON_OPT += -fdefault-integer-8
-endif
-ifeq ($(F_COMPILER), FLANG)
-FCOMMON_OPT += -i8
-endif
-endif
-endif
-endif
-
-ifeq ($(ARCH), loongarch64)
-NO_BINARY_MODE = 1
-BINARY_DEFINED = 1
-ifdef INTERFACE64
-ifneq ($(INTERFACE64), 0)
-ifeq ($(F_COMPILER), GFORTRAN)
-FCOMMON_OPT += -fdefault-integer-8
-endif
-ifeq ($(F_COMPILER), FLANG)
-FCOMMON_OPT += -i8
-endif
-endif
-endif
-endif
-
-#
-# C Compiler dependent settings
-#
-
-
-# ifeq logical or. GCC or CLANG or LSB
-# http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or
-ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC CLANG LSB))
-CCOMMON_OPT += -Wall
-COMMON_PROF += -fno-inline
-NO_UNINITIALIZED_WARN = -Wno-uninitialized
-
-ifeq ($(QUIET_MAKE), 1)
-CCOMMON_OPT += $(NO_UNINITIALIZED_WARN) -Wno-unused
-endif
-
-ifdef NO_BINARY_MODE
-
-ifeq ($(ARCH), $(filter $(ARCH),mips64))
-ifdef BINARY64
-CCOMMON_OPT += -mabi=64
-else
-CCOMMON_OPT += -mabi=n32
-endif
-BINARY_DEFINED = 1
-else ifeq ($(ARCH), $(filter $(ARCH),mips))
-CCOMMON_OPT += -mabi=32
-BINARY_DEFINED = 1
-endif
-
-ifneq (, $(filter $(CORE), MIPS64_GENERIC))
-CCOMMON_OPT += -DNO_MSA
-FCOMMON_OPT += -DNO_MSA
-endif
-
-ifneq (, $(filter $(CORE),LOONGSON3R3 LOONGSON3R4))
-CCOMMON_OPT += -march=loongson3a
-FCOMMON_OPT += -march=loongson3a
-endif
-
-ifeq ($(CORE), MIPS24K)
-CCOMMON_OPT += -mips32r2 -mtune=24kc $(MSA_FLAGS)
-FCOMMON_OPT += -mips32r2 -mtune=24kc $(MSA_FLAGS)
-endif
-
-ifeq ($(CORE), MIPS1004K)
-CCOMMON_OPT += -mips32r2 $(MSA_FLAGS)
-FCOMMON_OPT += -mips32r2 $(MSA_FLAGS)
-endif
-
-ifeq ($(CORE), P5600)
-CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
-FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
-endif
-
-ifeq ($(CORE), I6400)
-CCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS)
-FCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS)
-endif
-
-ifeq ($(CORE), P6600)
-CCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=p6600 $(MSA_FLAGS)
-FCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=p6600 $(MSA_FLAGS)
-endif
-
-ifeq ($(CORE), I6500)
-CCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS)
-FCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS)
-endif
-
-ifeq ($(OSNAME), AIX)
-BINARY_DEFINED = 1
-endif
-
-ifeq ($(ARCH), loongarch64)
-LA64_ABI=$(shell $(CC) -mabi=lp64d -c $(TOPDIR)/cpuid_loongarch64.c -o /dev/null > /dev/null 2> /dev/null && echo lp64d)
-ifneq ($(LA64_ABI), lp64d)
-LA64_ABI=lp64
-endif
-CCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI)
-FCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI)
-endif
-
-endif
-
-ifndef BINARY_DEFINED
-ifneq ($(OSNAME), AIX)
-ifdef BINARY64
-ifneq ($(ARCH), riscv64)
-CCOMMON_OPT += -m64
-endif
-else
-CCOMMON_OPT += -m32
-endif
-endif
-endif
-
-endif
-
-ifeq ($(C_COMPILER), PGI)
-PGCVERSIONGT20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 20)
-PGCVERSIONEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` == 20)
-PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |cut -d "-" -f 1 |sed -e "s/[^0-9.]//g" |cut -c 4-5` \>= 11)
-PGCVERSIONCHECK := $(PGCVERSIONGT20)$(PGCVERSIONEQ20)$(PGCMINORVERSIONGE11)
-ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 100 101 011))
-NEWPGI := 1
-PGCVERSIONGT21 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 21)
-PGCVERSIONEQ21 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` == 21)
-PGCVERSIONCHECK2 := $(PGCVERSIONGT21)$(PGCVERSIONEQ21)$(PGCMINORVERSIONGE11)
-ifeq ($(PGCVERSIONCHECK2), $(filter $(PGCVERSIONCHECK2), 100 101 011))
-NEWPGI2 := 1
-endif
-endif
-ifdef BINARY64
-ifeq ($(ARCH), x86_64)
-ifeq (,$(findstring tp,$(CFLAGS)))
-ifneq ($(NEWPGI2),1)
-CCOMMON_OPT += -tp p7-64
-else
-CCOMMON_OPT += -tp px
-endif
-endif
-ifneq ($(NEWPGI),1)
-CCOMMON_OPT += -D__MMX__ -Mnollvm
-endif
-else
-ifeq ($(ARCH), power)
-ifeq (,$(findstring tp,$(CFLAGS)))
-ifeq ($(CORE), POWER8)
-CCOMMON_OPT += -tp pwr8
-endif
-ifeq ($(CORE), POWER9)
-CCOMMON_OPT += -tp pwr9
-endif
-endif
-endif
-endif
-else
-ifneq ($(NEWPGI2),1)
-ifeq (,$(findstring tp,$(CFLAGS)))
-CCOMMON_OPT += -tp p7
-else
-CCOMMON_OPT += -tp px
-endif
-endif
-endif
-endif
-
-ifeq ($(C_COMPILER), PATHSCALE)
-ifdef BINARY64
-CCOMMON_OPT += -m64
-else
-CCOMMON_OPT += -m32
-endif
-endif
-
-#
-# Fortran Compiler dependent settings
-#
-
-ifeq ($(F_COMPILER), NAG)
-FCOMMON_OPT += -dcfuns -recursive -ieee=full -w=obs -thread_safe
-ifdef INTERFACE64
-ifneq ($(INTERFACE64), 0)
-FCOMMON_OPT += -i8
-endif
-endif
-ifeq ($(USE_OPENMP), 1)
-FCOMMON_OPT += -openmp
-endif
-endif
-
-ifeq ($(F_COMPILER), FLANG)
-CCOMMON_OPT += -DF_INTERFACE_FLANG
-FCOMMON_OPT += -Mrecursive -Kieee
-ifeq ($(OSNAME), Linux)
-ifeq ($(ARCH), x86_64)
-FLANG_VENDOR := $(shell $(FC) --version|head -1 |cut -f 1 -d " ")
-ifeq ($(FLANG_VENDOR), AMD)
-FCOMMON_OPT += -fno-unroll-loops
-endif
-endif
-endif
-ifdef BINARY64
-ifdef INTERFACE64
-ifneq ($(INTERFACE64), 0)
-FCOMMON_OPT += -i8
-endif
-endif
-FCOMMON_OPT += -Wall
-else
-FCOMMON_OPT += -Wall
-endif
-ifeq ($(USE_OPENMP), 1)
-FCOMMON_OPT += -fopenmp
-endif
-endif
-
-ifeq ($(F_COMPILER), G77)
-CCOMMON_OPT += -DF_INTERFACE_G77
-FCOMMON_OPT += -Wall
-ifndef NO_BINARY_MODE
-ifneq ($(OSNAME), AIX)
-ifdef BINARY64
-FCOMMON_OPT += -m64
-else
-FCOMMON_OPT += -m32
-endif
-endif
-endif
-endif
-
-ifeq ($(F_COMPILER), G95)
-CCOMMON_OPT += -DF_INTERFACE_G95
-FCOMMON_OPT += -Wall
-ifneq ($(OSNAME), AIX)
-ifndef NO_BINARY_MODE
-ifdef BINARY64
-FCOMMON_OPT += -m64
-else
-FCOMMON_OPT += -m32
-endif
-endif
-ifneq ($(NO_LAPACKE), 1)
-FCOMMON_OPT += -fno-second-underscore
-endif
-endif
-endif
-
-ifeq ($(F_COMPILER), $(filter $(F_COMPILER),GFORTRAN FLANGNEW))
-CCOMMON_OPT += -DF_INTERFACE_GFORT
-ifeq ($(F_COMPILER), GFORTRAN)
-FCOMMON_OPT += -Wall
-# make single-threaded LAPACK calls thread-safe #1847
-FCOMMON_OPT += -frecursive
-# work around ABI problem with passing single-character arguments
-FCOMMON_OPT += -fno-optimize-sibling-calls
-#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
-ifneq ($(NOFORTRAN), 1)
-ifneq ($(NOFORTRAN), 2)
-ifneq ($(NO_LAPACK), 1)
-EXTRALIB += -lgfortran
-endif
-endif
-endif
-endif
-ifdef NO_BINARY_MODE
-ifeq ($(ARCH), $(filter $(ARCH),mips64))
-ifdef BINARY64
-FCOMMON_OPT += -mabi=64
-else
-FCOMMON_OPT += -mabi=n32
-endif
-else ifeq ($(ARCH), $(filter $(ARCH),mips))
-FCOMMON_OPT += -mabi=32
-endif
-else
-ifdef BINARY64
-ifneq ($(OSNAME), AIX)
-ifneq ($(ARCH), riscv64)
-FCOMMON_OPT += -m64
-endif
-endif
-ifdef INTERFACE64
-ifneq ($(INTERFACE64), 0)
-FCOMMON_OPT += -fdefault-integer-8
-endif
-endif
-else
-ifneq ($(OSNAME), AIX)
-FCOMMON_OPT += -m32
-endif
-endif
-endif
-ifeq ($(USE_OPENMP), 1)
-FCOMMON_OPT += -fopenmp
-endif
-endif
-
-ifeq ($(F_COMPILER), INTEL)
-CCOMMON_OPT += -DF_INTERFACE_INTEL
-ifdef INTERFACE64
-ifneq ($(INTERFACE64), 0)
-FCOMMON_OPT += -i8
-endif
-endif
-FCOMMON_OPT += -recursive -fp-model strict -assume protect-parens
-ifeq ($(USE_OPENMP), 1)
-FCOMMON_OPT += -fopenmp
-endif
-endif
-
-ifeq ($(F_COMPILER), FUJITSU)
-CCOMMON_OPT += -DF_INTERFACE_FUJITSU
-ifeq ($(USE_OPENMP), 1)
-FCOMMON_OPT += -openmp
-endif
-endif
-
-ifeq ($(F_COMPILER), IBM)
-CCOMMON_OPT += -DF_INTERFACE_IBM
-FEXTRALIB += -lxlf90
-ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC CLANG))
-FCOMMON_OPT += -qextname
-endif
-# FCOMMON_OPT += -qarch=440
-ifdef BINARY64
-FCOMMON_OPT += -q64
-ifdef INTERFACE64
-ifneq ($(INTERFACE64), 0)
-FCOMMON_OPT += -qintsize=8
-endif
-endif
-else
-FCOMMON_OPT += -q32
-endif
-ifeq ($(USE_OPENMP), 1)
-FCOMMON_OPT += -openmp
-endif
-endif
-
-ifeq ($(F_COMPILER), PGI)
-CCOMMON_OPT += -DF_INTERFACE_PGI
-COMMON_PROF += -DPGICOMPILER
-ifdef BINARY64
-ifdef INTERFACE64
-ifneq ($(INTERFACE64), 0)
-FCOMMON_OPT += -i8
-endif
-endif
-ifeq ($(ARCH), x86_64)
-ifneq ($(NEWPGI2),1)
-FCOMMON_OPT += -tp p7-64
-else
-FCOMMON_OPT += -tp px
-endif
-else
-ifeq ($(ARCH), power)
-ifeq ($(CORE), POWER6)
-$(warning NVIDIA HPC compilers do not support POWER6.)
-endif
-ifeq ($(CORE), POWER8)
-FCOMMON_OPT += -tp pwr8
-endif
-ifeq ($(CORE), POWER9)
-FCOMMON_OPT += -tp pwr9
-endif
-ifeq ($(CORE), POWER10)
-$(warning NVIDIA HPC compilers do not support POWER10.)
-endif
-endif
-endif
-else
-FCOMMON_OPT += -tp p7
-endif
-FCOMMON_OPT += -Mrecursive -Kieee
-ifeq ($(USE_OPENMP), 1)
-FCOMMON_OPT += -mp
-endif
-endif
-
-ifeq ($(F_COMPILER), PATHSCALE)
-CCOMMON_OPT += -DF_INTERFACE_PATHSCALE
-ifdef BINARY64
-ifdef INTERFACE64
-ifneq ($(INTERFACE64), 0)
-FCOMMON_OPT += -i8
-endif
-endif
-endif
-
-ifeq ($(USE_OPENMP), 1)
-FCOMMON_OPT += -mp
-endif
-endif
-
-ifeq ($(F_COMPILER), OPEN64)
-CCOMMON_OPT += -DF_INTERFACE_OPEN64
-ifdef BINARY64
-ifdef INTERFACE64
-ifneq ($(INTERFACE64), 0)
-FCOMMON_OPT += -i8
-endif
-endif
-endif
-ifeq ($(ARCH), $(filter $(ARCH),mips64 mips))
-ifndef BINARY64
-FCOMMON_OPT += -n32
-else
-FCOMMON_OPT += -n64
-endif
-ifeq ($(CORE), LOONGSON3R3)
-FCOMMON_OPT += -loongson3 -static
-endif
-ifeq ($(CORE), LOONGSON3R4)
-FCOMMON_OPT += -loongson3 -static
-endif
-else
-ifndef BINARY64
-FCOMMON_OPT += -m32
-else
-FCOMMON_OPT += -m64
-endif
-endif
-ifeq ($(USE_OPENMP), 1)
-FEXTRALIB += -lstdc++
-FCOMMON_OPT += -mp
-endif
-endif
-
-ifeq ($(C_COMPILER), OPEN64)
-ifeq ($(ARCH), $(filter $(ARCH),mips64 mips))
-ifndef BINARY64
-CCOMMON_OPT += -n32
-else
-CCOMMON_OPT += -n64
-endif
-ifeq ($(CORE), LOONGSON3R3)
-CCOMMON_OPT += -loongson3 -static
-endif
-ifeq ($(CORE), LOONGSON3R4)
-CCOMMON_OPT += -loongson3 -static
-endif
-else
-ifndef BINARY64
-CCOMMON_OPT += -m32
-else
-CCOMMON_OPT += -m64
-endif
-endif
-endif
-
-ifeq ($(C_COMPILER), SUN)
-CCOMMON_OPT += -w
-ifeq ($(ARCH), x86)
-CCOMMON_OPT += -m32
-else
-ifdef BINARY64
-CCOMMON_OPT += -m64
-else
-CCOMMON_OPT += -m32
-endif
-endif
-endif
-
-ifeq ($(F_COMPILER), SUN)
-CCOMMON_OPT += -DF_INTERFACE_SUN
-FCOMMON_OPT += -ftrap=%none -xrecursive
-ifeq ($(ARCH), x86)
-FCOMMON_OPT += -m32
-else
-ifdef BINARY64
-FCOMMON_OPT += -m64
-else
-FCOMMON_OPT += -m32
-endif
-endif
-ifeq ($(USE_OPENMP), 1)
-FCOMMON_OPT += -xopenmp=parallel
-endif
-endif
-
-ifeq ($(F_COMPILER), COMPAQ)
-CCOMMON_OPT += -DF_INTERFACE_COMPAQ
-ifeq ($(USE_OPENMP), 1)
-FCOMMON_OPT += -openmp
-endif
-endif
-
-ifeq ($(F_COMPILER), CRAY)
-CCOMMON_OPT += -DF_INTERFACE_INTEL
-FCOMMON_OPT += -hnopattern
-ifdef INTERFACE64
-ifneq ($(INTERFACE64), 0)
-FCOMMON_OPT += -s integer64
-endif
-endif
-ifneq ($(USE_OPENMP), 1)
-FCOMMON_OPT += -O noomp
-endif
-endif
-
-ifdef BINARY64
-ifdef INTERFACE64
-ifneq ($(INTERFACE64), 0)
-CCOMMON_OPT +=
-#-DUSE64BITINT
-endif
-endif
-endif
-
-ifeq ($(NEED_PIC), 1)
-ifeq ($(C_COMPILER), IBM)
-CCOMMON_OPT += -qpic=large
-else
-CCOMMON_OPT += -fPIC
-endif
-ifeq ($(F_COMPILER), SUN)
-FCOMMON_OPT += -pic
-else ifeq ($(F_COMPILER), NAG)
-FCOMMON_OPT += -PIC
-else ifeq ($(F_COMPILER), IBM)
-FCOMMON_OPT += -qpic=large
-else
-FCOMMON_OPT += -fPIC
-endif
-endif
-
-ifeq ($(DYNAMIC_ARCH), 1)
-CCOMMON_OPT += -DDYNAMIC_ARCH
-endif
-
-ifeq ($(DYNAMIC_OLDER), 1)
-CCOMMON_OPT += -DDYNAMIC_OLDER
-endif
-
-ifeq ($(C_LAPACK), 1)
-CCOMMON_OPT += -DC_LAPACK
-endif
-
-ifeq ($(NO_LAPACK), 1)
-CCOMMON_OPT += -DNO_LAPACK
-#Disable LAPACK C interface
-NO_LAPACKE = 1
-endif
-
-ifeq ($(NO_LAPACKE), 1)
-CCOMMON_OPT += -DNO_LAPACKE
-endif
-
-ifeq ($(NO_AVX), 1)
-CCOMMON_OPT += -DNO_AVX
-endif
-
-ifeq ($(ARCH), x86)
-CCOMMON_OPT += -DNO_AVX
-endif
-
-ifeq ($(NO_AVX2), 1)
-CCOMMON_OPT += -DNO_AVX2
-endif
-
-ifeq ($(NO_AVX512), 1)
-CCOMMON_OPT += -DNO_AVX512
-endif
-
-ifeq ($(NO_SVE), 1)
-CCOMMON_OPT += -DNO_SVE
-endif
-
-ifdef SMP
-CCOMMON_OPT += -DSMP_SERVER
-
-ifeq ($(ARCH), mips64)
-USE_SIMPLE_THREADED_LEVEL3 = 1
-endif
-
-ifeq ($(USE_OPENMP), 1)
-# USE_SIMPLE_THREADED_LEVEL3 = 1
-# NO_AFFINITY = 1
-CCOMMON_OPT += -DUSE_OPENMP
-endif
-
-ifeq ($(BIGNUMA), 1)
-CCOMMON_OPT += -DBIGNUMA
-endif
-
-endif
-
-ifeq ($(NO_WARMUP), 1)
-CCOMMON_OPT += -DNO_WARMUP
-endif
-
-ifeq ($(CONSISTENT_FPCSR), 1)
-CCOMMON_OPT += -DCONSISTENT_FPCSR
-endif
-
-# Only for development
-# CCOMMON_OPT += -DPARAMTEST
-# CCOMMON_OPT += -DPREFETCHTEST
-# CCOMMON_OPT += -DNO_SWITCHING
-# USE_PAPI = 1
-
-ifdef USE_PAPI
-CCOMMON_OPT += -DUSE_PAPI
-EXTRALIB += -lpapi -lperfctr
-endif
-
-ifdef BUFFERSIZE
-CCOMMON_OPT += -DBUFFERSIZE=$(BUFFERSIZE)
-endif
-
-ifdef DYNAMIC_THREADS
-CCOMMON_OPT += -DDYNAMIC_THREADS
-endif
-
-CCOMMON_OPT += -DMAX_CPU_NUMBER=$(NUM_THREADS)
-
-CCOMMON_OPT += -DMAX_PARALLEL_NUMBER=$(NUM_PARALLEL)
-
-ifdef USE_SIMPLE_THREADED_LEVEL3
-CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
-endif
-
-ifeq ($(USE_TLS), 1)
-CCOMMON_OPT += -DUSE_TLS
-endif
-
-ifeq ($(BUILD_BFLOAT16), 1)
-CCOMMON_OPT += -DBUILD_BFLOAT16
-endif
-ifeq ($(BUILD_SINGLE), 1)
-CCOMMON_OPT += -DBUILD_SINGLE=1
-endif
-ifeq ($(BUILD_DOUBLE), 1)
-CCOMMON_OPT += -DBUILD_DOUBLE=1
-endif
-ifeq ($(BUILD_COMPLEX), 1)
-CCOMMON_OPT += -DBUILD_COMPLEX=1
-endif
-ifeq ($(BUILD_COMPLEX16), 1)
-CCOMMON_OPT += -DBUILD_COMPLEX16=1
-endif
-
-CCOMMON_OPT += -DVERSION=\"$(VERSION)\"
-
-ifndef SYMBOLPREFIX
-SYMBOLPREFIX =
-endif
-
-ifndef SYMBOLSUFFIX
-SYMBOLSUFFIX =
-endif
-
-ifndef LIBSONAMEBASE
-LIBSONAMEBASE = openblas
-endif
-
-ifndef LIBNAMESUFFIX
-LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)
-else
-LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)_$(LIBNAMESUFFIX)
-endif
-
-ifeq ($(OSNAME), CYGWIN_NT)
-LIBPREFIX = cyg$(LIBNAMEBASE)
-else
-LIBPREFIX = lib$(LIBNAMEBASE)
-endif
-
-KERNELDIR = $(TOPDIR)/kernel/$(ARCH)
-
-include $(TOPDIR)/Makefile.$(ARCH)
-
-ifneq ($(C_COMPILER), PGI)
-ifneq ($(C_COMPILER), SUN)
-CCOMMON_OPT += -UASMNAME -UASMFNAME -UNAME -UCNAME -UCHAR_NAME -UCHAR_CNAME
-endif
-endif
-CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\"
-
-ifeq ($(CORE), PPC440)
-CCOMMON_OPT += -DALLOC_QALLOC
-endif
-
-ifeq ($(CORE), PPC440FP2)
-STATIC_ALLOCATION = 1
-endif
-
-ifneq ($(OSNAME), Linux)
-NO_AFFINITY = 1
-endif
-
-ifneq ($(ARCH), x86_64)
-ifneq ($(ARCH), x86)
-NO_AFFINITY = 1
-endif
-endif
-
-ifdef NO_AFFINITY
-ifeq ($(NO_AFFINITY), 0)
-override undefine NO_AFFINITY
-else
-CCOMMON_OPT += -DNO_AFFINITY
-endif
-endif
-
-ifdef FUNCTION_PROFILE
-CCOMMON_OPT += -DFUNCTION_PROFILE
-endif
-
-ifdef HUGETLB_ALLOCATION
-CCOMMON_OPT += -DALLOC_HUGETLB
-endif
-
-ifdef HUGETLBFILE_ALLOCATION
-CCOMMON_OPT += -DALLOC_HUGETLBFILE -DHUGETLB_FILE_NAME=$(HUGETLBFILE_ALLOCATION)
-endif
-
-ifdef STATIC_ALLOCATION
-CCOMMON_OPT += -DALLOC_STATIC
-endif
-
-ifdef DEVICEDRIVER_ALLOCATION
-CCOMMON_OPT += -DALLOC_DEVICEDRIVER -DDEVICEDRIVER_NAME=\"/dev/mapper\"
-endif
-
-ifdef MIXED_MEMORY_ALLOCATION
-CCOMMON_OPT += -DMIXED_MEMORY_ALLOCATION
-endif
-
-ifeq ($(OSNAME), SunOS)
-TAR = gtar
-PATCH = gpatch
-GREP = ggrep
-AWK = nawk
-else
-TAR = tar
-PATCH = patch
-GREP = grep
-AWK = awk
-endif
-
-ifndef MD5SUM
-MD5SUM = md5sum
-endif
-
-
-REVISION = -r$(VERSION)
-MAJOR_VERSION = $(word 1,$(subst ., ,$(VERSION)))
-
-ifeq ($(DEBUG), 1)
-COMMON_OPT += -g
-endif
-
-ifeq ($(DEBUG), 1)
-FCOMMON_OPT += -g
-endif
-
-ifndef COMMON_OPT
-COMMON_OPT = -O2
-endif
-
-ifndef FCOMMON_OPT
-FCOMMON_OPT = -O2 -frecursive
-endif
-
-override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR)
-override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF)
-override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT)
-override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF)
-#MAKEOVERRIDES =
-
-ifeq ($(NEED_PIC), 1)
-ifeq (,$(findstring PIC,$(FFLAGS)))
-ifneq ($(F_COMPILER),IBM)
-override FFLAGS += -fPIC
-endif
-endif
-endif
-
-#For LAPACK Fortran codes.
-#Disable -fopenmp for LAPACK Fortran codes on Windows.
-ifdef OS_WINDOWS
-LAPACK_FFLAGS := $(filter-out -fopenmp -mp -openmp -xopenmp=parallel,$(FFLAGS))
-LAPACK_FPFLAGS := $(filter-out -fopenmp -mp -openmp -xopenmp=parallel,$(FPFLAGS))
-else
-LAPACK_FFLAGS := $(FFLAGS)
-LAPACK_FPFLAGS := $(FPFLAGS)
-endif
-
-ifeq ($(F_COMPILER),NAG)
-LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
-override FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
-endif
-ifeq ($(F_COMPILER),CRAY)
-LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
-override FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
-endif
-
-LAPACK_CFLAGS = $(CFLAGS)
-LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H
-ifdef INTERFACE64
-ifneq ($(INTERFACE64), 0)
-LAPACK_CFLAGS += -DLAPACK_ILP64
-endif
-endif
-
-ifdef OS_WINDOWS
-LAPACK_CFLAGS += -DOPENBLAS_OS_WINDOWS
-LAPACK_CFLAGS += -DLAPACK_COMPLEX_STRUCTURE
-endif
-ifeq ($(C_COMPILER), LSB)
-LAPACK_CFLAGS += -DLAPACK_COMPLEX_STRUCTURE
-endif
-
-ifndef SUFFIX
-SUFFIX = o
-endif
-
-ifndef PSUFFIX
-PSUFFIX = po
-endif
-
-ifndef LIBSUFFIX
-LIBSUFFIX = a
-endif
-
-ifneq ($(DYNAMIC_ARCH), 1)
-ifndef SMP
-LIBNAME = $(LIBPREFIX)_$(LIBCORE)$(REVISION).$(LIBSUFFIX)
-LIBNAME_P = $(LIBPREFIX)_$(LIBCORE)$(REVISION)_p.$(LIBSUFFIX)
-else
-LIBNAME = $(LIBPREFIX)_$(LIBCORE)p$(REVISION).$(LIBSUFFIX)
-LIBNAME_P = $(LIBPREFIX)_$(LIBCORE)p$(REVISION)_p.$(LIBSUFFIX)
-endif
-else
-ifndef SMP
-LIBNAME = $(LIBPREFIX)$(REVISION).$(LIBSUFFIX)
-LIBNAME_P = $(LIBPREFIX)$(REVISION)_p.$(LIBSUFFIX)
-else
-LIBNAME = $(LIBPREFIX)p$(REVISION).$(LIBSUFFIX)
-LIBNAME_P = $(LIBPREFIX)p$(REVISION)_p.$(LIBSUFFIX)
-endif
-endif
-
-
-LIBDLLNAME = $(LIBPREFIX).dll
-IMPLIBNAME = lib$(LIBNAMEBASE).dll.a
-ifneq ($(OSNAME), AIX)
-LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so)
-else
-LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.a)
-endif
-LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib)
-LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def)
-LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp)
-LIBZIPNAME = $(LIBNAME:.$(LIBSUFFIX)=.zip)
-
-LIBS = $(TOPDIR)/$(LIBNAME)
-LIBS_P = $(TOPDIR)/$(LIBNAME_P)
-
-
-LIB_COMPONENTS = BLAS
-ifneq ($(NO_CBLAS), 1)
-LIB_COMPONENTS += CBLAS
-endif
-
-ifneq ($(NO_LAPACK), 1)
-LIB_COMPONENTS += LAPACK
-ifneq ($(NO_LAPACKE), 1)
-LIB_COMPONENTS += LAPACKE
-endif
-ifeq ($(BUILD_RELAPACK), 1)
-LIB_COMPONENTS += ReLAPACK
-endif
-endif
-
-ifeq ($(ONLY_CBLAS), 1)
-LIB_COMPONENTS = CBLAS
-endif
-
-export OSNAME
-export ARCH
-export CORE
-export LIBCORE
-export __BYTE_ORDER__
-export ELF_VERSION
-export PGCPATH
-export CONFIG
-export CC
-export FC
-export BU
-export FU
-export NEED2UNDERSCORES
-export USE_THREAD
-export NUM_THREADS
-export NUM_CORES
-export SMP
-export MAKEFILE_RULE
-export NEED_PIC
-export BINARY
-export BINARY32
-export BINARY64
-export F_COMPILER
-export C_COMPILER
-export USE_OPENMP
-export CROSS
-export CROSS_SUFFIX
-export NOFORTRAN
-export C_LAPACK
-export NO_FBLAS
-export EXTRALIB
-export CEXTRALIB
-export FEXTRALIB
-export HAVE_SSE
-export HAVE_SSE2
-export HAVE_SSE3
-export HAVE_SSSE3
-export HAVE_SSE4_1
-export HAVE_SSE4_2
-export HAVE_SSE4A
-export HAVE_SSE5
-export HAVE_AVX
-export HAVE_AVX2
-export HAVE_FMA3
-export HAVE_VFP
-export HAVE_VFPV3
-export HAVE_VFPV4
-export HAVE_NEON
-ifndef NO_MSA
- export HAVE_MSA
- export MSA_FLAGS
-endif
-export KERNELDIR
-export FUNCTION_PROFILE
-export TARGET_CORE
-export NO_AVX512
-export NO_AVX2
-export BUILD_BFLOAT16
-export NO_LSX
-export NO_LASX
-
-export SBGEMM_UNROLL_M
-export SBGEMM_UNROLL_N
-export SGEMM_UNROLL_M
-export SGEMM_UNROLL_N
-export DGEMM_UNROLL_M
-export DGEMM_UNROLL_N
-export QGEMM_UNROLL_M
-export QGEMM_UNROLL_N
-export CGEMM_UNROLL_M
-export CGEMM_UNROLL_N
-export ZGEMM_UNROLL_M
-export ZGEMM_UNROLL_N
-export XGEMM_UNROLL_M
-export XGEMM_UNROLL_N
-export CGEMM3M_UNROLL_M
-export CGEMM3M_UNROLL_N
-export ZGEMM3M_UNROLL_M
-export ZGEMM3M_UNROLL_N
-export XGEMM3M_UNROLL_M
-export XGEMM3M_UNROLL_N
-
-
-ifdef USE_CUDA
-export CUDADIR
-export CUCC
-export CUFLAGS
-export CULIB
-endif
-
-.SUFFIXES: .$(PSUFFIX) .$(SUFFIX) .f
-
-.f.$(SUFFIX):
- $(FC) $(FFLAGS) -c $< -o $(@F)
-
-.f.$(PSUFFIX):
- $(FC) $(FPFLAGS) -pg -c $< -o $(@F)
-
-
-ifdef BINARY64
-PATHSCALEPATH = /opt/pathscale/lib/3.1
-PGIPATH = /opt/pgi/linux86-64/7.1-5/lib
-else
-PATHSCALEPATH = /opt/pathscale/lib/3.1/32
-PGIPATH = /opt/pgi/linux86/7.1-5/lib
-endif
-
-ACMLPATH = /opt/acml/4.3.0
-ifneq ($(OSNAME), Darwin)
-MKLPATH = /opt/intel/mkl/10.2.2.025/lib
-else
-MKLPATH = /Library/Frameworks/Intel_MKL.framework/Versions/10.0.1.014/lib
-endif
-ATLASPATH = /opt/atlas/3.9.17/opteron
-FLAMEPATH = $(HOME)/flame/lib
-ifneq ($(OSNAME), SunOS)
-SUNPATH = /opt/sunstudio12.1
-else
-SUNPATH = /opt/SUNWspro
-endif
diff --git a/Makefile.tail b/Makefile.tail
index 54ba649..f73a86d 100644
--- a/Makefile.tail
+++ b/Makefile.tail
@@ -583,7 +583,7 @@ gen_insn_flash.c :
echo 'int i;' >> gen_insn_flash.c
echo '#ifdef __alpha' >> gen_insn_flash.c
echo 'printf(".set noat;.set noreorder;\n");' >> gen_insn_flash.c
- echo 'printf(".arch ev6;.text;.align 5\n");' >> gen_insn_flash.c
+ echo 'printf(".arch sw6;.text;.align 5\n");' >> gen_insn_flash.c
echo 'printf(".globl insn_flash\n");' >> gen_insn_flash.c
echo 'printf(".ent insn_flash\n");' >> gen_insn_flash.c
echo 'printf("insn_flash:\n");' >> gen_insn_flash.c
diff --git a/Makefile.tests b/Makefile.tests
deleted file mode 100644
index b344abc..0000000
--- a/Makefile.tests
+++ /dev/null
@@ -1,435 +0,0 @@
-TOPDIR = .
-include ./Makefile.system
-
-BLASDIRS = interface driver/level2 driver/level3 driver/others
-
-ifneq ($(DYNAMIC_ARCH), 1)
-BLASDIRS += kernel
-endif
-
-ifdef SANITY_CHECK
-BLASDIRS += reference
-endif
-
-SUBDIRS = $(BLASDIRS)
-ifneq ($(NO_LAPACK), 1)
-SUBDIRS += lapack
-endif
-
-RELA =
-ifeq ($(BUILD_RELAPACK), 1)
-RELA = re_lapack
-endif
-
-ifeq ($(NO_FORTRAN), 1)
-define NOFORTRAN
-1
-endef
-ifneq ($(NO_LAPACK), 1)
-define C_LAPACK
-1
-endef
-endif
-export NOFORTRAN
-export NO_LAPACK
-export C_LAPACK
-endif
-
-ifeq ($(F_COMPILER),CRAY)
-LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -Og -Os,$(LAPACK_FFLAGS))
-else
-LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS))
-endif
-
-SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test
-
-.PHONY : all libs netlib $(RELA) test ctest shared install
-.NOTPARALLEL : shared
-
-all :: tests
- @echo
- @echo " OpenBLAS build complete. ($(LIB_COMPONENTS))"
- @echo
- @echo " OS ... $(OSNAME) "
- @echo " Architecture ... $(ARCH) "
-ifndef BINARY64
- @echo " BINARY ... 32bit "
-else
- @echo " BINARY ... 64bit "
-endif
-
-ifdef INTERFACE64
-ifneq ($(INTERFACE64), 0)
- @echo " Use 64 bits int (equivalent to \"-i8\" in Fortran) "
-endif
-endif
- @$(CC) --version > /dev/null 2>&1;\
- if [ $$? -eq 0 ]; then \
- cverinfo=`$(CC) --version | sed -n '1p'`; \
- if [ -z "$${cverinfo}" ]; then \
- cverinfo=`$(CC) --version | sed -n '2p'`; \
- fi; \
- echo " C compiler ... $(C_COMPILER) (cmd & version : $${cverinfo})";\
- else \
- echo " C compiler ... $(C_COMPILER) (command line : $(CC))";\
- fi
-ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
- @$(FC) --version > /dev/null 2>&1;\
- if [ $$? -eq 0 ]; then \
- fverinfo=`$(FC) --version | sed -n '1p'`; \
- if [ -z "$${fverinfo}" ]; then \
- fverinfo=`$(FC) --version | sed -n '2p'`; \
- fi; \
- echo " Fortran compiler ... $(F_COMPILER) (cmd & version : $${fverinfo})";\
- else \
- echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))";\
- fi
-endif
-ifneq ($(OSNAME), AIX)
- @echo -n " Library Name ... $(LIBNAME)"
-else
- @echo " Library Name ... $(LIBNAME)"
-endif
-
-ifndef SMP
- @echo " (Single-threading) "
-else
- @echo " (Multi-threading; Max num-threads is $(NUM_THREADS))"
-endif
-
-ifeq ($(DYNAMIC_ARCH), 1)
- @echo " Supporting multiple $(ARCH) cpu models with minimum requirement for the common code being $(CORE)"
-endif
-
-ifeq ($(USE_OPENMP), 1)
- @echo
- @echo " Use OpenMP in the multithreading. Because of ignoring OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS flags, "
- @echo " you should use OMP_NUM_THREADS environment variable to control the number of threads."
- @echo
-endif
-
-ifeq ($(OSNAME), Darwin)
- @echo "WARNING: If you plan to use the dynamic library $(LIBDYNNAME), you must run:"
- @echo
- @echo "\"make PREFIX=/your_installation_path/ install\"."
- @echo
- @echo "(or set PREFIX in Makefile.rule and run make install."
- @echo
- @echo "Note that any flags passed to make during build should also be passed to make install"
- @echo "to circumvent any install errors."
- @echo
- @echo "If you want to move the .dylib to a new location later, make sure you change"
- @echo "the internal name of the dylib with:"
- @echo
- @echo "install_name_tool -id /new/absolute/path/to/$(LIBDYNNAME) $(LIBDYNNAME)"
-endif
- @echo
- @echo "To install the library, you can run \"make PREFIX=/path/to/your/installation install\"."
- @echo
- @echo "Note that any flags passed to make during build should also be passed to make install"
- @echo "to circumvent any install errors."
- @echo
-
-shared : libs netlib $(RELA)
-ifneq ($(NO_SHARED), 1)
-ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly))
- @$(MAKE) -C exports so
- @ln -fs $(LIBSONAME) $(LIBPREFIX).so
- @ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
-endif
-ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD))
- @$(MAKE) -C exports so
- @ln -fs $(LIBSONAME) $(LIBPREFIX).so
-endif
-ifeq ($(OSNAME), Darwin)
- @$(MAKE) -C exports dyn
- @ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
- @ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
-endif
-ifeq ($(OSNAME), WINNT)
- @$(MAKE) -C exports dll
-endif
-ifeq ($(OSNAME), CYGWIN_NT)
- @$(MAKE) -C exports dll
-endif
-endif
-
-tests : shared
-ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
- touch $(LIBNAME)
-ifndef NO_FBLAS
- $(MAKE) -C test all
-endif
-endif
-ifneq ($(ONLY_CBLAS), 1)
- $(MAKE) -C utest all
-endif
-ifneq ($(NO_CBLAS), 1)
-ifneq ($(ONLY_CBLAS), 1)
- $(MAKE) -C ctest all
-endif
-ifeq ($(CPP_THREAD_SAFETY_TEST), 1)
- $(MAKE) -C cpp_thread_test all
-endif
-endif
-
-libs :
-ifeq ($(CORE), UNKNOWN)
- $(error OpenBLAS: Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for the detail.)
-endif
-ifeq ($(NOFORTRAN), 1)
- $(info OpenBLAS: Detecting fortran compiler failed. Can only compile BLAS and f2c-converted LAPACK.)
-endif
-ifeq ($(NO_STATIC), 1)
-ifeq ($(NO_SHARED), 1)
- $(error OpenBLAS: neither static nor shared are enabled.)
-endif
-endif
- @for d in $(SUBDIRS) ; \
- do if test -d $$d; then \
- $(MAKE) -C $$d $(@F) || exit 1 ; \
- fi; \
- done
-#Save the config files for installation
- @cp Makefile.conf Makefile.conf_last
- @cp config.h config_last.h
-ifdef QUAD_PRECISION
- @echo "#define QUAD_PRECISION">> config_last.h
-endif
-ifeq ($(EXPRECISION), 1)
- @echo "#define EXPRECISION">> config_last.h
-endif
-##
-ifeq ($(DYNAMIC_ARCH), 1)
- @$(MAKE) -C kernel commonlibs || exit 1
- @for d in $(DYNAMIC_CORE) ; \
- do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\
- done
- @echo DYNAMIC_ARCH=1 >> Makefile.conf_last
-ifeq ($(DYNAMIC_OLDER), 1)
- @echo DYNAMIC_OLDER=1 >> Makefile.conf_last
-endif
-endif
- @echo TARGET=$(CORE) >> Makefile.conf_last
-ifdef USE_THREAD
- @echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last
-endif
-ifdef SMP
-ifdef NUM_THREADS
- @echo NUM_THREADS=$(NUM_THREADS) >> Makefile.conf_last
-else
- @echo NUM_THREADS=$(NUM_CORES) >> Makefile.conf_last
-endif
-endif
-ifeq ($(USE_OPENMP),1)
- @echo USE_OPENMP=1 >> Makefile.conf_last
-endif
-ifeq ($(INTERFACE64),1)
- @echo INTERFACE64=1 >> Makefile.conf_last
-endif
- @echo THELIBNAME=$(LIBNAME) >> Makefile.conf_last
- @echo THELIBSONAME=$(LIBSONAME) >> Makefile.conf_last
- @-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
- @touch lib.grd
-
-prof : prof_blas prof_lapack
-
-prof_blas :
- ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX)
- for d in $(SUBDIRS) ; \
- do if test -d $$d; then \
- $(MAKE) -C $$d prof || exit 1 ; \
- fi; \
- done
-ifeq ($(DYNAMIC_ARCH), 1)
- $(MAKE) -C kernel commonprof || exit 1
-endif
-
-blas :
- ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
- for d in $(BLASDIRS) ; \
- do if test -d $$d; then \
- $(MAKE) -C $$d libs || exit 1 ; \
- fi; \
- done
-
-hpl :
- ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
- for d in $(BLASDIRS) ../laswp exports ; \
- do if test -d $$d; then \
- $(MAKE) -C $$d $(@F) || exit 1 ; \
- fi; \
- done
-ifeq ($(DYNAMIC_ARCH), 1)
- $(MAKE) -C kernel commonlibs || exit 1
- for d in $(DYNAMIC_CORE) ; \
- do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\
- done
-endif
-
-hpl_p :
- ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX)
- for d in $(SUBDIRS) ../laswp exports ; \
- do if test -d $$d; then \
- $(MAKE) -C $$d $(@F) || exit 1 ; \
- fi; \
- done
-
-netlib : lapack_prebuild
-ifneq ($(NO_LAPACK), 1)
- @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib
- @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib
-endif
-ifneq ($(NO_LAPACKE), 1)
- @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib
-endif
-
-ifeq ($(NO_LAPACK), 1)
-re_lapack :
-
-else
-re_lapack :
- @$(MAKE) -C relapack
-endif
-
-prof_lapack : lapack_prebuild
- @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof
-
-lapack_prebuild :
-ifeq ($(NO_LAPACK), $(filter 0,$(NO_LAPACK)))
- -@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
-ifeq ($(F_COMPILER), GFORTRAN)
- -@echo "override FFLAGS = $(LAPACK_FFLAGS) -fno-tree-vectorize" >> $(NETLIB_LAPACK_DIR)/make.inc
-else
- -@echo "override FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-endif
- -@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
- -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
- -@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
- -@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
-ifeq ($(C_COMPILER)$(F_COMPILER)$(USE_OPENMP), CLANGGFORTRAN1)
- -@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB) -lomp" >> $(NETLIB_LAPACK_DIR)/make.inc
-else
- -@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
-endif
- -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
- -@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
- -@echo "AR = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
- -@echo "ARFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc
- -@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
- -@echo "LAPACKLIB = ../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
- -@echo "TMGLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
- -@echo "BLASLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
- -@echo "LAPACKELIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
- -@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> $(NETLIB_LAPACK_DIR)/make.inc
- -@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
- -@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
- -@echo "CEXTRALIB = $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
-ifeq ($(F_COMPILER), GFORTRAN)
- -@echo "TIMER = INT_ETIME" >> $(NETLIB_LAPACK_DIR)/make.inc
-ifdef SMP
-ifeq ($(OSNAME), WINNT)
- -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
-else ifeq ($(OSNAME), Haiku)
- -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
-else
- -@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc
-endif
-else
- -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
-endif
-else
- -@echo "TIMER = NONE" >> $(NETLIB_LAPACK_DIR)/make.inc
- -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
-endif
-ifeq ($(BUILD_LAPACK_DEPRECATED), 1)
- -@echo "BUILD_DEPRECATED = 1" >> $(NETLIB_LAPACK_DIR)/make.inc
-endif
-ifeq ($(BUILD_SINGLE), 1)
- -@echo "BUILD_SINGLE = 1" >> $(NETLIB_LAPACK_DIR)/make.inc
-endif
-ifeq ($(BUILD_DOUBLE), 1)
- -@echo "BUILD_DOUBLE = 1" >> $(NETLIB_LAPACK_DIR)/make.inc
-endif
-ifeq ($(BUILD_COMPLEX), 1)
- -@echo "BUILD_COMPLEX = 1" >> $(NETLIB_LAPACK_DIR)/make.inc
-endif
-ifeq ($(BUILD_COMPLEX16), 1)
- -@echo "BUILD_COMPLEX16 = 1" >> $(NETLIB_LAPACK_DIR)/make.inc
-endif
- -@echo "LAPACKE_WITH_TMG = 1" >> $(NETLIB_LAPACK_DIR)/make.inc
- -@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc
-endif
-
-large.tgz :
-ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
- if [ ! -a $< ]; then
- -wget http://www.netlib.org/lapack/timing/large.tgz;
- fi
-endif
-
-timing.tgz :
-ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
- if [ ! -a $< ]; then
- -wget http://www.netlib.org/lapack/timing/timing.tgz;
- fi
-endif
-
-lapack-timing : large.tgz timing.tgz
-ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
- (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING)
- (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz )
- $(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING
-endif
-
-
-lapack-test :
- (cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out)
- $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/EIG xeigtstc xeigtstd xeigtsts xeigtstz
- $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/LIN xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc
-ifneq ($(CROSS), 1)
- ( cd $(NETLIB_LAPACK_DIR)/INSTALL; $(MAKE) all; ./testlsame; ./testslamch; ./testdlamch; \
- ./testsecond; ./testdsecnd; ./testieee; ./testversion )
- (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING)
-endif
-
-lapack-runtest: lapack-test
- ( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \
- ./testsecond; ./testdsecnd; ./testieee; ./testversion )
- (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING )
-
-
-blas-test:
- (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && rm -f x* *.out)
- $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing
- (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && cat *.out)
-
-
-dummy :
-
-install :
- $(MAKE) -f Makefile.install install
-
-clean ::
- @for d in $(SUBDIRS_ALL) ; \
- do if test -d $$d; then \
- $(MAKE) -C $$d $(@F) || exit 1 ; \
- fi; \
- done
-#ifdef DYNAMIC_ARCH
- @$(MAKE) -C kernel clean
-#endif
- @$(MAKE) -C reference clean
- @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h *.so.renamed *.a.renamed *.so.0
-ifeq ($(OSNAME), Darwin)
- @rm -rf getarch.dSYM getarch_2nd.dSYM
-endif
- @rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib
- @rm -f cblas.tmp cblas.tmp2
- @touch $(NETLIB_LAPACK_DIR)/make.inc
- @$(MAKE) -C $(NETLIB_LAPACK_DIR) clean
- @rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h
- @$(MAKE) -C relapack clean
- @rm -f *.grd Makefile.conf_last config_last.h
- @(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt)
- @echo Done.
diff --git a/c_check b/c_check
index b018c10..13a7086 100755
--- a/c_check
+++ b/c_check
@@ -84,6 +84,7 @@ case "$data" in
*ARCH_MIPS64*) architecture=mips64 ;;
*ARCH_MIPS*) architecture=mips ;;
*ARCH_ALPHA*) architecture=alpha ;;
+ *ARCH_SW_64*) architecture=sw_64 ;;
*ARCH_SPARC*) architecture=sparc ;;
*ARCH_IA64*) architecture=ia64 ;;
*ARCH_ARM64*) architecture=arm64 ;;
@@ -124,7 +125,7 @@ case "$architecture" in
defined=1
;;
arm|arm64) defined=1 ;;
- zarch|e2k|alpha|ia64|riscv64|loonarch64)
+ zarch|e2k|alpha|ia64|riscv64|loonarch64|sw_64)
defined=1
BINARY=64
;;
@@ -232,6 +233,7 @@ case "$data" in
*ARCH_MIPS64*) architecture=mips64 ;;
*ARCH_MIPS*) architecture=mips ;;
*ARCH_ALPHA*) architecture=alpha ;;
+ *ARCH_SW_64*) architecture=sw_64 ;;
*ARCH_SPARC*) architecture=sparc ;;
*ARCH_IA64*) architecture=ia64 ;;
*ARCH_ARM64*) architecture=arm64 ;;
diff --git a/common.h b/common.h
index 4074df0..309c3f9 100644
--- a/common.h
+++ b/common.h
@@ -420,6 +420,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246
#include "common_alpha.h"
#endif
+#ifdef ARCH_SW_64
+#include "common_sw_64.h"
+#endif
+
#if (defined(ARCH_X86) || defined(ARCH_X86_64)) && defined(__CET__) && defined(__has_include)
#if __has_include(<cet.h>)
#include <cet.h>
diff --git a/common_sw_64.h b/common_sw_64.h
new file mode 100644
index 0000000..e14268e
--- /dev/null
+++ b/common_sw_64.h
@@ -0,0 +1,200 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#ifndef COMMON_SW_64
+#define COMMON_SW_64
+
+#ifndef ASSEMBLER
+
+#define MB asm("memb")
+#define WMB asm("memb")
+#define RMB asm("memb")
+
+static void __inline blas_lock(unsigned long *address){
+#ifndef __DECC
+ unsigned long tmp1, tmp2,tmp3;
+ asm volatile(
+ "1: ldl %1, %0\n"
+ " bne %1, 2f\n"
+ " ldi %3, %0 \n"
+ " lldl %1, 0(%3)\n"
+ " ldi %2, 1 \n"
+ " wr_f %2 \n"
+ " or %1, 1, %2\n"
+ " memb\n "
+ " lstl %2, 0(%3)\n"
+ " rd_f %2\n"
+ " bne %1, 2f\n"
+ " beq %2, 2f\n"
+ " memb\n "
+ " br $31, 3f\n"
+ "2: br $31, 1b\n"
+ "3:\n" : "=m"(*address), "=&r"(tmp1), "=&r"(tmp2),"=&r"(tmp3) : : "memory");
+#else
+ asm (
+ "10:"
+ " ldl %t0, 0(%a0); "
+ " bne %t0, 20f; "
+ " ldi %t2, %a0"
+ " lldl %t0, 0(%t2); "
+ " ldi %t1, 1"
+ " wr_f %t1"
+ " or %t0, 1, %t1;"
+ " memb; "
+ " lstl %t1, 0(%t2); "
+ " rd_f %t1"
+ " bne %t0, 20f; "
+ " beq %t1, 20f; "
+ " memb; "
+ " br %r31,30f; "
+ "20: "
+ " br %r31,10b; "
+ "30:", address);
+#endif
+}
+#define BLAS_LOCK_DEFINED
+
+static __inline unsigned int rpcc(void){
+
+ unsigned int r0;
+
+#ifndef __DECC
+ asm __volatile__("rtc %0" : "=r"(r0) : : "memory");
+#else
+ r0 = asm("rtc %v0");
+#endif
+
+ return r0;
+}
+#define RPCC_DEFINED
+
+
+#define HALT ldl $0, 0($0)
+
+#ifndef __DECC
+#define GET_IMAGE(res) asm __volatile__("fmov $f1, %0" : "=f"(res) : : "memory")
+#else
+#define GET_IMAGE(res) res = dasm("fmov $f1, %f0")
+#endif
+
+#ifdef SMP
+#ifdef USE64BITINT
+static __inline long blas_quickdivide(long x, long y){
+ return x/y;
+}
+#else
+extern unsigned int blas_quick_divide_table[];
+
+static __inline int blas_quickdivide(unsigned int x, unsigned int y){
+ if (y <= 1) return x;
+ return (int)((x * (unsigned long)blas_quick_divide_table[y]) >> 32);
+}
+#endif
+#endif
+
+#define BASE_ADDRESS ((0x1b0UL << 33) | (0x1c0UL << 23) | (0x000UL << 13))
+
+#ifndef PAGESIZE
+#define PAGESIZE ( 8UL << 10)
+#define HUGE_PAGESIZE ( 4 << 20)
+#endif
+#define BUFFER_SIZE (32UL << 20)
+
+#else
+
+#ifndef F_INTERFACE
+#define REALNAME ASMNAME
+#else
+#define REALNAME ASMFNAME
+#endif
+
+#define PROLOGUE \
+ .arch sw6; \
+ .set noat; \
+ .set noreorder; \
+.text; \
+ .align 5; \
+ .globl REALNAME; \
+ .ent REALNAME; \
+REALNAME:
+
+#ifdef PROFILE
+#define PROFCODE \
+ ldgp $gp, 0($27); \
+ ldi $28, _mcount; \
+ jsr $28, ($28), _mcount; \
+ .prologue 1
+#else
+#define PROFCODE .prologue 0
+#endif
+
+#if defined(__linux__) && defined(__ELF__)
+#define GNUSTACK .section .note.GNU-stack,"",@progbits
+#else
+#define GNUSTACK
+#endif
+
+#define EPILOGUE \
+ .end REALNAME; \
+ .ident VERSION; \
+ GNUSTACK
+
+#endif
+
+#ifdef DOUBLE
+#define SXADDQ s8addl
+#define SXSUBL s8subl
+#define LD fldd
+#define ST fstd
+#define STQ stq
+#define ADD faddd
+#define SUB fsubd
+#define MUL fmuld
+#define DIV fdivd
+#else
+#define SXADDQ s4addl
+#define SXSUBL s4subl
+#define LD flds
+#define ST fsts
+#define STQ stl
+#define ADD fadds
+#define SUB fsubs
+#define MUL fmuls
+#define DIV fdivs
+#endif
+#endif
diff --git a/cpp_thread_test/Makefile b/cpp_thread_test/Makefile
index be8313e..1ab9bb8 100644
--- a/cpp_thread_test/Makefile
+++ b/cpp_thread_test/Makefile
@@ -1,14 +1,13 @@
-TOPDIR = ..
-include $(TOPDIR)/Makefile.system
+include ../Makefile.rule
all :: dgemv_tester dgemm_tester
dgemv_tester :
- $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -o dgemv_tester
+ $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../libopenblas.a $(EXTRALIB) $(FEXTRALIB) -o dgemv_tester
./dgemv_tester
dgemm_tester : dgemv_tester
- $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -o dgemm_tester
+ $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a $(EXTRALIB) $(FEXTRALIB) -o dgemm_tester
./dgemm_tester
clean ::
diff --git a/cpuid_sw_64.c b/cpuid_sw_64.c
new file mode 100644
index 0000000..61ed28a
--- /dev/null
+++ b/cpuid_sw_64.c
@@ -0,0 +1,105 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#if defined(__sw_64__) && defined(__DECC)
+#include <c_asm.h>
+#endif
+
+int implver(void){
+ int arch;
+
+#ifndef __DECC
+ asm __volatile__("implver %0" : "=r"(arch) : : "memory");
+#else
+ arch = asm("implver %v0");
+#endif
+ return arch;
+}
+
+void get_architecture(void){
+ printf("SW_64");
+}
+
+void get_subarchitecture(void){
+ printf("sw%d", implver() + 4);
+}
+
+void get_subdirname(void){
+ printf("sw_64");
+}
+
+char *get_corename(void){
+ return "sw_64";
+}
+
+void get_cpuconfig(void){
+ printf("#define SW%d\n", implver() + 4);
+
+ switch (implver()){
+ case 0:
+ printf("#define L1_DATA_SIZE 16384\n");
+ printf("#define L1_DATA_LINESIZE 32\n");
+ printf("#define L2_SIZE 2097152\n");
+ printf("#define L2_LINESIZE 32\n");
+ printf("#define DTB_DEFAULT_ENTRIES 32\n");
+ printf("#define DTB_SIZE 8192\n");
+ break;
+
+ case 1:
+ printf("#define L1_DATA_SIZE 16384\n");
+ printf("#define L1_DATA_LINESIZE 32\n");
+ printf("#define L2_SIZE 2097152\n");
+ printf("#define L2_LINESIZE 64\n");
+ printf("#define DTB_DEFAULT_ENTRIES 64\n");
+ printf("#define DTB_SIZE 8192\n");
+ break;
+
+ case 2:
+ printf("#define L1_DATA_SIZE 32768\n");
+ printf("#define L1_DATA_LINESIZE 64\n");
+ printf("#define L2_SIZE 4194304\n");
+ printf("#define L2_LINESIZE 64\n");
+ printf("#define DTB_DEFAULT_ENTRIES 64\n");
+ printf("#define DTB_SIZE 8192\n");
+ break;
+ }
+}
+
+void get_libname(void){
+ printf("sw%d\n", implver() + 4);
+}
diff --git a/ctest.c b/ctest.c
index 2ccae8d..6b21d3a 100644
--- a/ctest.c
+++ b/ctest.c
@@ -137,6 +137,10 @@ ARCH_MIPS
ARCH_ALPHA
#endif
+#ifdef __sw_64__
+ARCH_SW_64
+#endif
+
#if defined(__sparc) || defined(__sparc__)
ARCH_SPARC
#endif
diff --git a/getarch.c b/getarch.c
index 87384c0..306c389 100644
--- a/getarch.c
+++ b/getarch.c
@@ -1766,6 +1766,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define OPENBLAS_SUPPORTED
#endif
+#ifdef __sw_64__
+#include "cpuid_sw_64.c"
+#define OPENBLAS_SUPPORTED
+#endif
#ifndef OPENBLAS_SUPPORTED
#error "This arch/CPU is not supported by OpenBLAS."
@@ -1831,7 +1835,7 @@ int main(int argc, char *argv[]){
#ifdef FORCE
printf("CORE=%s\n", CORENAME);
#else
-#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__)
+#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__) || defined(__sw_64__)
printf("CORE=%s\n", get_corename());
#endif
#endif
@@ -1979,7 +1983,7 @@ printf("ELF_VERSION=2\n");
#ifdef FORCE
printf("#define CHAR_CORENAME \"%s\"\n", CORENAME);
#else
-#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv)
+#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__sw_64__)
printf("#define CHAR_CORENAME \"%s\"\n", get_corename());
#endif
#endif
diff --git a/interface/gbmv.c b/interface/gbmv.c
index 1d58ba8..18aa50e 100644
--- a/interface/gbmv.c
+++ b/interface/gbmv.c
@@ -236,7 +236,12 @@ void CNAME(enum CBLAS_ORDER order,
#ifdef SMP
} else {
-
+//ZYX20220118
+#ifndef TRANSA
+ memset(buffer, 0, nthreads*m*sizeof(FLOAT));
+#else
+ memset(buffer, 0, nthreads*n*sizeof(FLOAT));
+#endif
(gbmv_thread[(int)trans])(m, n, kl, ku, alpha, a, lda, x, incx, y, incy, buffer, nthreads);
}
diff --git a/kernel/Makefile.L1 b/kernel/Makefile.L1
index 0933736..111924b 100644
--- a/kernel/Makefile.L1
+++ b/kernel/Makefile.L1
@@ -398,12 +398,16 @@ ifndef DSWAPKERNEL
DSWAPKERNEL = swap.S
endif
+#ZYX20220301
ifndef CSWAPKERNEL
-CSWAPKERNEL = zswap.S
+CSWAPKERNEL = zswap.c
+#CSWAPKERNEL = zswap.S
endif
+#ZYX20220301
ifndef ZSWAPKERNEL
-ZSWAPKERNEL = zswap.S
+ZSWAPKERNEL = zswap.c
+#ZSWAPKERNEL = zswap.S
endif
ifndef QSWAPKERNEL
diff --git a/kernel/sw_64/KERNEL b/kernel/sw_64/KERNEL
new file mode 100644
index 0000000..d10504b
--- /dev/null
+++ b/kernel/sw_64/KERNEL
@@ -0,0 +1,176 @@
+ifndef SAMINKERNEL
+SAMINKERNEL = amax.S
+endif
+
+ifndef DAMINKERNEL
+DAMINKERNEL = amax.S
+endif
+
+ifndef CAMINKERNEL
+CAMINKERNEL = zamax.S
+endif
+
+ifndef ZAMINKERNEL
+ZAMINKERNEL = zamax.S
+endif
+
+ifndef SMINKERNEL
+SMINKERNEL = max.S
+endif
+
+ifndef DMINKERNEL
+DMINKERNEL = max.S
+endif
+
+ifndef ISAMINKERNEL
+ISAMINKERNEL = iamax.S
+endif
+
+ifndef IDAMINKERNEL
+IDAMINKERNEL = iamax.S
+endif
+
+ifndef ICAMINKERNEL
+ICAMINKERNEL = izamax.S
+endif
+
+ifndef IZAMINKERNEL
+IZAMINKERNEL = izamax.S
+endif
+
+#ZYX20220301
+ifndef LSAME_KERNEL
+LSAME_KERNEL = ../generic/lsame.c
+endif
+
+#ZYX20220120
+ifndef ISMINKERNEL
+ISMINKERNEL = amax.S
+#ISMINKERNEL = imin.c
+endif
+
+#ZYX20220120
+#ifndef ISMAXKERNEL
+#ISMAXKERNEL = imax.c
+#endif
+
+ifndef IDMINKERNEL
+IDMINKERNEL = amax.S
+endif
+
+ifndef CCOPYKERNEL
+CCOPYKERNEL = copy.S
+endif
+
+ifndef ZCOPYKERNEL
+ZCOPYKERNEL = copy.S
+endif
+
+ifndef SNRM2KERNEL
+SNRM2KERNEL = snrm2.S
+endif
+
+ifndef DNRM2KERNEL
+DNRM2KERNEL = dnrm2.S
+endif
+
+ifndef CNRM2KERNEL
+CNRM2KERNEL = cnrm2.S
+endif
+
+ifndef ZNRM2KERNEL
+ZNRM2KERNEL = znrm2.S
+endif
+
+ifndef SGEMMKERNEL
+SGEMMKERNEL = gemm_kernel_4x4.S
+SGEMM_BETA = gemm_beta.S
+SGEMMONCOPY = ../generic/gemm_ncopy_4.c
+SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
+SGEMMONCOPYOBJ = sgemm_oncopy.$(SUFFIX)
+SGEMMOTCOPYOBJ = sgemm_otcopy.$(SUFFIX)
+endif
+
+ifndef DGEMMKERNEL
+DGEMMKERNEL = gemm_kernel_4x4.S
+DGEMM_BETA = gemm_beta.S
+DGEMMONCOPY = ../generic/gemm_ncopy_4.c
+DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
+DGEMMONCOPYOBJ = dgemm_oncopy.$(SUFFIX)
+DGEMMOTCOPYOBJ = dgemm_otcopy.$(SUFFIX)
+endif
+
+ifndef CGEMMKERNEL
+CGEMMKERNEL = zgemm_kernel_2x2.S
+CGEMM_BETA = zgemm_beta.S
+CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
+CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
+CGEMMONCOPYOBJ = cgemm_oncopy.$(SUFFIX)
+CGEMMOTCOPYOBJ = cgemm_otcopy.$(SUFFIX)
+endif
+
+ifndef ZGEMMKERNEL
+ZGEMMKERNEL = zgemm_kernel_2x2.S
+ZGEMM_BETA = zgemm_beta.S
+ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
+ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
+ZGEMMONCOPYOBJ = zgemm_oncopy.$(SUFFIX)
+ZGEMMOTCOPYOBJ = zgemm_otcopy.$(SUFFIX)
+endif
+
+SGEMM_BETA = gemm_beta.S
+DGEMM_BETA = gemm_beta.S
+CGEMM_BETA = zgemm_beta.S
+ZGEMM_BETA = zgemm_beta.S
+
+ifndef STRSMKERNEL_LN
+STRSMKERNEL_LN = trsm_kernel_4x4_LN.S
+endif
+ifndef STRSMKERNEL_LT
+STRSMKERNEL_LT = trsm_kernel_4x4_LT.S
+endif
+ifndef STRSMKERNEL_RN
+STRSMKERNEL_RN = trsm_kernel_4x4_LT.S
+endif
+ifndef STRSMKERNEL_RT
+STRSMKERNEL_RT = trsm_kernel_4x4_RT.S
+endif
+
+ifndef DTRSMKERNEL_LN
+DTRSMKERNEL_LN = trsm_kernel_4x4_LN.S
+endif
+ifndef DTRSMKERNEL_LT
+DTRSMKERNEL_LT = trsm_kernel_4x4_LT.S
+endif
+ifndef DTRSMKERNEL_RN
+DTRSMKERNEL_RN = trsm_kernel_4x4_LT.S
+endif
+ifndef DTRSMKERNEL_RT
+DTRSMKERNEL_RT = trsm_kernel_4x4_RT.S
+endif
+
+ifndef CTRSMKERNEL_LN
+CTRSMKERNEL_LN = ztrsm_kernel_2x2_LN.S
+endif
+ifndef CTRSMKERNEL_LT
+CTRSMKERNEL_LT = ztrsm_kernel_2x2_LT.S
+endif
+ifndef CTRSMKERNEL_RN
+CTRSMKERNEL_RN = ztrsm_kernel_2x2_LT.S
+endif
+ifndef CTRSMKERNEL_RT
+CTRSMKERNEL_RT = ztrsm_kernel_2x2_RT.S
+endif
+
+ifndef ZTRSMKERNEL_LN
+ZTRSMKERNEL_LN = ztrsm_kernel_2x2_LN.S
+endif
+ifndef ZTRSMKERNEL_LT
+ZTRSMKERNEL_LT = ztrsm_kernel_2x2_LT.S
+endif
+ifndef ZTRSMKERNEL_RN
+ZTRSMKERNEL_RN = ztrsm_kernel_2x2_LT.S
+endif
+ifndef ZTRSMKERNEL_RT
+ZTRSMKERNEL_RT = ztrsm_kernel_2x2_RT.S
+endif
diff --git a/kernel/sw_64/Makefile b/kernel/sw_64/Makefile
new file mode 100644
index 0000000..efae70d
--- /dev/null
+++ b/kernel/sw_64/Makefile
@@ -0,0 +1,2 @@
+clean ::
+
diff --git a/kernel/sw_64/amax.S b/kernel/sw_64/amax.S
new file mode 100644
index 0000000..300a2f7
--- /dev/null
+++ b/kernel/sw_64/amax.S
@@ -0,0 +1,283 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define N $16
+#define X $17
+#define INCX $18
+
+#ifndef USE_MIN
+#define CMPLT(a, b) fcmplt a, b
+#else
+#define CMPLT(a, b) fcmplt b, a
+#endif
+
+#define STACKSIZE 6 * 8
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, STACKSIZE, $26, 0
+
+ ldi $sp, -STACKSIZE($sp)
+ nop
+ .align 4
+
+ fstd $f2, 0($sp)
+ fclr $f16
+ cmplt $31, N, $2
+ unop
+
+ fstd $f3, 8($sp)
+ fclr $f17
+ cmplt $31, INCX, $3
+ unop
+
+ fstd $f4, 16($sp)
+ fclr $f18
+ SXADDQ INCX, $31, INCX
+ unop
+
+ fstd $f5, 24($sp)
+ fclr $f19
+ and $2, $3, $0
+ unop
+
+ fstd $f6, 32($sp)
+ fclr $f0
+ sra N, 3, $1
+ beq $0, $End # if (n <= 0) or (incx <= 0) return
+ .align 4
+
+ LD $f20, 0 * SIZE(X)
+ unop
+ fabs $f20, $f0
+ ble $1, $L15
+ .align 4
+
+ fabs $f20, $f1
+ unop
+ addl X, INCX, X
+ unop
+
+ LD $f21, 0 * SIZE(X)
+ fabs $f20, $f2
+ addl X, INCX, X
+ unop
+
+ LD $f22, 0 * SIZE(X)
+ fabs $f20, $f3
+ addl X, INCX, X
+ unop
+
+ LD $f23, 0 * SIZE(X)
+ fabs $f20, $f4
+ addl X, INCX, X
+ unop
+
+ LD $f24, 0 * SIZE(X)
+ addl X, INCX, X
+ fabs $f20, $f5
+ unop
+
+ LD $f25, 0 * SIZE(X)
+ fabs $f20, $f6
+ addl X, INCX, X
+ unop
+
+ LD $f26, 0 * SIZE(X)
+ fabs $f20, $f28
+ addl X, INCX, X
+ ldi $1, -1($1)
+
+ LD $f27, 0 * SIZE(X)
+ unop
+ addl X, INCX, X
+ ble $1, $L13
+ .align 4
+
+$L12:
+ fselne $f16, $f12, $f4, $f4
+ unop
+ fabs $f20, $f29
+ fillcs 56 * SIZE(X)
+
+ fselne $f17, $f13, $f5, $f5
+ LD $f20, 0 * SIZE(X)
+ fabs $f21, $f30
+ addl X, INCX, X
+
+ fselne $f18, $f14, $f6, $f6
+ LD $f21, 0 * SIZE(X)
+ fabs $f22, $f10
+ addl X, INCX, X
+
+ fselne $f19, $f15, $f28, $f28
+ LD $f22, 0 * SIZE(X)
+ fabs $f23, $f11
+ addl X, INCX, X
+
+ fabs $f24, $f12
+ LD $f23, 0 * SIZE(X)
+ CMPLT($f0, $f29), $f16
+ addl X, INCX, X
+
+ fabs $f25, $f13
+ LD $f24, 0 * SIZE(X)
+ CMPLT($f1, $f30), $f17
+ addl X, INCX, X
+
+ fabs $f26, $f14
+ LD $f25, 0 * SIZE(X)
+ CMPLT($f2, $f10), $f18
+ addl X, INCX, X
+
+ fabs $f27, $f15
+ LD $f26, 0 * SIZE(X)
+ CMPLT($f3, $f11), $f19
+ addl X, INCX, X
+
+ fselne $f16, $f29, $f0, $f0
+ LD $f27, 0 * SIZE(X)
+ CMPLT($f4, $f12), $f16
+ addl X, INCX, X
+
+ fselne $f17, $f30, $f1, $f1
+ unop
+ CMPLT($f5, $f13), $f17
+ ldi $1, -1($1) # i --
+
+ fselne $f18, $f10, $f2, $f2
+ unop
+ CMPLT($f6, $f14), $f18
+ unop
+
+ fselne $f19, $f11, $f3, $f3
+ unop
+ CMPLT($f28, $f15), $f19
+ bgt $1,$L12
+ .align 4
+
+$L13:
+ fselne $f16, $f12, $f4, $f4
+ fabs $f20, $f29
+ fselne $f17, $f13, $f5, $f5
+ fabs $f21, $f30
+
+ fselne $f18, $f14, $f6, $f6
+ fabs $f22, $f10
+ fselne $f19, $f15, $f28, $f28
+ fabs $f23, $f11
+
+ fabs $f24, $f12
+ CMPLT($f0, $f29), $f16
+ fabs $f25, $f13
+ CMPLT($f1, $f30), $f17
+
+ fabs $f26, $f14
+ CMPLT($f2, $f10), $f18
+ fabs $f27, $f15
+ CMPLT($f3, $f11), $f19
+
+ fselne $f16, $f29, $f0, $f0
+ CMPLT($f4, $f12), $f16
+ fselne $f17, $f30, $f1, $f1
+ CMPLT($f5, $f13), $f17
+
+ fselne $f18, $f10, $f2, $f2
+ CMPLT($f6, $f14), $f18
+ fselne $f19, $f11, $f3, $f3
+ CMPLT($f28, $f15), $f19
+
+ fselne $f16, $f12, $f4, $f4
+ CMPLT($f0, $f1), $f16
+ fselne $f17, $f13, $f5, $f5
+ CMPLT($f2, $f3), $f17
+
+ fselne $f18, $f14, $f6, $f6
+ CMPLT($f4, $f5), $f18
+ fselne $f19, $f15, $f28, $f28
+ CMPLT($f6, $f28), $f19
+
+ fselne $f16, $f1, $f0, $f0
+ fselne $f17, $f3, $f2, $f2
+ fselne $f18, $f5, $f4, $f4
+ fselne $f19, $f28, $f6, $f6
+
+ CMPLT($f0, $f2), $f16
+ CMPLT($f4, $f6), $f17
+
+ fselne $f16, $f2, $f0, $f0
+	fselne	$f17, $f6, $f4, $f4	# f4 = max(f4, f6)
+
+ CMPLT($f0, $f4), $f16
+ fselne $f16, $f4, $f0, $f0
+ .align 4
+
+$L15:
+ and N, 7, $1
+ unop
+ unop
+ ble $1, $End
+ .align 4
+
+$L16:
+ LD $f20, 0 * SIZE(X)
+ addl X, INCX, X
+
+ fabs $f20, $f29
+ CMPLT($f0, $f29), $f16
+ fselne $f16, $f29, $f0, $f0
+
+ ldi $1, -1($1) # i --
+ bgt $1, $L16
+ .align 4
+
+$End:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+
+ fldd $f6, 32($sp)
+ ldi $sp, STACKSIZE($sp)
+ ret
+
+ EPILOGUE
diff --git a/kernel/sw_64/asum.S b/kernel/sw_64/asum.S
new file mode 100644
index 0000000..54e7fcb
--- /dev/null
+++ b/kernel/sw_64/asum.S
@@ -0,0 +1,230 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define PREFETCHSIZE 88
+
+#define N $16
+#define X $17
+#define INCX $18
+#define I $19
+
+#define s0 $f0
+#define s1 $f1
+#define s2 $f10
+#define s3 $f11
+
+#define a0 $f12
+#define a1 $f13
+#define a2 $f14
+#define a3 $f15
+#define a4 $f16
+#define a5 $f17
+#define a6 $f18
+#define a7 $f19
+
+#define t0 $f20
+#define t1 $f21
+#define t2 $f22
+#define t3 $f23
+
+ PROLOGUE
+ PROFCODE
+
+ fclr s0
+ unop
+ fclr t0
+ ble N, $L999
+
+ sra N, 3, I
+ fclr s1
+ fclr s2
+ ble I, $L15
+
+ LD a0, 0 * SIZE(X)
+ fclr t1
+ SXADDQ INCX, X, X
+ fclr t2
+
+ LD a1, 0 * SIZE(X)
+ fclr t3
+ SXADDQ INCX, X, X
+ fclr s3
+
+ LD a2, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD a3, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ LD a4, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD a5, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ ldi I, -1(I)
+ ble I, $L13
+ .align 4
+
+$L12:
+ ADD s0, t0, $f24
+ fmov $f24,s0
+ ldw $31, PREFETCHSIZE * 2 * SIZE(X)
+ fabs a0, t0
+ ldi I, -1(I)
+
+ ADD s1, t1, $f24
+ fmov $f24,s1
+ LD a6, 0 * SIZE(X)
+ fabs a1, t1
+ SXADDQ INCX, X, X
+
+ ADD s2, t2,$f24
+ fmov $f24,s2
+ LD a7, 0 * SIZE(X)
+ fabs a2, t2
+ SXADDQ INCX, X, X
+
+ ADD s3, t3,$f24
+ fmov $f24,s3
+ LD a0, 0 * SIZE(X)
+ fabs a3, t3
+ SXADDQ INCX, X, X
+
+ ADD s0, t0, $f24
+ fmov $f24,s0
+ LD a1, 0 * SIZE(X)
+ fabs a4, t0
+ SXADDQ INCX, X, X
+
+ ADD s1, t1, $f24
+ fmov $f24,s1
+ LD a2, 0 * SIZE(X)
+ fabs a5, t1
+ SXADDQ INCX, X, X
+
+ ADD s2, t2,$f24
+ fmov $f24,s2
+ LD a3, 0 * SIZE(X)
+ fabs a6, t2
+ SXADDQ INCX, X, X
+
+ ADD s3, t3, $f24
+ fmov $f24,s3
+ LD a4, 0 * SIZE(X)
+ fabs a7, t3
+ SXADDQ INCX, X, X
+
+ LD a5, 0 * SIZE(X)
+ unop
+ SXADDQ INCX, X, X
+ bne I, $L12
+ .align 4
+
+$L13:
+ ADD s0, t0,$f24
+ fmov $f24,s0
+ LD a6, 0 * SIZE(X)
+ fabs a0, t0
+ SXADDQ INCX, X, X
+
+ ADD s1, t1,$f24
+ fmov $f24,s1
+ LD a7, 0 * SIZE(X)
+ fabs a1, t1
+ SXADDQ INCX, X, X
+
+ ADD s2, t2, $f24
+ fmov $f24,s2
+ fabs a2, t2
+ ADD s3, t3, $f24
+ fmov $f24,s3
+ fabs a3, t3
+
+ ADD s0, t0, $f24
+ fmov $f24,s0
+ fabs a4, t0
+ ADD s1, t1,$f24
+ fmov $f24,s1
+ fabs a5, t1
+ ADD s2, t2, $f24
+ fmov $f24,s2
+ fabs a6, t2
+ ADD s3, t3, $f24
+ fmov $f24,s3
+ fabs a7, t3
+
+ ADD s1, t1,$f24
+ fmov $f24,s1
+ ADD s2, t2, $f24
+ fmov $f24,s2
+ ADD s3, t3, $f24
+ fmov $f24,s3
+
+ ADD s0, s1, $f24
+ fmov $f24,s0
+ ADD s2, s3, $f24
+ fmov $f24,s2
+ .align 4
+
+$L15:
+ and N, 7, I
+ ADD s0, s2,$f24
+ fmov $f24,s0
+ unop
+ ble I, $L999
+ .align 4
+
+$L17:
+ ADD s0, t0, a0
+ fmov a0,s0
+ LD a0, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ fabs a0, t0
+
+ ldi I, -1(I)
+ bne I, $L17
+ .align 4
+
+$L999:
+ ADD s0, t0,$f24
+ fmov $f24,s0
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/asum.S.bak b/kernel/sw_64/asum.S.bak
new file mode 100644
index 0000000..faf7827
--- /dev/null
+++ b/kernel/sw_64/asum.S.bak
@@ -0,0 +1,206 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define PREFETCHSIZE 88
+
+#define N $16
+#define X $17
+#define INCX $18
+#define I $19
+
+#define s0 $f0
+#define s1 $f1
+#define s2 $f10
+#define s3 $f11
+
+#define a0 $f12
+#define a1 $f13
+#define a2 $f14
+#define a3 $f15
+#define a4 $f16
+#define a5 $f17
+#define a6 $f18
+#define a7 $f19
+
+#define t0 $f20
+#define t1 $f21
+#define t2 $f22
+#define t3 $f23
+
+ PROLOGUE
+ PROFCODE
+
+ fclr s0
+ unop
+ fclr t0
+ ble N, $L999
+
+ sra N, 3, I
+ fclr s1
+ fclr s2
+ ble I, $L15
+
+ LD a0, 0 * SIZE(X)
+ fclr t1
+ SXADDQ INCX, X, X
+ fclr t2
+
+ LD a1, 0 * SIZE(X)
+ fclr t3
+ SXADDQ INCX, X, X
+ fclr s3
+
+ LD a2, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD a3, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ LD a4, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD a5, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ ldi I, -1(I)
+ ble I, $L13
+ .align 4
+
+$L12:
+ ADD s0, t0, s0
+ fillcs PREFETCHSIZE * 2 * SIZE(X)
+ fabs a0, t0
+ ldi I, -1(I)
+
+ ADD s1, t1, s1
+ LD a6, 0 * SIZE(X)
+ fabs a1, t1
+ SXADDQ INCX, X, X
+
+ ADD s2, t2, s2
+ LD a7, 0 * SIZE(X)
+ fabs a2, t2
+ SXADDQ INCX, X, X
+
+ ADD s3, t3, s3
+ LD a0, 0 * SIZE(X)
+ fabs a3, t3
+ SXADDQ INCX, X, X
+
+ ADD s0, t0, s0
+ LD a1, 0 * SIZE(X)
+ fabs a4, t0
+ SXADDQ INCX, X, X
+
+ ADD s1, t1, s1
+ LD a2, 0 * SIZE(X)
+ fabs a5, t1
+ SXADDQ INCX, X, X
+
+ ADD s2, t2, s2
+ LD a3, 0 * SIZE(X)
+ fabs a6, t2
+ SXADDQ INCX, X, X
+
+ ADD s3, t3, s3
+ LD a4, 0 * SIZE(X)
+ fabs a7, t3
+ SXADDQ INCX, X, X
+
+ LD a5, 0 * SIZE(X)
+ unop
+ SXADDQ INCX, X, X
+ bne I, $L12
+ .align 4
+
+$L13:
+ ADD s0, t0, s0
+ LD a6, 0 * SIZE(X)
+ fabs a0, t0
+ SXADDQ INCX, X, X
+
+ ADD s1, t1, s1
+ LD a7, 0 * SIZE(X)
+ fabs a1, t1
+ SXADDQ INCX, X, X
+
+ ADD s2, t2, s2
+ fabs a2, t2
+ ADD s3, t3, s3
+ fabs a3, t3
+
+ ADD s0, t0, s0
+ fabs a4, t0
+ ADD s1, t1, s1
+ fabs a5, t1
+ ADD s2, t2, s2
+ fabs a6, t2
+ ADD s3, t3, s3
+ fabs a7, t3
+
+ ADD s1, t1, s1
+ ADD s2, t2, s2
+ ADD s3, t3, s3
+
+ ADD s0, s1, s0
+ ADD s2, s3, s2
+ .align 4
+
+$L15:
+ and N, 7, I
+ ADD s0, s2, s0
+ unop
+ ble I, $L999
+ .align 4
+
+$L17:
+ ADD s0, t0, s0
+ LD a0, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ fabs a0, t0
+
+ ldi I, -1(I)
+ bne I, $L17
+ .align 4
+
+$L999:
+ ADD s0, t0, s0
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/asum_simd.S b/kernel/sw_64/asum_simd.S
new file mode 100644
index 0000000..f9152ec
--- /dev/null
+++ b/kernel/sw_64/asum_simd.S
@@ -0,0 +1,342 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define PREFETCHSIZE 88
+
+#define N $16
+#define X $17
+#define INCX $18
+#define I $19
+
+#define s0 $f0
+#define s1 $f1
+#define s2 $f10
+#define s3 $f11
+
+#define a0 $f12
+#define a1 $f13
+#define a2 $f14
+#define a3 $f15
+#define a4 $f16
+#define a5 $f17
+#define a6 $f18
+#define a7 $f19
+
+#define t0 $f20
+#define t1 $f21
+#define t2 $f22
+#define t3 $f23
+
+ PROLOGUE
+ PROFCODE
+
+ fclr s0
+ unop
+ fclr t0
+ ble N, $L999
+
+ cmpeq INCX, 1, $3
+ beq $3, $Sub
+ .align 4
+
+/*
+	Unroll the main loop by 16 elements.
+*/
+
+/**
+	check whether the address of X is vector-aligned
+**/
+ and X, (VEC_LEN*SIZE-1), $4
+ nop
+ nop
+ beq $4, $Align
+
+/**
+	handle the unaligned leading elements of X
+**/
+
+/* If N is smaller than the unroll size, the unaligned prefix of X need not be handled; jump straight to the remainder section. */
+ sra N, 4, I
+ fclr s1
+ fclr s2
+ ble I, $Remain
+
+ sra $4, BASE_SHIFT, $4
+ ldi $3, VEC_LEN
+ subl $3, $4, $4
+ nop
+
+$UnAlign_X_Loop:
+ LD a0, 0 * SIZE(X)
+ addl X, SIZE, X
+ fabs a0, t0
+ subl $4, 1, $4
+
+ ADD s0, t0, s0
+ subl N, 1, N
+ nop
+ bgt $4, $UnAlign_X_Loop
+
+$Align:
+ sra N, 4, I
+ fclr s1
+ fclr s2
+ ble I, $Remain
+
+ VLD a0, 0*VEC_LEN*SIZE(X)
+ vcpys $f31, $f31, t0
+ VLD a1, 1*VEC_LEN*SIZE(X)
+ vcpys $f31, $f31, t1
+
+ VLD a2, 2*VEC_LEN*SIZE(X)
+ vcpys $f31, $f31, t2
+ VLD a3, 3*VEC_LEN*SIZE(X)
+ vcpys $f31, $f31, t3
+
+ subl I, 1, I
+ addl X, 16*SIZE, X
+ unop
+ ble I, $MainLoopEnd
+
+$MainLoop:
+
+ vcpys $f31, a0, a4
+ VLD a0, 0*VEC_LEN*SIZE(X)
+ vcpys $f31, a1, a5
+ VLD a1, 1*VEC_LEN*SIZE(X)
+
+ vcpys $f31, a2, a6
+ VLD a2, 2*VEC_LEN*SIZE(X)
+ vcpys $f31, a3, a7
+ VLD a3, 3*VEC_LEN*SIZE(X)
+
+ VADD t0, a4, t0
+ subl I, 1, I
+ VADD t1, a5, t1
+ fillcs PREFETCHSIZE * SIZE(X)
+
+ VADD t2, a6, t2
+ addl X, 16*SIZE, X
+ VADD t3, a7, t3
+ bgt I, $MainLoop
+
+$MainLoopEnd:
+ /*fabs*/
+
+ vcpys $f31, a0, a4
+ vcpys $f31, a1, a5
+ vcpys $f31, a2, a6
+ vcpys $f31, a3, a7
+
+ VADD t0, a4, t0
+ VADD t1, a5, t1
+ VADD t2, a6, t2
+ VADD t3, a7, t3
+
+ VADD t0, t1, t0
+ VADD t2, t3, t2
+ VADD t0, t2, t0
+ nop
+
+ vextf t0, 1, s1
+ vextf t0, 2, s2
+ vextf t0, 3, s3
+ nop
+
+ /*sum*/
+ ADD t0, s1, t0
+ ADD s2, s3, s2
+ ADD s0, t0, s0
+ nop
+$Remain:
+ and N, 15, I
+ ADD s0, s2, s0
+ unop
+ ble I, $End
+ .align 4
+
+$RemainLoop:
+
+ LD a0, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ fabs a0, t0
+ ldi I, -1(I)
+
+ ADD s0, t0, s0
+ bne I, $RemainLoop
+ .align 4
+
+$End:
+ ret
+
+
+$Sub:
+ sra N, 3, I
+ fclr s1
+ fclr s2
+ ble I, $L15
+
+ LD a0, 0 * SIZE(X)
+ fclr t1
+ SXADDQ INCX, X, X
+ fclr t2
+
+ LD a1, 0 * SIZE(X)
+ fclr t3
+ SXADDQ INCX, X, X
+ fclr s3
+
+ LD a2, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD a3, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ LD a4, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD a5, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ ldi I, -1(I)
+ ble I, $L13
+ .align 4
+
+$L12:
+ ADD s0, t0, s0
+ fillcs PREFETCHSIZE * 2 * SIZE(X)
+ fabs a0, t0
+ ldi I, -1(I)
+
+ ADD s1, t1, s1
+ LD a6, 0 * SIZE(X)
+ fabs a1, t1
+ SXADDQ INCX, X, X
+
+ ADD s2, t2, s2
+ LD a7, 0 * SIZE(X)
+ fabs a2, t2
+ SXADDQ INCX, X, X
+
+ ADD s3, t3, s3
+ LD a0, 0 * SIZE(X)
+ fabs a3, t3
+ SXADDQ INCX, X, X
+
+ ADD s0, t0, s0
+ LD a1, 0 * SIZE(X)
+ fabs a4, t0
+ SXADDQ INCX, X, X
+
+ ADD s1, t1, s1
+ LD a2, 0 * SIZE(X)
+ fabs a5, t1
+ SXADDQ INCX, X, X
+
+ ADD s2, t2, s2
+ LD a3, 0 * SIZE(X)
+ fabs a6, t2
+ SXADDQ INCX, X, X
+
+ ADD s3, t3, s3
+ LD a4, 0 * SIZE(X)
+ fabs a7, t3
+ SXADDQ INCX, X, X
+
+ LD a5, 0 * SIZE(X)
+ unop
+ SXADDQ INCX, X, X
+ bne I, $L12
+ .align 4
+
+$L13:
+ ADD s0, t0, s0
+ LD a6, 0 * SIZE(X)
+ fabs a0, t0
+ SXADDQ INCX, X, X
+
+ ADD s1, t1, s1
+ LD a7, 0 * SIZE(X)
+ fabs a1, t1
+ SXADDQ INCX, X, X
+
+ ADD s2, t2, s2
+ fabs a2, t2
+ ADD s3, t3, s3
+ fabs a3, t3
+
+ ADD s0, t0, s0
+ fabs a4, t0
+ ADD s1, t1, s1
+ fabs a5, t1
+ ADD s2, t2, s2
+ fabs a6, t2
+ ADD s3, t3, s3
+ fabs a7, t3
+
+ ADD s1, t1, s1
+ ADD s2, t2, s2
+ ADD s3, t3, s3
+
+ ADD s0, s1, s0
+ ADD s2, s3, s2
+ .align 4
+
+$L15:
+ and N, 7, I
+ ADD s0, s2, s0
+ unop
+ ble I, $L999
+ .align 4
+
+$L17:
+ ADD s0, t0, s0
+ LD a0, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ fabs a0, t0
+
+ ldi I, -1(I)
+ bne I, $L17
+ .align 4
+
+$L999:
+ ADD s0, t0, s0
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/axpy.S b/kernel/sw_64/axpy.S
new file mode 100644
index 0000000..70e97d6
--- /dev/null
+++ b/kernel/sw_64/axpy.S
@@ -0,0 +1,428 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define PREFETCHSIZE 40
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, 16, $26, 0
+
+ ldl $24, 0($sp)
+ fmov $f19, $f30
+ ldl $23, 8($sp)
+ ldi $sp, -16($sp)
+#ifndef PROFILE
+ .prologue 0
+#else
+ .prologue 1
+#endif
+
+ nop
+ sra $16, 3, $1
+ fstd $f2, 0($sp)
+ cmpeq $21, 1, $3
+
+ fstd $f3, 8($sp)
+ cmpeq $23, 1, $4
+ and $16, 7, $2
+ ble $16, $End
+
+ and $3, $4, $3
+ fbeq $f30, $End
+
+ beq $3, $Sub
+ ble $1, $Remain
+ .align 4
+
+ LD $f10, 0*SIZE($20)
+ LD $f11, 1*SIZE($20)
+ LD $f12, 2*SIZE($20)
+ LD $f13, 3*SIZE($20)
+
+ LD $f18, 0*SIZE($24)
+ LD $f19, 1*SIZE($24)
+ LD $f20, 2*SIZE($24)
+ LD $f21, 3*SIZE($24)
+
+ LD $f14, 4*SIZE($20)
+ LD $f15, 5*SIZE($20)
+ LD $f16, 6*SIZE($20)
+ LD $f17, 7*SIZE($20)
+
+ LD $f22, 4*SIZE($24)
+ LD $f23, 5*SIZE($24)
+ LD $f24, 6*SIZE($24)
+ LD $f25, 7*SIZE($24)
+
+ subl $1, 1, $1
+ addl $20, 8*SIZE, $20
+ unop
+ ble $1, $LoopEnd
+ .align 4
+
+$Loop:
+ fillcs PREFETCHSIZE * SIZE($24)
+ fillcs PREFETCHSIZE * SIZE($20)
+
+ MUL $f30, $f10, $f26 # ctemp1 = da * atemp1
+ LD $f10, 0*SIZE($20)
+ MUL $f30, $f11, $f27
+ LD $f11, 1*SIZE($20)
+
+ MUL $f30, $f12, $f28
+ LD $f12, 2*SIZE($20)
+ MUL $f30, $f13, $f29
+ LD $f13, 3*SIZE($20)
+
+ ADD $f18, $f26, $f0
+ LD $f18, 8*SIZE($24)
+ MUL $f30, $f14, $f26 # ctemp1 = da * atemp1
+ LD $f14, 4*SIZE($20)
+
+ ADD $f19, $f27, $f1
+ LD $f19, 9*SIZE($24)
+ MUL $f30, $f15, $f27
+ LD $f15, 5*SIZE($20)
+
+ ADD $f20, $f28, $f2
+ LD $f20, 10*SIZE($24)
+ MUL $f30, $f16, $f28
+ LD $f16, 6*SIZE($20)
+
+ ADD $f21, $f29, $f3
+ LD $f21, 11*SIZE($24)
+ MUL $f30, $f17, $f29
+ LD $f17, 7*SIZE($20)
+
+ ST $f0, 0*SIZE($24)
+ ADD $f22, $f26, $f0
+ ST $f1, 1*SIZE($24)
+ ADD $f23, $f27, $f1
+
+ ST $f2, 2*SIZE($24)
+ ADD $f24, $f28, $f2
+ ST $f3, 3*SIZE($24)
+ ADD $f25, $f29, $f3
+
+ LD $f22, 12*SIZE($24)
+ LD $f23, 13*SIZE($24)
+ LD $f24, 14*SIZE($24)
+ LD $f25, 15*SIZE($24)
+
+ ST $f0, 4*SIZE($24)
+ ST $f1, 5*SIZE($24)
+ ST $f2, 6*SIZE($24)
+ ST $f3, 7*SIZE($24)
+
+ subl $1, 1, $1
+ addl $24, 8*SIZE, $24
+ addl $20, 8*SIZE, $20
+ bgt $1, $Loop
+ .align 4
+
+$LoopEnd:
+ MUL $f30, $f10, $f26 # ctemp1 = da * atemp1
+ MUL $f30, $f11, $f27
+ MUL $f30, $f12, $f28
+ MUL $f30, $f13, $f29
+
+ ADD $f18, $f26, $f0
+ MUL $f30, $f14, $f26 # ctemp1 = da * atemp1
+ ADD $f19, $f27, $f1
+ MUL $f30, $f15, $f27
+
+ ADD $f20, $f28, $f2
+ MUL $f30, $f16, $f28
+ ADD $f21, $f29, $f3
+ MUL $f30, $f17, $f29
+
+ ST $f0, 0*SIZE($24)
+ ADD $f22, $f26, $f0
+ ST $f1, 1*SIZE($24)
+ ADD $f23, $f27, $f1
+
+ ST $f2, 2*SIZE($24)
+ ADD $f24, $f28, $f2
+ ST $f3, 3*SIZE($24)
+ ADD $f25, $f29, $f3
+
+ ST $f0, 4*SIZE($24)
+ ST $f1, 5*SIZE($24)
+ ST $f2, 6*SIZE($24)
+ ST $f3, 7*SIZE($24)
+ addl $24, 8*SIZE, $24
+ .align 4
+
+$Remain:
+ ble $2, $End
+ .align 4
+
+$RemainLoop:
+ LD $f10, 0*SIZE($20)
+ LD $f11, 0*SIZE($24)
+ addl $20, SIZE, $20
+ addl $24, SIZE, $24
+
+ MUL $f30, $f10, $f12
+ subl $2, 1, $2
+ ADD $f11, $f12, $f13
+ ST $f13, -1*SIZE($24)
+ bgt $2, $RemainLoop
+ .align 4
+
+$End:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ ldi $sp, 16($sp)
+ ret
+ .align 4
+
+$Sub:
+ SXSUBL $16, SIZE, $22
+ subl $1, 1, $4
+ ble $1, $SubRemain
+ .align 4
+
+ LD $f10, 0($20)
+ SXADDQ $21, $20, $20
+
+ LD $f11, 0($20)
+ SXADDQ $21, $20, $20
+ LD $f12, 0($20)
+ SXADDQ $21, $20, $20
+
+ LD $f13, 0($20)
+ SXADDQ $21, $20, $20
+ LD $f18, 0($24)
+ SXADDQ $23, $24, $22
+
+ LD $f19, 0($22)
+ SXADDQ $23, $22, $22
+ LD $f20, 0($22)
+ SXADDQ $23, $22, $22
+
+ LD $f21, 0($22)
+ SXADDQ $23, $22, $22
+ LD $f14, 0($20)
+ SXADDQ $21, $20, $20
+
+ LD $f15, 0($20)
+ SXADDQ $21, $20, $20
+ LD $f16, 0($20)
+ SXADDQ $21, $20, $20
+
+ LD $f17, 0($20)
+ SXADDQ $21, $20, $20
+ LD $f22, 0($22)
+ SXADDQ $23, $22, $22
+
+ LD $f23, 0($22)
+ SXADDQ $23, $22, $22
+ LD $f24, 0($22)
+ SXADDQ $23, $22, $22
+
+ LD $f25, 0($22)
+ SXADDQ $23, $22, $22
+ unop
+ ble $4, $SubLoopEnd
+ .align 4
+
+$SubLoop:
+ MUL $f30, $f10, $f26 # ctemp1 = da * atemp1
+ LD $f10, 0($20)
+ unop
+ SXADDQ $21, $20, $20
+
+ MUL $f30, $f11, $f27
+ LD $f11, 0($20)
+ unop
+ SXADDQ $21, $20, $20
+
+ MUL $f30, $f12, $f28
+ LD $f12, 0($20)
+ unop
+ SXADDQ $21, $20, $20
+
+ MUL $f30, $f13, $f29
+ LD $f13, 0($20)
+ unop
+ SXADDQ $21, $20, $20
+
+ ADD $f18, $f26, $f0
+ MUL $f30, $f14, $f26 # ctemp1 = da * atemp1
+ LD $f14, 0($20)
+ SXADDQ $21, $20, $20
+
+ ADD $f19, $f27, $f1
+ MUL $f30, $f15, $f27
+ LD $f15, 0($20)
+ SXADDQ $21, $20, $20
+
+ ADD $f20, $f28, $f2
+ MUL $f30, $f16, $f28
+ LD $f16, 0($20)
+ SXADDQ $21, $20, $20
+
+ ADD $f21, $f29, $f3
+ MUL $f30, $f17, $f29
+ LD $f17, 0($20)
+ SXADDQ $21, $20, $20
+
+ ST $f0, 0($24)
+ SXADDQ $23, $24, $24
+ ADD $f22, $f26, $f0
+ unop
+
+ ST $f1, 0($24)
+ SXADDQ $23, $24, $24
+ ADD $f23, $f27, $f1
+ unop
+
+ ST $f2, 0($24)
+ SXADDQ $23, $24, $24
+ ADD $f24, $f28, $f2
+ unop
+
+ ST $f3, 0($24)
+ SXADDQ $23, $24, $24
+ ADD $f25, $f29, $f3
+ unop
+
+ LD $f18, 0($22)
+ SXADDQ $23, $22, $22
+ LD $f19, 0($22)
+ SXADDQ $23, $22, $22
+
+ LD $f20, 0($22)
+ SXADDQ $23, $22, $22
+ LD $f21, 0($22)
+ SXADDQ $23, $22, $22
+
+ LD $f22, 0($22)
+ SXADDQ $23, $22, $22
+ LD $f23, 0($22)
+ SXADDQ $23, $22, $22
+
+ LD $f24, 0($22)
+ SXADDQ $23, $22, $22
+ LD $f25, 0($22)
+ SXADDQ $23, $22, $22
+
+ ST $f0, 0($24)
+ SXADDQ $23, $24, $24
+ ST $f1, 0($24)
+ SXADDQ $23, $24, $24
+ ST $f2, 0($24)
+ SXADDQ $23, $24, $24
+ ST $f3, 0($24)
+ SXADDQ $23, $24, $24
+
+ subl $4, 1, $4
+ bgt $4, $SubLoop
+ .align 4
+
+$SubLoopEnd:
+ MUL $f30, $f10, $f26 # ctemp1 = da * atemp1
+ MUL $f30, $f11, $f27
+ MUL $f30, $f12, $f28
+ MUL $f30, $f13, $f29
+
+ ADD $f18, $f26, $f0
+ MUL $f30, $f14, $f26 # ctemp1 = da * atemp1
+ ADD $f19, $f27, $f1
+ MUL $f30, $f15, $f27
+
+ ADD $f20, $f28, $f2
+ MUL $f30, $f16, $f28
+ ADD $f21, $f29, $f3
+ MUL $f30, $f17, $f29
+
+ ST $f0, 0($24)
+ SXADDQ $23, $24, $24
+ ST $f1, 0($24)
+ SXADDQ $23, $24, $24
+
+ ST $f2, 0($24)
+ SXADDQ $23, $24, $24
+ ST $f3, 0($24)
+ SXADDQ $23, $24, $24
+
+ ADD $f22, $f26, $f0
+ ADD $f23, $f27, $f1
+ ADD $f24, $f28, $f2
+ ADD $f25, $f29, $f3
+
+ ST $f0, 0($24)
+ SXADDQ $23, $24, $24
+ ST $f1, 0($24)
+ SXADDQ $23, $24, $24
+
+ ST $f2, 0($24)
+ SXADDQ $23, $24, $24
+ ST $f3, 0($24)
+ SXADDQ $23, $24, $24
+ .align 4
+
+$SubRemain:
+ ble $2, $SubEnd
+ .align 4
+
+$SubRemainLoop:
+ LD $f10, 0($20)
+ LD $f11, 0($24)
+ SXADDQ $21, $20, $20
+
+ MUL $f30, $f10, $f12
+ subl $2, 1, $2
+ ADD $f11, $f12, $f13
+ ST $f13, 0($24)
+ SXADDQ $23, $24, $24
+
+ bgt $2, $SubRemainLoop
+ .align 4
+
+$SubEnd:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ ldi $sp, 16($sp)
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/axpy_simd.S b/kernel/sw_64/axpy_simd.S
new file mode 100644
index 0000000..3a2219c
--- /dev/null
+++ b/kernel/sw_64/axpy_simd.S
@@ -0,0 +1,655 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+
+#define PREFETCHSIZE 80
+// #define PREFETCH_DISTANCE_BYTES 384
+
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, 16, $26, 0
+
+ ldl $24, 0($sp)
+ fmov $f19, $f30
+ ldl $23, 8($sp)
+ ldi $sp, -16($sp)
+#ifndef PROFILE
+ .prologue 0
+#else
+ .prologue 1
+#endif
+
+ fstd $f2, 0($sp)
+ cmpeq $21, 1, $3
+ fstd $f3, 8($sp)
+ cmpeq $23, 1, $4
+
+ ble $16, $End
+ fbeq $f30, $End
+ and $3, $4, $3
+ beq $3, $Sub
+
+/**
+	check whether the address of Y is vector-aligned
+**/
+ and $24, (VEC_LEN*SIZE-1), $4
+ nop
+ nop
+ beq $4, $Align_Y_Access
+ .align 4
+/**
+	handle the unaligned leading elements of Y
+**/
+
+ sra $16, 4, $1
+ and $16, 15, $2
+ sra $4, BASE_SHIFT, $4
+	ble $1, $Remain /* if N is smaller than the unroll size, skip the unaligned-Y handling and jump straight to the remainder section */
+
+ ldi $3, VEC_LEN
+ subl $3, $4, $4
+
+$UnAlign_Y_Loop:
+ LD $f10, 0*SIZE($20)
+ LD $f11, 0*SIZE($24)
+ addl $20, SIZE, $20
+ addl $24, SIZE, $24
+
+ MAD $f30, $f10, $f11, $f13
+ subl $4, 1, $4
+ subl $16, 1, $16
+ ST $f13, -1*SIZE($24)
+ bgt $4, $UnAlign_Y_Loop
+ .align 4
+
+
+$Align_Y_Access:
+
+ nop
+ sra $16, 4, $1
+ and $16, 15, $2
+ ble $1, $Remain
+
+/**
+ test the address of X
+**/
+
+ and $20, (VEC_LEN*SIZE-1), $3
+ nop
+ nop
+ bne $3, $UnAlign_X_Access
+
+ .align 4
+$Align_Access:
+/***
+	broadcast alpha from $f30 into all four elements of vector $f13
+	unroll the main loop by 16 elements
+***/
+ vcpyf $f30, $f13
+
+ VLD $f10, 0*VEC_LEN*SIZE($20)
+/*
+ LD $f10, 0*SIZE($20)
+ LD $f11, 1*SIZE($20)
+ LD $f12, 2*SIZE($20)
+ LD $f13, 3*SIZE($20)
+*/
+ VLD $f18, 0*VEC_LEN*SIZE($24)
+/*
+ LD $f18, 0*SIZE($24)
+ LD $f19, 1*SIZE($24)
+ LD $f20, 2*SIZE($24)
+ LD $f21, 3*SIZE($24)
+*/
+ VLD $f14, 1*VEC_LEN*SIZE($20)
+ VLD $f15, 2*VEC_LEN*SIZE($20)
+ VLD $f16, 3*VEC_LEN*SIZE($20)
+/*
+ LD $f14, 4*SIZE($20)
+ LD $f15, 5*SIZE($20)
+ LD $f16, 6*SIZE($20)
+ LD $f17, 7*SIZE($20)
+*/
+ VLD $f22, 1*VEC_LEN*SIZE($24)
+ VLD $f23, 2*VEC_LEN*SIZE($24)
+ VLD $f24, 3*VEC_LEN*SIZE($24)
+/*
+ LD $f22, 4*SIZE($24)
+ LD $f23, 5*SIZE($24)
+ LD $f24, 6*SIZE($24)
+ LD $f25, 7*SIZE($24)
+*/
+
+ subl $1, 1, $1
+ addl $20, 16*SIZE, $20
+ unop
+ ble $1, $LoopEnd
+ .align 4
+
+$Loop:
+
+ fillcs PREFETCHSIZE * SIZE($24)
+ fillcs PREFETCHSIZE * SIZE($20)
+/*
+ fillcs PREFETCH_DISTANCE_BYTES($24)
+ fillcs PREFETCH_DISTANCE_BYTES($20)
+*/
+
+ VMAD $f13, $f10, $f18, $f0
+ VLD $f10, 0*VEC_LEN*SIZE($20)
+ VLD $f18, 4*VEC_LEN*SIZE($24)
+/*
+ MAD $f30, $f10, $f18, $f0 # y += alpha * x
+ LD $f10, 0*SIZE($20)
+ MAD $f30, $f11, $f19, $f1
+ LD $f11, 1*SIZE($20)
+
+ MAD $f30, $f12, $f20, $f2
+ LD $f12, 2*SIZE($20)
+ MAD $f30, $f13, $f21, $f3
+ LD $f13, 3*SIZE($20)
+*/
+
+ VMAD $f13, $f14, $f22, $f26
+ VLD $f14, 1*VEC_LEN*SIZE($20)
+ VLD $f22, 5*VEC_LEN*SIZE($24)
+
+ VMAD $f13, $f15, $f23, $f27
+ VLD $f15, 2*VEC_LEN*SIZE($20)
+ VLD $f23, 6*VEC_LEN*SIZE($24)
+
+ VMAD $f13, $f16, $f24, $f28
+ VLD $f16, 3*VEC_LEN*SIZE($20)
+ VLD $f24, 7*VEC_LEN*SIZE($24)
+/*
+ MAD $f30, $f14, $f22, $f26 # y += alpha * x
+ LD $f14, 4*SIZE($20)
+ MAD $f30, $f15, $f23, $f27
+ LD $f15, 5*SIZE($20)
+
+ MAD $f30, $f16, $f24, $f28
+ LD $f16, 6*SIZE($20)
+ MAD $f30, $f17, $f25, $f29
+ LD $f17, 7*SIZE($20)
+*/
+
+/*
+ LD $f18, 8*SIZE($24)
+ LD $f19, 9*SIZE($24)
+ LD $f20, 10*SIZE($24)
+ LD $f21, 11*SIZE($24)
+
+ LD $f22, 12*SIZE($24)
+ LD $f23, 13*SIZE($24)
+ LD $f24, 14*SIZE($24)
+ LD $f25, 15*SIZE($24)
+*/
+
+
+
+ VST $f0, 0*VEC_LEN*SIZE($24)
+ VST $f26, 1*VEC_LEN*SIZE($24)
+ VST $f27, 2*VEC_LEN*SIZE($24)
+ VST $f28, 3*VEC_LEN*SIZE($24)
+/*
+ ST $f0, 0*SIZE($24)
+ ST $f1, 1*SIZE($24)
+ ST $f2, 2*SIZE($24)
+ ST $f3, 3*SIZE($24)
+
+ ST $f26, 4*SIZE($24)
+ ST $f27, 5*SIZE($24)
+ ST $f28, 6*SIZE($24)
+ ST $f29, 7*SIZE($24)
+*/
+ subl $1, 1, $1
+ addl $24, 16*SIZE, $24
+ addl $20, 16*SIZE, $20
+ bgt $1, $Loop
+ .align 4
+
+$LoopEnd:
+ VMAD $f13, $f10, $f18, $f0
+ VST $f0, 0*VEC_LEN*SIZE($24)
+ VMAD $f13, $f14, $f22, $f26
+ VST $f26, 1*VEC_LEN*SIZE($24)
+ VMAD $f13, $f15, $f23, $f27
+ VST $f27, 2*VEC_LEN*SIZE($24)
+ VMAD $f13, $f16, $f24, $f28
+ VST $f28, 3*VEC_LEN*SIZE($24)
+
+/*
+ MUL $f30, $f10, $f26 # ctemp1 = da * atemp1
+ MUL $f30, $f11, $f27
+ MUL $f30, $f12, $f28
+ MUL $f30, $f13, $f29
+
+ ADD $f18, $f26, $f0
+ MUL $f30, $f14, $f26 # ctemp1 = da * atemp1
+ ADD $f19, $f27, $f1
+ MUL $f30, $f15, $f27
+
+ ADD $f20, $f28, $f2
+ MUL $f30, $f16, $f28
+ ADD $f21, $f29, $f3
+ MUL $f30, $f17, $f29
+
+ ST $f0, 0*SIZE($24)
+ ADD $f22, $f26, $f0
+ ST $f1, 1*SIZE($24)
+ ADD $f23, $f27, $f1
+
+ ST $f2, 2*SIZE($24)
+ ADD $f24, $f28, $f2
+ ST $f3, 3*SIZE($24)
+ ADD $f25, $f29, $f3
+
+ ST $f0, 4*SIZE($24)
+ ST $f1, 5*SIZE($24)
+ ST $f2, 6*SIZE($24)
+ ST $f3, 7*SIZE($24)
+*/
+ addl $24, 16*SIZE, $24
+
+ .align 4
+
+$Remain:
+ ble $2, $End
+
+ .align 4
+
+$RemainLoop:
+ LD $f10, 0*SIZE($20)
+ LD $f11, 0*SIZE($24)
+ addl $20, SIZE, $20
+ addl $24, SIZE, $24
+
+ MAD $f30, $f10, $f11, $f13
+ subl $2, 1, $2
+ ST $f13, -1*SIZE($24)
+ bgt $2, $RemainLoop
+ .align 4
+
+$End:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ ldi $sp, 16($sp)
+ ret
+ .align 4
+
+$UnAlign_X_Access:
+/***
+	broadcast alpha from $f30 into all four elements of vector $f13
+	unroll the main loop by 16 elements
+	unaligned accesses to X
+	aligned accesses to Y
+***/
+ vcpyf $f30, $f13
+ VLD_UL $f10, 0*VEC_LEN*SIZE($20)
+ VLD_UH $f2, 1*VEC_LEN*SIZE($20)
+
+ VLD_UL $f14, 1*VEC_LEN*SIZE($20)
+ VLD_UH $f3, 2*VEC_LEN*SIZE($20)
+
+ VLD_UL $f15, 2*VEC_LEN*SIZE($20)
+ VLD_UH $f11, 3*VEC_LEN*SIZE($20)
+
+ VLD_UL $f16, 3*VEC_LEN*SIZE($20)
+ VLD_UH $f12, 4*VEC_LEN*SIZE($20)
+
+ VLD $f18, 0*VEC_LEN*SIZE($24)
+ VLD $f22, 1*VEC_LEN*SIZE($24)
+ VLD $f23, 2*VEC_LEN*SIZE($24)
+ VLD $f24, 3*VEC_LEN*SIZE($24)
+
+ vbisw $f10, $f2, $f10
+ vbisw $f14, $f3, $f14
+ vbisw $f15, $f11, $f15
+ vbisw $f16, $f12, $f16
+
+
+ subl $1, 1, $1
+ addl $20, 16*SIZE, $20
+ unop
+ ble $1, $UnAlign_X_LoopEnd
+ .align 4
+
+$UnAlign_X_Loop:
+
+ fillcs PREFETCHSIZE * SIZE($24)
+ fillcs PREFETCHSIZE * SIZE($20)
+
+ VMAD $f13, $f10, $f18, $f0
+ VLD_UL $f10, 0*VEC_LEN*SIZE($20)
+ VLD_UH $f2, 1*VEC_LEN*SIZE($20)
+
+
+ VMAD $f13, $f14, $f22, $f26
+ VLD_UL $f14, 1*VEC_LEN*SIZE($20)
+ VLD_UH $f3, 2*VEC_LEN*SIZE($20)
+
+ VMAD $f13, $f15, $f23, $f27
+ VLD_UL $f15, 2*VEC_LEN*SIZE($20)
+ VLD_UH $f11, 3*VEC_LEN*SIZE($20)
+
+ VMAD $f13, $f16, $f24, $f28
+ VLD_UL $f16, 3*VEC_LEN*SIZE($20)
+ VLD_UH $f12, 4*VEC_LEN*SIZE($20)
+
+
+
+
+ VLD $f18, 4*VEC_LEN*SIZE($24)
+ vbisw $f10, $f2, $f10
+ VLD $f22, 5*VEC_LEN*SIZE($24)
+ vbisw $f14, $f3, $f14
+ VLD $f23, 6*VEC_LEN*SIZE($24)
+ vbisw $f15, $f11, $f15
+ VLD $f24, 7*VEC_LEN*SIZE($24)
+ vbisw $f16, $f12, $f16
+
+
+ VST $f0, 0*VEC_LEN*SIZE($24)
+ VST $f26, 1*VEC_LEN*SIZE($24)
+ VST $f27, 2*VEC_LEN*SIZE($24)
+ VST $f28, 3*VEC_LEN*SIZE($24)
+
+
+ subl $1, 1, $1
+ addl $24, 16*SIZE, $24
+ addl $20, 16*SIZE, $20
+ bgt $1, $UnAlign_X_Loop
+ .align 4
+
+$UnAlign_X_LoopEnd:
+ VMAD $f13, $f10, $f18, $f0
+ VST $f0, 0*VEC_LEN*SIZE($24)
+ VMAD $f13, $f14, $f22, $f26
+ VST $f26, 1*VEC_LEN*SIZE($24)
+ VMAD $f13, $f15, $f23, $f27
+ VST $f27, 2*VEC_LEN*SIZE($24)
+ VMAD $f13, $f16, $f24, $f28
+ VST $f28, 3*VEC_LEN*SIZE($24)
+
+ addl $24, 16*SIZE, $24
+
+ .align 4
+
+$UnAlign_X_Remain:
+ ble $2, $UnAlign_X_End
+
+ .align 4
+
+$UnAlign_X_RemainLoop:
+ LD $f10, 0*SIZE($20)
+ LD $f11, 0*SIZE($24)
+ addl $20, SIZE, $20
+ addl $24, SIZE, $24
+
+ MAD $f30, $f10, $f11, $f13
+ subl $2, 1, $2
+ ST $f13, -1*SIZE($24)
+ bgt $2, $UnAlign_X_RemainLoop
+ .align 4
+
+$UnAlign_X_End:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ ldi $sp, 16($sp)
+ ret
+ .align 4
+
+
+$Sub:
+ sra $16, 3, $1
+ and $16, 7, $2
+ SXSUBL $16, SIZE, $22
+ subl $1, 1, $4
+
+ ble $1, $SubRemain
+ .align 4
+
+ LD $f10, 0($20)
+ SXADDQ $21, $20, $20
+
+ LD $f11, 0($20)
+ SXADDQ $21, $20, $20
+ LD $f12, 0($20)
+ SXADDQ $21, $20, $20
+
+ LD $f13, 0($20)
+ SXADDQ $21, $20, $20
+ LD $f18, 0($24)
+ SXADDQ $23, $24, $22
+
+ LD $f19, 0($22)
+ SXADDQ $23, $22, $22
+ LD $f20, 0($22)
+ SXADDQ $23, $22, $22
+
+ LD $f21, 0($22)
+ SXADDQ $23, $22, $22
+ LD $f14, 0($20)
+ SXADDQ $21, $20, $20
+
+ LD $f15, 0($20)
+ SXADDQ $21, $20, $20
+ LD $f16, 0($20)
+ SXADDQ $21, $20, $20
+
+ LD $f17, 0($20)
+ SXADDQ $21, $20, $20
+ LD $f22, 0($22)
+ SXADDQ $23, $22, $22
+
+ LD $f23, 0($22)
+ SXADDQ $23, $22, $22
+ LD $f24, 0($22)
+ SXADDQ $23, $22, $22
+
+ LD $f25, 0($22)
+ SXADDQ $23, $22, $22
+ unop
+ ble $4, $SubLoopEnd
+ .align 4
+
+$SubLoop:
+ MUL $f30, $f10, $f26 # ctemp1 = da * atemp1
+ LD $f10, 0($20)
+ unop
+ SXADDQ $21, $20, $20
+
+ MUL $f30, $f11, $f27
+ LD $f11, 0($20)
+ unop
+ SXADDQ $21, $20, $20
+
+ MUL $f30, $f12, $f28
+ LD $f12, 0($20)
+ unop
+ SXADDQ $21, $20, $20
+
+ MUL $f30, $f13, $f29
+ LD $f13, 0($20)
+ unop
+ SXADDQ $21, $20, $20
+
+ ADD $f18, $f26, $f0
+ MUL $f30, $f14, $f26 # ctemp1 = da * atemp1
+ LD $f14, 0($20)
+ SXADDQ $21, $20, $20
+
+ ADD $f19, $f27, $f1
+ MUL $f30, $f15, $f27
+ LD $f15, 0($20)
+ SXADDQ $21, $20, $20
+
+ ADD $f20, $f28, $f2
+ MUL $f30, $f16, $f28
+ LD $f16, 0($20)
+ SXADDQ $21, $20, $20
+
+ ADD $f21, $f29, $f3
+ MUL $f30, $f17, $f29
+ LD $f17, 0($20)
+ SXADDQ $21, $20, $20
+
+ ST $f0, 0($24)
+ SXADDQ $23, $24, $24
+ ADD $f22, $f26, $f0
+ unop
+
+ ST $f1, 0($24)
+ SXADDQ $23, $24, $24
+ ADD $f23, $f27, $f1
+ unop
+
+ ST $f2, 0($24)
+ SXADDQ $23, $24, $24
+ ADD $f24, $f28, $f2
+ unop
+
+ ST $f3, 0($24)
+ SXADDQ $23, $24, $24
+ ADD $f25, $f29, $f3
+ unop
+
+ LD $f18, 0($22)
+ SXADDQ $23, $22, $22
+ LD $f19, 0($22)
+ SXADDQ $23, $22, $22
+
+ LD $f20, 0($22)
+ SXADDQ $23, $22, $22
+ LD $f21, 0($22)
+ SXADDQ $23, $22, $22
+
+ LD $f22, 0($22)
+ SXADDQ $23, $22, $22
+ LD $f23, 0($22)
+ SXADDQ $23, $22, $22
+
+ LD $f24, 0($22)
+ SXADDQ $23, $22, $22
+ LD $f25, 0($22)
+ SXADDQ $23, $22, $22
+
+ ST $f0, 0($24)
+ SXADDQ $23, $24, $24
+ ST $f1, 0($24)
+ SXADDQ $23, $24, $24
+ ST $f2, 0($24)
+ SXADDQ $23, $24, $24
+ ST $f3, 0($24)
+ SXADDQ $23, $24, $24
+
+ subl $4, 1, $4
+ bgt $4, $SubLoop
+ .align 4
+
+$SubLoopEnd:
+ MUL $f30, $f10, $f26 # ctemp1 = da * atemp1
+ MUL $f30, $f11, $f27
+ MUL $f30, $f12, $f28
+ MUL $f30, $f13, $f29
+
+ ADD $f18, $f26, $f0
+ MUL $f30, $f14, $f26 # ctemp1 = da * atemp1
+ ADD $f19, $f27, $f1
+ MUL $f30, $f15, $f27
+
+ ADD $f20, $f28, $f2
+ MUL $f30, $f16, $f28
+ ADD $f21, $f29, $f3
+ MUL $f30, $f17, $f29
+
+ ST $f0, 0($24)
+ SXADDQ $23, $24, $24
+ ST $f1, 0($24)
+ SXADDQ $23, $24, $24
+
+ ST $f2, 0($24)
+ SXADDQ $23, $24, $24
+ ST $f3, 0($24)
+ SXADDQ $23, $24, $24
+
+ ADD $f22, $f26, $f0
+ ADD $f23, $f27, $f1
+ ADD $f24, $f28, $f2
+ ADD $f25, $f29, $f3
+
+ ST $f0, 0($24)
+ SXADDQ $23, $24, $24
+ ST $f1, 0($24)
+ SXADDQ $23, $24, $24
+
+ ST $f2, 0($24)
+ SXADDQ $23, $24, $24
+ ST $f3, 0($24)
+ SXADDQ $23, $24, $24
+ .align 4
+
+$SubRemain:
+ ble $2, $SubEnd
+ .align 4
+
+$SubRemainLoop:
+ LD $f10, 0($20)
+ LD $f11, 0($24)
+ SXADDQ $21, $20, $20
+
+ MUL $f30, $f10, $f12
+ subl $2, 1, $2
+ ADD $f11, $f12, $f13
+ ST $f13, 0($24)
+ SXADDQ $23, $24, $24
+
+ bgt $2, $SubRemainLoop
+ .align 4
+
+$SubEnd:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ ldi $sp, 16($sp)
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/cabs.S b/kernel/sw_64/cabs.S
new file mode 100644
index 0000000..3f9ed2c
--- /dev/null
+++ b/kernel/sw_64/cabs.S
@@ -0,0 +1,72 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+ .set noat
+ .set noreorder
+.text
+ .align 5
+ .globl NAME
+ .ent NAME
+NAME:
+ .frame $sp, 0, $26, 0
+
+#ifdef PROFILE
+ ldgp $gp, 0($27)
+ ldi $28, _mcount
+ jsr $28, ($28), _mcount
+#endif
+
+ LD $f10, 0($16)
+ LD $f11, SIZE($16)
+#ifndef PROFILE
+ .prologue 0
+#else
+ .prologue 1
+#endif
+
+ fabs $f10, $f12
+ fabs $f11, $f0
+ ADD $f12, $f0, $f29
+ fmov $f29, $f0
+ ret
+ .end NAME
+ .ident VERSION
diff --git a/kernel/sw_64/cabs.S.bak b/kernel/sw_64/cabs.S.bak
new file mode 100644
index 0000000..5fa27af
--- /dev/null
+++ b/kernel/sw_64/cabs.S.bak
@@ -0,0 +1,71 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+ .set noat
+ .set noreorder
+.text
+ .align 5
+ .globl NAME
+ .ent NAME
+NAME:
+ .frame $sp, 0, $26, 0
+
+#ifdef PROFILE
+ ldgp $gp, 0($27)
+ lda $28, _mcount
+ jsr $28, ($28), _mcount
+#endif
+
+ LD $f10, 0($16)
+ LD $f11, SIZE($16)
+#ifndef PROFILE
+ .prologue 0
+#else
+ .prologue 1
+#endif
+
+ fabs $f10, $f12
+ fabs $f11, $f0
+ ADD $f12, $f0, $f0
+ ret
+ .end NAME
+ .ident VERSION
diff --git a/kernel/sw_64/cnrm2.S b/kernel/sw_64/cnrm2.S
new file mode 100644
index 0000000..25eab03
--- /dev/null
+++ b/kernel/sw_64/cnrm2.S
@@ -0,0 +1,440 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+#include "version.h"
+
+#define PREFETCH_SIZE 80
+
+#define N $16
+#define X $17
+#define INCX $18
+#define XX $19
+
+#define I $0
+
+#define a0 $f0
+#define a1 $f1
+#define a2 $f10
+#define a3 $f11
+#define t0 $f12
+#define t1 $f13
+#define t2 $f14
+#define t3 $f15
+
+#define x0 $f16
+#define x1 $f17
+#define x2 $f18
+#define x3 $f19
+#define x4 $f20
+#define x5 $f21
+#define x6 $f22
+#define x7 $f23
+
+ PROLOGUE
+
+#if defined(EV4) || defined(EV5)
+ .frame $30,16,$26,0
+ .mask 0x4000000,-16
+ ldih $29, 0($27) !gpdisp!1
+ ldi $29, 0($29) !gpdisp!1
+
+ ldi $sp, -16($sp)
+ ldl $27, sqrt($29) !literal!2
+ stl $26, 0($sp)
+
+ PROFCODE
+ .prologue 1
+#else
+ PROFCODE
+#endif
+
+ fclr a0
+ sll INCX, ZBASE_SHIFT, INCX
+ fclr a1
+ ble N, $L999
+
+ fclr a2
+ cmpeq INCX, 2 * SIZE, $0
+ fclr a3
+ beq $0, $L20
+
+ fclr t0
+ sra N, 3, I
+ fclr t1
+ ble I, $L15
+
+ fclr t2
+ LD x0, 0 * SIZE(X)
+ fclr t3
+ LD x1, 1 * SIZE(X)
+
+ LD x2, 2 * SIZE(X)
+ LD x3, 3 * SIZE(X)
+ LD x4, 4 * SIZE(X)
+ LD x5, 5 * SIZE(X)
+ LD x6, 6 * SIZE(X)
+ LD x7, 7 * SIZE(X)
+
+ ldi I, -1(I)
+ ble I, $L12
+ .align 4
+
+$L11:
+ faddd a0, t0, $f25
+ fillcs (PREFETCH_SIZE) * SIZE(X)
+ fmuld x0, x0, t0
+ LD x0, 8 * SIZE(X)
+
+ faddd a1, t1, $f26
+ mov X, XX
+ fmuld x1, x1, t1
+ LD x1, 9 * SIZE(X)
+
+ faddd a2, t2, $f27
+ unop
+ fmuld x2, x2, t2
+ LD x2, 10 * SIZE(X)
+
+ faddd a3, t3, $f28
+ unop
+ fmuld x3, x3, t3
+ LD x3, 11 * SIZE(X)
+
+ faddd $f25, t0, a0
+ unop
+ fmuld x4, x4, t0
+ LD x4, 12 * SIZE(X)
+
+ faddd $f26, t1, a1
+ unop
+ fmuld x5, x5, t1
+ LD x5, 13 * SIZE(X)
+
+ faddd $f27, t2, a2
+ unop
+ fmuld x6, x6, t2
+ LD x6, 14 * SIZE(X)
+
+ faddd $f28, t3, a3
+ unop
+ fmuld x7, x7, t3
+ LD x7, 15 * SIZE(X)
+
+ faddd a0, t0, $f25
+ unop
+ fmuld x0, x0, t0
+ LD x0, 16 * SIZE(X)
+
+ faddd a1, t1, $f26
+ ldi X, 16 * SIZE(X)
+ fmuld x1, x1, t1
+ LD x1, 17 * SIZE(XX)
+
+ faddd a2, t2, $f27
+ unop
+ fmuld x2, x2, t2
+ LD x2, 18 * SIZE(XX)
+
+ faddd a3, t3, $f28
+ unop
+ fmuld x3, x3, t3
+ LD x3, 19 * SIZE(XX)
+
+ faddd $f25, t0, a0
+ unop
+ fmuld x4, x4, t0
+ LD x4, 20 * SIZE(XX)
+
+ faddd $f26, t1, a1
+ ldi I, -1(I)
+ fmuld x5, x5, t1
+ LD x5, 21 * SIZE(XX)
+
+ faddd $f27, t2, a2
+ unop
+ fmuld x6, x6, t2
+ LD x6, 22 * SIZE(XX)
+
+ faddd $f28, t3, a3
+ fmuld x7, x7, t3
+ LD x7, 23 * SIZE(XX)
+ bgt I, $L11
+ .align 4
+
+$L12:
+ faddd a0, t0, $f25
+ mov X, XX
+ fmuld x0, x0, t0
+ LD x0, 8 * SIZE(X)
+
+ faddd a1, t1, $f26
+ unop
+ fmuld x1, x1, t1
+ LD x1, 9 * SIZE(X)
+
+ faddd a2, t2, $f27
+ unop
+ fmuld x2, x2, t2
+ LD x2, 10 * SIZE(X)
+
+ faddd a3, t3, $f28
+ unop
+ fmuld x3, x3, t3
+ LD x3, 11 * SIZE(X)
+
+ faddd $f25, t0, a0
+ unop
+ fmuld x4, x4, t0
+ LD x4, 12 * SIZE(XX)
+
+ faddd $f26, t1, a1
+ unop
+ fmuld x5, x5, t1
+ LD x5, 13 * SIZE(XX)
+
+ faddd $f27, t2, a2
+ unop
+ fmuld x6, x6, t2
+ LD x6, 14 * SIZE(XX)
+
+ faddd $f28, t3, a3
+ ldi X, 16 * SIZE(X)
+ fmuld x7, x7, t3
+ LD x7, 15 * SIZE(XX)
+
+ faddd a0, t0, $f25
+ fmuld x0, x0, t0
+ faddd a1, t1, $f26
+ fmuld x1, x1, t1
+
+ faddd a2, t2, $f27
+ fmuld x2, x2, t2
+ faddd a3, t3, $f28
+ fmuld x3, x3, t3
+
+ faddd $f25, t0, a0
+ fmuld x4, x4, t0
+ faddd $f26, t1, a1
+ fmuld x5, x5, t1
+
+ faddd $f27, t2, a2
+ fmuld x6, x6, t2
+	faddd $f28, t3, a3
+ fmuld x7, x7, t3
+
+ faddd a2, t2, $f27
+ fmov $f27, a2
+ faddd a3, t3, $f28
+ fmov $f28, a3
+ .align 4
+
+$L15:
+ and N, 7, I
+ ble I, $L998
+ .align 4
+
+$L16:
+ LD x0, 0 * SIZE(X)
+ LD x1, 1 * SIZE(X)
+
+ ldi X, 2 * SIZE(X)
+
+ faddd a0, t0, $f25
+ fmov $f25, a0
+ fmuld x0, x0, t0
+ faddd a1, t1, $f26
+ fmov $f26, a1
+ fmuld x1, x1, t1
+
+ ldi I, -1(I)
+ bgt I, $L16
+ bsr $31, $L998
+ .align 4
+
+$L20:
+ fclr t0
+ sra N, 2, I
+ fclr t1
+ ble I, $L25
+
+ LD x0, 0 * SIZE(X)
+ fclr t2
+ LD x1, 1 * SIZE(X)
+ addl X, INCX, X
+ LD x2, 0 * SIZE(X)
+ fclr t3
+ LD x3, 1 * SIZE(X)
+ addl X, INCX, X
+
+ LD x4, 0 * SIZE(X)
+ ldi I, -1(I)
+ LD x5, 1 * SIZE(X)
+ addl X, INCX, X
+
+ LD x6, 0 * SIZE(X)
+ ble I, $L22
+ .align 4
+
+$L21:
+ faddd a0, t0, $f25
+ LD x7, 1 * SIZE(X)
+ fmuld x0, x0, t0
+ addl X, INCX, X
+
+ faddd a1, t1, $f26
+ LD x0, 0 * SIZE(X)
+ fmuld x1, x1, t1
+ unop
+
+ faddd a2, t2, $f27
+ LD x1, 1 * SIZE(X)
+ fmuld x2, x2, t2
+ addl X, INCX, X
+
+ faddd a3, t3, $f28
+ LD x2, 0 * SIZE(X)
+ fmuld x3, x3, t3
+ unop
+
+ faddd $f25, t0, a0
+ LD x3, 1 * SIZE(X)
+ fmuld x4, x4, t0
+ addl X, INCX, X
+
+ faddd $f26, t1, a1
+ LD x4, 0 * SIZE(X)
+ fmuld x5, x5, t1
+ ldi I, -1(I)
+
+ faddd $f27, t2, a2
+ LD x5, 1 * SIZE(X)
+ fmuld x6, x6, t2
+ addl X, INCX, X
+
+ faddd $f28, t3, a3
+ LD x6, 0 * SIZE(X)
+ fmuld x7, x7, t3
+ bgt I, $L21
+ .align 4
+
+$L22:
+ faddd a0, t0, $f25
+ LD x7, 1 * SIZE(X)
+ fmuld x0, x0, t0
+ addl X, INCX, X
+
+ faddd a1, t1, $f26
+ fmuld x1, x1, t1
+ faddd a2, t2, $f27
+ fmuld x2, x2, t2
+
+ faddd a3, t3, $f28
+ fmuld x3, x3, t3
+ faddd $f25, t0, a0
+ fmuld x4, x4, t0
+
+ faddd $f26, t1, a1
+ fmuld x5, x5, t1
+ faddd $f27, t2, a2
+ fmuld x6, x6, t2
+
+ faddd $f28, t3, a3
+ fmuld x7, x7, t3
+ faddd a2, t2, $f27
+ fmov $f27, a2
+ faddd a3, t3, $f28
+ fmov $f28, a3
+ .align 4
+
+$L25:
+ and N, 3, I
+ ble I, $L998
+ .align 4
+
+$L26:
+ LD x0, 0 * SIZE(X)
+ ldi I, -1(I)
+ LD x1, 1 * SIZE(X)
+ addl X, INCX, X
+
+ faddd a0, t0, $f25
+ fmov $f25, a0
+ fmuld x0, x0, t0
+ faddd a1, t1, $f26
+ fmov $f26, a1
+ fmuld x1, x1, t1
+
+ bgt I, $L26
+ .align 4
+
+
+$L998:
+ faddd a0, t0, $f25
+ fmov $f25, a0
+ faddd a1, t1, $f26
+ fmov $f26, a1
+
+ faddd a0, a1, $f25
+ fmov $f25, a0
+ faddd a2, a3, $f26
+ fmov $f26, a2
+
+#if defined(EV4) || defined(EV5)
+ faddd a0, a2, $f16
+ jsr $26, ($27), sqrt !lituse_jsr!2
+
+ ldih $29, 0($26) !gpdisp!3
+ ldi $29, 0($29) !gpdisp!3
+#else
+ faddd a0, a2, $f25
+ fmov $f25, a0
+ fsqrtd a0, $f25
+ fmov $f25, a0
+#endif
+ .align 4
+
+$L999:
+#if defined(EV4) || defined(EV5)
+ ldl $26, 0($sp)
+ ldi $sp, 16($sp)
+#endif
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/cnrm2.S.bak b/kernel/sw_64/cnrm2.S.bak
new file mode 100644
index 0000000..b2e80e0
--- /dev/null
+++ b/kernel/sw_64/cnrm2.S.bak
@@ -0,0 +1,426 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+#include "version.h"
+
+#define PREFETCH_SIZE 80
+
+#define N $16
+#define X $17
+#define INCX $18
+#define XX $19
+
+#define I $0
+
+#define a0 $f0
+#define a1 $f1
+#define a2 $f10
+#define a3 $f11
+#define t0 $f12
+#define t1 $f13
+#define t2 $f14
+#define t3 $f15
+
+#define x0 $f16
+#define x1 $f17
+#define x2 $f18
+#define x3 $f19
+#define x4 $f20
+#define x5 $f21
+#define x6 $f22
+#define x7 $f23
+
+ PROLOGUE
+
+#if defined(EV4) || defined(EV5)
+ .frame $30,16,$26,0
+ .mask 0x4000000,-16
+ ldih $29, 0($27) !gpdisp!1
+ ldi $29, 0($29) !gpdisp!1
+
+ ldi $sp, -16($sp)
+ ldl $27, sqrt($29) !literal!2
+ stq $26, 0($sp)
+
+ PROFCODE
+ .prologue 1
+#else
+ PROFCODE
+#endif
+
+ fclr a0
+ sll INCX, ZBASE_SHIFT, INCX
+ fclr a1
+ ble N, $L999
+
+ fclr a2
+ cmpeq INCX, 2 * SIZE, $0
+ fclr a3
+ beq $0, $L20
+
+ fclr t0
+ sra N, 3, I
+ fclr t1
+ ble I, $L15
+
+ fclr t2
+ LD x0, 0 * SIZE(X)
+ fclr t3
+ LD x1, 1 * SIZE(X)
+
+ LD x2, 2 * SIZE(X)
+ LD x3, 3 * SIZE(X)
+ LD x4, 4 * SIZE(X)
+ LD x5, 5 * SIZE(X)
+ LD x6, 6 * SIZE(X)
+ LD x7, 7 * SIZE(X)
+
+ ldi I, -1(I)
+ ble I, $L12
+ .align 4
+
+$L11:
+ faddd a0, t0, a0
+ fillcs (PREFETCH_SIZE) * SIZE(X)
+ fmuld x0, x0, t0
+ LD x0, 8 * SIZE(X)
+
+ faddd a1, t1, a1
+ mov X, XX
+ fmuld x1, x1, t1
+ LD x1, 9 * SIZE(X)
+
+ faddd a2, t2, a2
+ unop
+ fmuld x2, x2, t2
+ LD x2, 10 * SIZE(X)
+
+ faddd a3, t3, a3
+ unop
+ fmuld x3, x3, t3
+ LD x3, 11 * SIZE(X)
+
+ faddd a0, t0, a0
+ unop
+ fmuld x4, x4, t0
+ LD x4, 12 * SIZE(X)
+
+ faddd a1, t1, a1
+ unop
+ fmuld x5, x5, t1
+ LD x5, 13 * SIZE(X)
+
+ faddd a2, t2, a2
+ unop
+ fmuld x6, x6, t2
+ LD x6, 14 * SIZE(X)
+
+ faddd a3, t3, a3
+ unop
+ fmuld x7, x7, t3
+ LD x7, 15 * SIZE(X)
+
+ faddd a0, t0, a0
+ unop
+ fmuld x0, x0, t0
+ LD x0, 16 * SIZE(X)
+
+ faddd a1, t1, a1
+ ldi X, 16 * SIZE(X)
+ fmuld x1, x1, t1
+ LD x1, 17 * SIZE(XX)
+
+ faddd a2, t2, a2
+ unop
+ fmuld x2, x2, t2
+ LD x2, 18 * SIZE(XX)
+
+ faddd a3, t3, a3
+ unop
+ fmuld x3, x3, t3
+ LD x3, 19 * SIZE(XX)
+
+ faddd a0, t0, a0
+ unop
+ fmuld x4, x4, t0
+ LD x4, 20 * SIZE(XX)
+
+ faddd a1, t1, a1
+ ldi I, -1(I)
+ fmuld x5, x5, t1
+ LD x5, 21 * SIZE(XX)
+
+ faddd a2, t2, a2
+ unop
+ fmuld x6, x6, t2
+ LD x6, 22 * SIZE(XX)
+
+ faddd a3, t3, a3
+ fmuld x7, x7, t3
+ LD x7, 23 * SIZE(XX)
+ bgt I, $L11
+ .align 4
+
+$L12:
+ faddd a0, t0, a0
+ mov X, XX
+ fmuld x0, x0, t0
+ LD x0, 8 * SIZE(X)
+
+ faddd a1, t1, a1
+ unop
+ fmuld x1, x1, t1
+ LD x1, 9 * SIZE(X)
+
+ faddd a2, t2, a2
+ unop
+ fmuld x2, x2, t2
+ LD x2, 10 * SIZE(X)
+
+ faddd a3, t3, a3
+ unop
+ fmuld x3, x3, t3
+ LD x3, 11 * SIZE(X)
+
+ faddd a0, t0, a0
+ unop
+ fmuld x4, x4, t0
+ LD x4, 12 * SIZE(XX)
+
+ faddd a1, t1, a1
+ unop
+ fmuld x5, x5, t1
+ LD x5, 13 * SIZE(XX)
+
+ faddd a2, t2, a2
+ unop
+ fmuld x6, x6, t2
+ LD x6, 14 * SIZE(XX)
+
+ faddd a3, t3, a3
+ ldi X, 16 * SIZE(X)
+ fmuld x7, x7, t3
+ LD x7, 15 * SIZE(XX)
+
+ faddd a0, t0, a0
+ fmuld x0, x0, t0
+ faddd a1, t1, a1
+ fmuld x1, x1, t1
+
+ faddd a2, t2, a2
+ fmuld x2, x2, t2
+ faddd a3, t3, a3
+ fmuld x3, x3, t3
+
+ faddd a0, t0, a0
+ fmuld x4, x4, t0
+ faddd a1, t1, a1
+ fmuld x5, x5, t1
+
+ faddd a2, t2, a2
+ fmuld x6, x6, t2
+ faddd a3, t3, a3
+ fmuld x7, x7, t3
+
+ faddd a2, t2, a2
+ faddd a3, t3, a3
+ .align 4
+
+$L15:
+ and N, 7, I
+ ble I, $L998
+ .align 4
+
+$L16:
+ LD x0, 0 * SIZE(X)
+ LD x1, 1 * SIZE(X)
+
+ ldi X, 2 * SIZE(X)
+
+ faddd a0, t0, a0
+ fmuld x0, x0, t0
+ faddd a1, t1, a1
+ fmuld x1, x1, t1
+
+ ldi I, -1(I)
+ bgt I, $L16
+ bsr $31, $L998
+ .align 4
+
+$L20:
+ fclr t0
+ sra N, 2, I
+ fclr t1
+ ble I, $L25
+
+ LD x0, 0 * SIZE(X)
+ fclr t2
+ LD x1, 1 * SIZE(X)
+ addl X, INCX, X
+ LD x2, 0 * SIZE(X)
+ fclr t3
+ LD x3, 1 * SIZE(X)
+ addl X, INCX, X
+
+ LD x4, 0 * SIZE(X)
+ ldi I, -1(I)
+ LD x5, 1 * SIZE(X)
+ addl X, INCX, X
+
+ LD x6, 0 * SIZE(X)
+ ble I, $L22
+ .align 4
+
+$L21:
+ faddd a0, t0, a0
+ LD x7, 1 * SIZE(X)
+ fmuld x0, x0, t0
+ addl X, INCX, X
+
+ faddd a1, t1, a1
+ LD x0, 0 * SIZE(X)
+ fmuld x1, x1, t1
+ unop
+
+ faddd a2, t2, a2
+ LD x1, 1 * SIZE(X)
+ fmuld x2, x2, t2
+ addl X, INCX, X
+
+ faddd a3, t3, a3
+ LD x2, 0 * SIZE(X)
+ fmuld x3, x3, t3
+ unop
+
+ faddd a0, t0, a0
+ LD x3, 1 * SIZE(X)
+ fmuld x4, x4, t0
+ addl X, INCX, X
+
+ faddd a1, t1, a1
+ LD x4, 0 * SIZE(X)
+ fmuld x5, x5, t1
+ ldi I, -1(I)
+
+ faddd a2, t2, a2
+ LD x5, 1 * SIZE(X)
+ fmuld x6, x6, t2
+ addl X, INCX, X
+
+ faddd a3, t3, a3
+ LD x6, 0 * SIZE(X)
+ fmuld x7, x7, t3
+ bgt I, $L21
+ .align 4
+
+$L22:
+ faddd a0, t0, a0
+ LD x7, 1 * SIZE(X)
+ fmuld x0, x0, t0
+ addl X, INCX, X
+
+ faddd a1, t1, a1
+ fmuld x1, x1, t1
+ faddd a2, t2, a2
+ fmuld x2, x2, t2
+
+ faddd a3, t3, a3
+ fmuld x3, x3, t3
+ faddd a0, t0, a0
+ fmuld x4, x4, t0
+
+ faddd a1, t1, a1
+ fmuld x5, x5, t1
+ faddd a2, t2, a2
+ fmuld x6, x6, t2
+
+ faddd a3, t3, a3
+ fmuld x7, x7, t3
+ faddd a2, t2, a2
+ faddd a3, t3, a3
+ .align 4
+
+$L25:
+ and N, 3, I
+ ble I, $L998
+ .align 4
+
+$L26:
+ LD x0, 0 * SIZE(X)
+ ldi I, -1(I)
+ LD x1, 1 * SIZE(X)
+ addl X, INCX, X
+
+ faddd a0, t0, a0
+ fmuld x0, x0, t0
+ faddd a1, t1, a1
+ fmuld x1, x1, t1
+
+ bgt I, $L26
+ .align 4
+
+
+$L998:
+ faddd a0, t0, a0
+ faddd a1, t1, a1
+
+ faddd a0, a1, a0
+ faddd a2, a3, a2
+
+#if defined(EV4) || defined(EV5)
+ faddd a0, a2, $f16
+ jsr $26, ($27), sqrt !lituse_jsr!2
+
+ ldih $29, 0($26) !gpdisp!3
+ ldi $29, 0($29) !gpdisp!3
+#else
+ faddd a0, a2, a0
+ fsqrtd a0, a0
+#endif
+ .align 4
+
+$L999:
+#if defined(EV4) || defined(EV5)
+ ldl $26, 0($sp)
+ ldi $sp, 16($sp)
+#endif
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/copy.S b/kernel/sw_64/copy.S
new file mode 100644
index 0000000..c960ac1
--- /dev/null
+++ b/kernel/sw_64/copy.S
@@ -0,0 +1,379 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define N $16
+#define X $17
+#define INCX $18
+#define Y $19
+#define INCY $20
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, 0, $26, 0
+
+#ifndef PROFILE
+ .prologue 0
+#else
+ .prologue 1
+#endif
+
+ cmpeq INCX, 1, $0
+ ble N, $End
+#ifndef COMPLEX
+ sra N, 4, $4
+#else
+ sra N, 3, $4
+#endif
+ cmpeq INCY, 1, $1
+
+ and $0, $1, $0
+ beq $0, $Sub
+#ifndef COMPLEX
+ and N, 15, $5
+#else
+ and N, 7, $5
+#endif
+ ble $4, $Remain
+
+ LD $f10, 0*SIZE(X)
+ LD $f11, 1*SIZE(X)
+ LD $f12, 2*SIZE(X)
+ LD $f13, 3*SIZE(X)
+ LD $f14, 4*SIZE(X)
+ LD $f15, 5*SIZE(X)
+ LD $f16, 6*SIZE(X)
+ LD $f17, 7*SIZE(X)
+
+ LD $f18, 8*SIZE(X)
+ LD $f19, 9*SIZE(X)
+ LD $f20, 10*SIZE(X)
+ LD $f21, 11*SIZE(X)
+ LD $f22, 12*SIZE(X)
+ LD $f23, 13*SIZE(X)
+ LD $f24, 14*SIZE(X)
+ LD $f25, 15*SIZE(X)
+
+ subl $4, 1, $4
+ ldi X, 16*SIZE(X)
+ ble $4, $MainLoopEnd
+ .align 4
+
+$MainLoop:
+ ST $f10, 0*SIZE(Y)
+ ST $f11, 1*SIZE(Y)
+ ST $f12, 2*SIZE(Y)
+ ST $f13, 3*SIZE(Y)
+
+ LD $f10, 0*SIZE(X)
+ LD $f11, 1*SIZE(X)
+ LD $f12, 2*SIZE(X)
+ LD $f13, 3*SIZE(X)
+
+ ST $f14, 4*SIZE(Y)
+ ST $f15, 5*SIZE(Y)
+ ST $f16, 6*SIZE(Y)
+ ST $f17, 7*SIZE(Y)
+
+ LD $f14, 4*SIZE(X)
+ LD $f15, 5*SIZE(X)
+ LD $f16, 6*SIZE(X)
+ LD $f17, 7*SIZE(X)
+
+ ST $f18, 8*SIZE(Y)
+ ST $f19, 9*SIZE(Y)
+ ST $f20, 10*SIZE(Y)
+ ST $f21, 11*SIZE(Y)
+
+ LD $f18, 8*SIZE(X)
+ LD $f19, 9*SIZE(X)
+ LD $f20, 10*SIZE(X)
+ LD $f21, 11*SIZE(X)
+
+ ST $f22, 12*SIZE(Y)
+ ST $f23, 13*SIZE(Y)
+ ST $f24, 14*SIZE(Y)
+ ST $f25, 15*SIZE(Y)
+
+ LD $f22, 12*SIZE(X)
+ LD $f23, 13*SIZE(X)
+ LD $f24, 14*SIZE(X)
+ LD $f25, 15*SIZE(X)
+
+ subl $4, 1, $4
+ ldi Y, 16*SIZE(Y)
+ ldi X, 16*SIZE(X)
+ bgt $4, $MainLoop
+ .align 4
+
+$MainLoopEnd:
+ ST $f10, 0*SIZE(Y)
+ ST $f11, 1*SIZE(Y)
+ ST $f12, 2*SIZE(Y)
+ ST $f13, 3*SIZE(Y)
+ ST $f14, 4*SIZE(Y)
+ ST $f15, 5*SIZE(Y)
+ ST $f16, 6*SIZE(Y)
+ ST $f17, 7*SIZE(Y)
+
+ ST $f18, 8*SIZE(Y)
+ ST $f19, 9*SIZE(Y)
+ ST $f20, 10*SIZE(Y)
+ ST $f21, 11*SIZE(Y)
+ ST $f22, 12*SIZE(Y)
+ ST $f23, 13*SIZE(Y)
+ ST $f24, 14*SIZE(Y)
+ ST $f25, 15*SIZE(Y)
+
+ ldi Y, 16*SIZE(Y)
+ .align 4
+
+$Remain:
+ ble $5, $End
+ .align 4
+
+$RemainLoop:
+#ifndef COMPLEX
+ LD $f10, 0*SIZE(X)
+ ldi X, 1*SIZE(X)
+ ST $f10, 0*SIZE(Y)
+ ldi Y, 1*SIZE(Y)
+#else
+ LD $f10, 0*SIZE(X)
+ LD $f11, 1*SIZE(X)
+ ldi X, 2*SIZE(X)
+ ST $f10, 0*SIZE(Y)
+ ST $f11, 1*SIZE(Y)
+ ldi Y, 2*SIZE(Y)
+#endif
+ subl $5, 1, $5
+ bgt $5, $RemainLoop
+ .align 4
+$End:
+ ret
+ .align 4
+
+$Sub:
+#ifdef COMPLEX
+ addl INCX, INCX, INCX
+ addl INCY, INCY, INCY
+ and N, 7, $5
+#else
+ and N, 15, $5
+#endif
+ ble $4, $SubRemain
+ .align 4
+
+$SubMainLoop:
+#ifndef COMPLEX
+ LD $f10, 0(X)
+ SXADDQ INCX, X, X
+ LD $f11, 0(X)
+ SXADDQ INCX, X, X
+
+ LD $f12, 0(X)
+ SXADDQ INCX, X, X
+ LD $f13, 0(X)
+ SXADDQ INCX, X, X
+
+ LD $f14, 0(X)
+ SXADDQ INCX, X, X
+ LD $f15, 0(X)
+ SXADDQ INCX, X, X
+
+ LD $f16, 0(X)
+ SXADDQ INCX, X, X
+ LD $f17, 0(X)
+ SXADDQ INCX, X, X
+
+ LD $f18, 0(X)
+ SXADDQ INCX, X, X
+ LD $f19, 0(X)
+ SXADDQ INCX, X, X
+
+ LD $f20, 0(X)
+ SXADDQ INCX, X, X
+ LD $f21, 0(X)
+ SXADDQ INCX, X, X
+
+ LD $f22, 0(X)
+ SXADDQ INCX, X, X
+ LD $f23, 0(X)
+ SXADDQ INCX, X, X
+
+ LD $f24, 0(X)
+ SXADDQ INCX, X, X
+ LD $f25, 0(X)
+ SXADDQ INCX, X, X
+
+ ST $f10, 0(Y)
+ SXADDQ INCY, Y, Y
+ ST $f11, 0(Y)
+ SXADDQ INCY, Y, Y
+
+ ST $f12, 0(Y)
+ SXADDQ INCY, Y, Y
+ ST $f13, 0(Y)
+ SXADDQ INCY, Y, Y
+
+ ST $f14, 0(Y)
+ SXADDQ INCY, Y, Y
+ ST $f15, 0(Y)
+ SXADDQ INCY, Y, Y
+
+ ST $f16, 0(Y)
+ SXADDQ INCY, Y, Y
+ ST $f17, 0(Y)
+ SXADDQ INCY, Y, Y
+
+ ST $f18, 0(Y)
+ SXADDQ INCY, Y, Y
+ ST $f19, 0(Y)
+ SXADDQ INCY, Y, Y
+
+ ST $f20, 0(Y)
+ SXADDQ INCY, Y, Y
+ ST $f21, 0(Y)
+ SXADDQ INCY, Y, Y
+
+ ST $f22, 0(Y)
+ SXADDQ INCY, Y, Y
+ ST $f23, 0(Y)
+ SXADDQ INCY, Y, Y
+
+ ST $f24, 0(Y)
+ SXADDQ INCY, Y, Y
+ ST $f25, 0(Y)
+ SXADDQ INCY, Y, Y
+#else
+ LD $f10, 0(X)
+ LD $f11, SIZE(X)
+ SXADDQ INCX, X, X
+
+ LD $f12, 0(X)
+ LD $f13, SIZE(X)
+ SXADDQ INCX, X, X
+
+ LD $f14, 0(X)
+ LD $f15, SIZE(X)
+ SXADDQ INCX, X, X
+
+ LD $f16, 0(X)
+ LD $f17, SIZE(X)
+ SXADDQ INCX, X, X
+
+ LD $f18, 0(X)
+ LD $f19, SIZE(X)
+ SXADDQ INCX, X, X
+
+ LD $f20, 0(X)
+ LD $f21, SIZE(X)
+ SXADDQ INCX, X, X
+
+ LD $f22, 0(X)
+ LD $f23, SIZE(X)
+ SXADDQ INCX, X, X
+
+ LD $f24, 0(X)
+ LD $f25, SIZE(X)
+ SXADDQ INCX, X, X
+
+ ST $f10, 0(Y)
+ ST $f11, SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ ST $f12, 0(Y)
+ ST $f13, SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ ST $f14, 0(Y)
+ ST $f15, SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ ST $f16, 0(Y)
+ ST $f17, SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ ST $f18, 0(Y)
+ ST $f19, SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ ST $f20, 0(Y)
+ ST $f21, SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ ST $f22, 0(Y)
+ ST $f23, SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ ST $f24, 0(Y)
+ ST $f25, SIZE(Y)
+ SXADDQ INCY, Y, Y
+#endif
+ subl $4, 1, $4
+ bgt $4, $SubMainLoop
+ .align 4
+
+$SubRemain:
+ ble $5, $SubEnd
+ .align 4
+
+$SubRemainLoop:
+#ifndef COMPLEX
+ LD $f10, 0(X)
+ SXADDQ INCX, X, X
+ ST $f10, 0(Y)
+ SXADDQ INCY, Y, Y
+#else
+ LD $f10, 0(X)
+ LD $f11, SIZE(X)
+ SXADDQ INCX, X, X
+ ST $f10, 0(Y)
+ ST $f11, SIZE(Y)
+ SXADDQ INCY, Y, Y
+#endif
+ subl $5, 1, $5
+ bgt $5, $SubRemainLoop
+ .align 4
+
+$SubEnd:
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/copy_simd.S b/kernel/sw_64/copy_simd.S
new file mode 100644
index 0000000..84e96a9
--- /dev/null
+++ b/kernel/sw_64/copy_simd.S
@@ -0,0 +1,563 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define PREFETCHSIZE 80
+
+#define N $16
+#define X $17
+#define INCX $18
+#define Y $19
+#define INCY $20
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, 0, $26, 0
+
+#ifndef PROFILE
+ .prologue 0
+#else
+ .prologue 1
+#endif
+
+ cmpeq INCX, 1, $0
+ ble N, $End
+#ifndef COMPLEX
+ sra N, 4, $4
+#else
+ sra N, 3, $4
+#endif
+ cmpeq INCY, 1, $1
+
+ and $0, $1, $0
+ beq $0, $Sub
+#ifndef COMPLEX
+ and N, 15, $5
+#else
+ and N, 7, $5
+#endif
+ ble $4, $Remain
+
+/**
+  check whether X and Y are aligned to the vector width
+**/
+
+ and Y, (VEC_LEN*SIZE-1), $6
+ and X, (VEC_LEN*SIZE-1), $7
+ bgt $6, $UnAlign_Y_ACCESS
+ bgt $7, $UnAlign_X_ACCESS
+
+ .align 4
+
+$Align:
+ VLD $f10, 0*VEC_LEN*SIZE(X)
+ VLD $f11, 1*VEC_LEN*SIZE(X)
+ VLD $f12, 2*VEC_LEN*SIZE(X)
+ VLD $f13, 3*VEC_LEN*SIZE(X)
+
+ subl $4, 1, $4
+ ldi X, 16*SIZE(X)
+ ble $4, $MainLoopEnd
+ .align 4
+
+$MainLoop:
+ fillcs PREFETCHSIZE * SIZE(X)
+ fillcs PREFETCHSIZE * SIZE(Y)
+
+ VST $f10, 0*VEC_LEN*SIZE(Y)
+ VST $f11, 1*VEC_LEN*SIZE(Y)
+ VST $f12, 2*VEC_LEN*SIZE(Y)
+ VST $f13, 3*VEC_LEN*SIZE(Y)
+
+ VLD $f10, 0*VEC_LEN*SIZE(X)
+ VLD $f11, 1*VEC_LEN*SIZE(X)
+ VLD $f12, 2*VEC_LEN*SIZE(X)
+ VLD $f13, 3*VEC_LEN*SIZE(X)
+
+ subl $4, 1, $4
+ ldi Y, 16*SIZE(Y)
+ ldi X, 16*SIZE(X)
+ bgt $4, $MainLoop
+ .align 4
+
+$MainLoopEnd:
+
+ VST $f10, 0*VEC_LEN*SIZE(Y)
+ VST $f11, 1*VEC_LEN*SIZE(Y)
+ VST $f12, 2*VEC_LEN*SIZE(Y)
+ VST $f13, 3*VEC_LEN*SIZE(Y)
+
+ ldi Y, 16*SIZE(Y)
+ .align 4
+
+$Remain:
+ ble $5, $End
+ .align 4
+
+$RemainLoop:
+#ifndef COMPLEX
+ LD $f10, 0*SIZE(X)
+ ldi X, 1*SIZE(X)
+ ST $f10, 0*SIZE(Y)
+ ldi Y, 1*SIZE(Y)
+#else
+ LD $f10, 0*SIZE(X)
+ LD $f11, 1*SIZE(X)
+ ldi X, 2*SIZE(X)
+ ST $f10, 0*SIZE(Y)
+ ST $f11, 1*SIZE(Y)
+ ldi Y, 2*SIZE(Y)
+#endif
+ subl $5, 1, $5
+ bgt $5, $RemainLoop
+ .align 4
+$End:
+ ret
+ .align 4
+
+$UnAlign_X_ACCESS:
+ and Y, (VEC_LEN*SIZE-1), $7
+ nop
+ nop
+ bgt $7, $UnAlign_XY_ACCESS
+ .align 4
+
+ VLD_UL $f10, 0*VEC_LEN*SIZE(X)
+ VLD_UH $f14, 1*VEC_LEN*SIZE(X)
+
+ VLD_UL $f11, 1*VEC_LEN*SIZE(X)
+ VLD_UH $f15, 2*VEC_LEN*SIZE(X)
+
+ VLD_UL $f12, 2*VEC_LEN*SIZE(X)
+ VLD_UH $f16, 3*VEC_LEN*SIZE(X)
+
+
+ VLD_UL $f13, 3*VEC_LEN*SIZE(X)
+ VLD_UH $f17, 4*VEC_LEN*SIZE(X)
+
+ subl $4, 1, $4
+ vbisw $f10, $f14, $f10
+ ldi X, 16*SIZE(X)
+ vbisw $f11, $f15, $f11
+
+ vbisw $f12, $f16, $f12
+ vbisw $f13, $f17, $f13
+ nop
+ ble $4, $UnAlign_X_MainLoopEnd
+ .align 4
+
+$UnAlign_X_MainLoop:
+ fillcs PREFETCHSIZE * SIZE(X)
+ fillcs PREFETCHSIZE * SIZE(Y)
+
+ VST $f10, 0*VEC_LEN*SIZE(Y)
+ VST $f11, 1*VEC_LEN*SIZE(Y)
+ VST $f12, 2*VEC_LEN*SIZE(Y)
+ VST $f13, 3*VEC_LEN*SIZE(Y)
+
+ VLD_UL $f10, 0*VEC_LEN*SIZE(X)
+ VLD_UH $f14, 1*VEC_LEN*SIZE(X)
+ VLD_UL $f11, 1*VEC_LEN*SIZE(X)
+ VLD_UH $f15, 2*VEC_LEN*SIZE(X)
+
+ VLD_UL $f12, 2*VEC_LEN*SIZE(X)
+ VLD_UH $f16, 3*VEC_LEN*SIZE(X)
+ VLD_UL $f13, 3*VEC_LEN*SIZE(X)
+ VLD_UH $f17, 4*VEC_LEN*SIZE(X)
+
+ subl $4, 1, $4
+ vbisw $f10, $f14, $f10
+ ldi Y, 16*SIZE(Y)
+ vbisw $f11, $f15, $f11
+
+ vbisw $f12, $f16, $f12
+ ldi X, 16*SIZE(X)
+ vbisw $f13, $f17, $f13
+ bgt $4, $UnAlign_X_MainLoop
+ .align 4
+
+$UnAlign_X_MainLoopEnd:
+
+ VST $f10, 0*VEC_LEN*SIZE(Y)
+ VST $f11, 1*VEC_LEN*SIZE(Y)
+ VST $f12, 2*VEC_LEN*SIZE(Y)
+ VST $f13, 3*VEC_LEN*SIZE(Y)
+
+ ldi Y, 16*SIZE(Y)
+ ble $5, $End
+ jmp $RemainLoop
+
+ .align 4
+
+$UnAlign_Y_ACCESS:
+ and X, (VEC_LEN*SIZE-1), $7
+ nop
+ nop
+ bgt $7, $UnAlign_XY_ACCESS
+ .align 4
+
+ VLD $f10, 0*VEC_LEN*SIZE(X)
+ VLD $f11, 1*VEC_LEN*SIZE(X)
+ VLD $f12, 2*VEC_LEN*SIZE(X)
+ VLD $f13, 3*VEC_LEN*SIZE(X)
+
+ subl $4, 1, $4
+ ldi X, 16*SIZE(X)
+ ble $4, $UnAlign_Y_MainLoopEnd
+ .align 4
+
+$UnAlign_Y_MainLoop:
+ fillcs PREFETCHSIZE * SIZE(X)
+ fillcs PREFETCHSIZE * SIZE(Y)
+
+ VST_UL $f10, 0*VEC_LEN*SIZE(Y)
+ VST_UH $f10, 1*VEC_LEN*SIZE(Y)
+
+ VST_UL $f11, 1*VEC_LEN*SIZE(Y)
+ VST_UH $f11, 2*VEC_LEN*SIZE(Y)
+
+ VST_UL $f12, 2*VEC_LEN*SIZE(Y)
+ VST_UH $f12, 3*VEC_LEN*SIZE(Y)
+
+ VST_UL $f13, 3*VEC_LEN*SIZE(Y)
+ VST_UH $f13, 4*VEC_LEN*SIZE(Y)
+
+ VLD $f10, 0*VEC_LEN*SIZE(X)
+ VLD $f11, 1*VEC_LEN*SIZE(X)
+ VLD $f12, 2*VEC_LEN*SIZE(X)
+ VLD $f13, 3*VEC_LEN*SIZE(X)
+
+ subl $4, 1, $4
+ ldi Y, 16*SIZE(Y)
+ ldi X, 16*SIZE(X)
+ bgt $4, $UnAlign_Y_MainLoop
+ .align 4
+
+$UnAlign_Y_MainLoopEnd:
+
+ VST_UL $f10, 0*VEC_LEN*SIZE(Y)
+ VST_UH $f10, 1*VEC_LEN*SIZE(Y)
+
+ VST_UL $f11, 1*VEC_LEN*SIZE(Y)
+ VST_UH $f11, 2*VEC_LEN*SIZE(Y)
+
+ VST_UL $f12, 2*VEC_LEN*SIZE(Y)
+ VST_UH $f12, 3*VEC_LEN*SIZE(Y)
+
+ VST_UL $f13, 3*VEC_LEN*SIZE(Y)
+ VST_UH $f13, 4*VEC_LEN*SIZE(Y)
+
+ ldi Y, 16*SIZE(Y)
+ ble $5, $End
+ jmp $RemainLoop
+
+ .align 4
+
+$UnAlign_XY_ACCESS:
+
+ VLD_UL $f10, 0*VEC_LEN*SIZE(X)
+ VLD_UH $f14, 1*VEC_LEN*SIZE(X)
+
+ VLD_UL $f11, 1*VEC_LEN*SIZE(X)
+ VLD_UH $f15, 2*VEC_LEN*SIZE(X)
+
+ VLD_UL $f12, 2*VEC_LEN*SIZE(X)
+ VLD_UH $f16, 3*VEC_LEN*SIZE(X)
+
+
+ VLD_UL $f13, 3*VEC_LEN*SIZE(X)
+ VLD_UH $f17, 4*VEC_LEN*SIZE(X)
+
+ subl $4, 1, $4
+ vbisw $f10, $f14, $f10
+ ldi X, 16*SIZE(X)
+ vbisw $f11, $f15, $f11
+
+ vbisw $f12, $f16, $f12
+ vbisw $f13, $f17, $f13
+ nop
+ ble $4, $UnAlign_XY_MainLoopEnd
+ .align 4
+
+$UnAlign_XY_MainLoop:
+ fillcs PREFETCHSIZE * SIZE(X)
+ fillcs PREFETCHSIZE * SIZE(Y)
+
+ VST_UL $f10, 0*VEC_LEN*SIZE(Y)
+ VST_UH $f10, 1*VEC_LEN*SIZE(Y)
+
+ VST_UL $f11, 1*VEC_LEN*SIZE(Y)
+ VST_UH $f11, 2*VEC_LEN*SIZE(Y)
+
+ VST_UL $f12, 2*VEC_LEN*SIZE(Y)
+ VST_UH $f12, 3*VEC_LEN*SIZE(Y)
+
+ VST_UL $f13, 3*VEC_LEN*SIZE(Y)
+ VST_UH $f13, 4*VEC_LEN*SIZE(Y)
+
+
+ VLD_UL $f10, 0*VEC_LEN*SIZE(X)
+ VLD_UH $f14, 1*VEC_LEN*SIZE(X)
+ VLD_UL $f11, 1*VEC_LEN*SIZE(X)
+ VLD_UH $f15, 2*VEC_LEN*SIZE(X)
+
+ VLD_UL $f12, 2*VEC_LEN*SIZE(X)
+ VLD_UH $f16, 3*VEC_LEN*SIZE(X)
+ VLD_UL $f13, 3*VEC_LEN*SIZE(X)
+ VLD_UH $f17, 4*VEC_LEN*SIZE(X)
+
+ subl $4, 1, $4
+ vbisw $f10, $f14, $f10
+ ldi Y, 16*SIZE(Y)
+ vbisw $f11, $f15, $f11
+
+ vbisw $f12, $f16, $f12
+ ldi X, 16*SIZE(X)
+ vbisw $f13, $f17, $f13
+ bgt $4, $UnAlign_XY_MainLoop
+ .align 4
+
+$UnAlign_XY_MainLoopEnd:
+
+ VST_UL $f10, 0*VEC_LEN*SIZE(Y)
+ VST_UH $f10, 1*VEC_LEN*SIZE(Y)
+
+ VST_UL $f11, 1*VEC_LEN*SIZE(Y)
+ VST_UH $f11, 2*VEC_LEN*SIZE(Y)
+
+ VST_UL $f12, 2*VEC_LEN*SIZE(Y)
+ VST_UH $f12, 3*VEC_LEN*SIZE(Y)
+
+ VST_UL $f13, 3*VEC_LEN*SIZE(Y)
+ VST_UH $f13, 4*VEC_LEN*SIZE(Y)
+
+ ldi Y, 16*SIZE(Y)
+ ble $5, $End
+ jmp $RemainLoop
+
+ .align 4
+
+$Sub:
+#ifdef COMPLEX
+ addl INCX, INCX, INCX
+ addl INCY, INCY, INCY
+ and N, 7, $5
+#else
+ and N, 15, $5
+#endif
+ ble $4, $SubRemain
+ .align 4
+
+$SubMainLoop:
+#ifndef COMPLEX
+ LD $f10, 0(X)
+ SXADDQ INCX, X, X
+ LD $f11, 0(X)
+ SXADDQ INCX, X, X
+
+ LD $f12, 0(X)
+ SXADDQ INCX, X, X
+ LD $f13, 0(X)
+ SXADDQ INCX, X, X
+
+ LD $f14, 0(X)
+ SXADDQ INCX, X, X
+ LD $f15, 0(X)
+ SXADDQ INCX, X, X
+
+ LD $f16, 0(X)
+ SXADDQ INCX, X, X
+ LD $f17, 0(X)
+ SXADDQ INCX, X, X
+
+ LD $f18, 0(X)
+ SXADDQ INCX, X, X
+ LD $f19, 0(X)
+ SXADDQ INCX, X, X
+
+ LD $f20, 0(X)
+ SXADDQ INCX, X, X
+ LD $f21, 0(X)
+ SXADDQ INCX, X, X
+
+ LD $f22, 0(X)
+ SXADDQ INCX, X, X
+ LD $f23, 0(X)
+ SXADDQ INCX, X, X
+
+ LD $f24, 0(X)
+ SXADDQ INCX, X, X
+ LD $f25, 0(X)
+ SXADDQ INCX, X, X
+
+ ST $f10, 0(Y)
+ SXADDQ INCY, Y, Y
+ ST $f11, 0(Y)
+ SXADDQ INCY, Y, Y
+
+ ST $f12, 0(Y)
+ SXADDQ INCY, Y, Y
+ ST $f13, 0(Y)
+ SXADDQ INCY, Y, Y
+
+ ST $f14, 0(Y)
+ SXADDQ INCY, Y, Y
+ ST $f15, 0(Y)
+ SXADDQ INCY, Y, Y
+
+ ST $f16, 0(Y)
+ SXADDQ INCY, Y, Y
+ ST $f17, 0(Y)
+ SXADDQ INCY, Y, Y
+
+ ST $f18, 0(Y)
+ SXADDQ INCY, Y, Y
+ ST $f19, 0(Y)
+ SXADDQ INCY, Y, Y
+
+ ST $f20, 0(Y)
+ SXADDQ INCY, Y, Y
+ ST $f21, 0(Y)
+ SXADDQ INCY, Y, Y
+
+ ST $f22, 0(Y)
+ SXADDQ INCY, Y, Y
+ ST $f23, 0(Y)
+ SXADDQ INCY, Y, Y
+
+ ST $f24, 0(Y)
+ SXADDQ INCY, Y, Y
+ ST $f25, 0(Y)
+ SXADDQ INCY, Y, Y
+#else
+ LD $f10, 0(X)
+ LD $f11, SIZE(X)
+ SXADDQ INCX, X, X
+
+ LD $f12, 0(X)
+ LD $f13, SIZE(X)
+ SXADDQ INCX, X, X
+
+ LD $f14, 0(X)
+ LD $f15, SIZE(X)
+ SXADDQ INCX, X, X
+
+ LD $f16, 0(X)
+ LD $f17, SIZE(X)
+ SXADDQ INCX, X, X
+
+ LD $f18, 0(X)
+ LD $f19, SIZE(X)
+ SXADDQ INCX, X, X
+
+ LD $f20, 0(X)
+ LD $f21, SIZE(X)
+ SXADDQ INCX, X, X
+
+ LD $f22, 0(X)
+ LD $f23, SIZE(X)
+ SXADDQ INCX, X, X
+
+ LD $f24, 0(X)
+ LD $f25, SIZE(X)
+ SXADDQ INCX, X, X
+
+ ST $f10, 0(Y)
+ ST $f11, SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ ST $f12, 0(Y)
+ ST $f13, SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ ST $f14, 0(Y)
+ ST $f15, SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ ST $f16, 0(Y)
+ ST $f17, SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ ST $f18, 0(Y)
+ ST $f19, SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ ST $f20, 0(Y)
+ ST $f21, SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ ST $f22, 0(Y)
+ ST $f23, SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ ST $f24, 0(Y)
+ ST $f25, SIZE(Y)
+ SXADDQ INCY, Y, Y
+#endif
+ subl $4, 1, $4
+ bgt $4, $SubMainLoop
+ .align 4
+
+$SubRemain:
+ ble $5, $SubEnd
+ .align 4
+
+$SubRemainLoop:
+#ifndef COMPLEX
+ LD $f10, 0(X)
+ SXADDQ INCX, X, X
+ ST $f10, 0(Y)
+ SXADDQ INCY, Y, Y
+#else
+ LD $f10, 0(X)
+ LD $f11, SIZE(X)
+ SXADDQ INCX, X, X
+ ST $f10, 0(Y)
+ ST $f11, SIZE(Y)
+ SXADDQ INCY, Y, Y
+#endif
+ subl $5, 1, $5
+ bgt $5, $SubRemainLoop
+ .align 4
+
+$SubEnd:
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/cscal.S b/kernel/sw_64/cscal.S
new file mode 100644
index 0000000..bba3137
--- /dev/null
+++ b/kernel/sw_64/cscal.S
@@ -0,0 +1,217 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+ .set noat
+ .set noreorder
+
+#define ASSEMBLER
+
+#include "common.h"
+#include "version.h"
+
+ .globl NAME
+ .ent NAME
+
+NAME:
+#ifdef PROFILE
+ ldgp $gp, 0($27)
+ lda $28, _mcount
+ jsr $28, ($28), _mcount
+#endif
+
+#ifndef C_INTERFACE
+ ldl $16, 0($16) # n
+ mov $18, $20 # Store Address
+ ldl $19, 0($19) # incx
+ nop
+
+ LD $f1, 0($17) # alpha
+#else
+ mov $18, $20 # Store Address
+ fmov $f17, $f1 # alpha
+#endif
+
+#ifndef PROFILE
+ .prologue 0
+#else
+ .prologue 1
+#endif
+
+	sra	$16, 1, $21	# unroll: 2 complex (4 real) elements per iteration
+ ble $16, $End
+
+ lda $23, -1($19)
+ ble $19, $End
+
+ bgt $23, $INC_NOT_1
+ .align 4
+
+ ble $21, $Sub
+ lda $21, -1($21)
+ LD $f10, 0*SIZE($18)
+ LD $f11, 1*SIZE($18)
+
+ LD $f12, 2*SIZE($18)
+ LD $f13, 3*SIZE($18)
+ lda $18, 4*SIZE($18)
+ ble $21, $MainRemain
+ .align 4
+
+$MainLoop:
+ MUL $f10, $f1, $f20
+ LD $f10, 0*SIZE($18)
+ MUL $f11, $f1, $f21
+ LD $f11, 1*SIZE($18)
+
+ MUL $f12, $f1, $f22
+ LD $f12, 2*SIZE($18)
+ MUL $f13, $f1, $f23
+ LD $f13, 3*SIZE($18)
+
+ lda $18, 4*SIZE($18)
+ lda $21, -1($21)
+
+ ST $f20, 0*SIZE($20)
+ ST $f21, 1*SIZE($20)
+ ST $f22, 2*SIZE($20)
+ ST $f23, 3*SIZE($20)
+ lda $20, 4*SIZE($20)
+
+ bgt $21, $MainLoop
+ .align 4
+
+$MainRemain:
+ MUL $f10, $f1, $f20
+ MUL $f11, $f1, $f21
+ MUL $f12, $f1, $f22
+ MUL $f13, $f1, $f23
+
+ ST $f20, 0*SIZE($20)
+ ST $f21, 1*SIZE($20)
+ ST $f22, 2*SIZE($20)
+ ST $f23, 3*SIZE($20)
+ lda $20, 4*SIZE($20)
+ .align 4
+
+$Sub:
+ blbc $16, $End
+ LD $f10, 0*SIZE($18)
+ LD $f11, 1*SIZE($18)
+ MUL $f10, $f1, $f20
+ MUL $f11, $f1, $f21
+ ST $f20, 0*SIZE($20)
+ ST $f21, 1*SIZE($20)
+ .align 4
+
+$End:
+ ret
+ .align 4
+
+$INC_NOT_1:
+ addl $19, $19, $19
+ ble $21, $INC_Sub
+ lda $21, -1($21)
+
+ LD $f10, 0*SIZE($18)
+ LD $f11, 1*SIZE($18)
+ SXADDQ $19, $18, $18
+
+ LD $f12, 0*SIZE($18)
+ LD $f13, 1*SIZE($18)
+ SXADDQ $19, $18, $18
+ ble $21, $INC_MainRemain
+ .align 4
+
+$INC_MainLoop:
+ MUL $f10, $f1, $f20
+ LD $f10, 0*SIZE($18)
+ MUL $f11, $f1, $f21
+ LD $f11, 1*SIZE($18)
+
+ SXADDQ $19, $18, $18
+
+ MUL $f12, $f1, $f22
+ LD $f12, 0*SIZE($18)
+ MUL $f13, $f1, $f23
+ LD $f13, 1*SIZE($18)
+
+ SXADDQ $19, $18, $18
+
+ ST $f20, 0*SIZE($20)
+ lda $21, -1($21)
+ ST $f21, 1*SIZE($20)
+ SXADDQ $19, $20, $20
+
+ ST $f22, 0*SIZE($20)
+ ST $f23, 1*SIZE($20)
+ SXADDQ $19, $20, $20
+ unop
+ bgt $21, $INC_MainLoop
+ .align 4
+
+$INC_MainRemain:
+ MUL $f10, $f1, $f20
+ MUL $f11, $f1, $f21
+ MUL $f12, $f1, $f22
+ MUL $f13, $f1, $f23
+
+ ST $f20, 0*SIZE($20)
+ ST $f21, 1*SIZE($20)
+ SXADDQ $19, $20, $20
+
+ ST $f22, 0*SIZE($20)
+ ST $f23, 1*SIZE($20)
+ SXADDQ $19, $20, $20
+ .align 4
+
+$INC_Sub:
+ blbc $16, $INC_End
+
+ LD $f10, 0*SIZE($18)
+ LD $f11, 1*SIZE($18)
+ MUL $f10, $f1, $f20
+ MUL $f11, $f1, $f21
+
+ ST $f20, 0*SIZE($20)
+ ST $f21, 1*SIZE($20)
+ .align 4
+
+$INC_End:
+ ret
+ .end NAME
+ .ident VERSION
diff --git a/kernel/sw_64/dnrm2.S b/kernel/sw_64/dnrm2.S
new file mode 100644
index 0000000..89cf787
--- /dev/null
+++ b/kernel/sw_64/dnrm2.S
@@ -0,0 +1,490 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+#include "version.h"
+
+#define PREFETCH_SIZE 80
+
+#define N $16
+#define X $17
+#define INCX $18
+#define XX $19
+
+#define I $0
+
+#define a0 $f0
+#define a1 $f1
+#define a2 $f10
+#define a3 $f11
+#define t0 $f12
+#define t1 $f13
+#define t2 $f14
+#define t3 $f15
+
+#define x0 $f16
+#define x1 $f17
+#define x2 $f18
+#define x3 $f19
+#define x4 $f20
+#define x5 $f21
+#define x6 $f22
+#define x7 $f23
+
+ PROLOGUE
+
+#if defined(EV4) || defined(EV5)
+ .frame $30,16,$26,0
+ .mask 0x4000000,-16
+ ldih $29, 0($27) !gpdisp!1
+ ldi $29, 0($29) !gpdisp!1
+
+ ldi $sp, -16($sp)
+ ldl $27, sqrt($29) !literal!2
+ stl $26, 0($sp)
+
+ PROFCODE
+ .prologue 1
+#else
+ PROFCODE
+#endif
+
+ fclr a0
+ SXADDQ INCX, 0, INCX
+ fclr a1
+ ble N, $L999
+
+ fclr a2
+ cmpeq INCX, SIZE, $0
+ fclr a3
+ beq $0, $L20
+
+ fclr t0
+ sra N, 4, I
+ fclr t1
+ ble I, $L15
+
+ fclr t2
+ LD x0, 0 * SIZE(X)
+ fclr t3
+ LD x1, 1 * SIZE(X)
+
+ LD x2, 2 * SIZE(X)
+ LD x3, 3 * SIZE(X)
+ LD x4, 4 * SIZE(X)
+ LD x5, 5 * SIZE(X)
+ LD x6, 6 * SIZE(X)
+ LD x7, 7 * SIZE(X)
+
+ ldi I, -1(I)
+ ble I, $L12
+ .align 4
+
+$L11:
+ faddd a0, t0,$f24
+ fmov $f24,a0
+ fillcs (PREFETCH_SIZE) * SIZE(X)
+ fmuld x0, x0, t0
+ LD x0, 8 * SIZE(X)
+
+ faddd a1, t1,$f24
+ fmov $f24,a1
+ mov X, XX
+ fmuld x1, x1, t1
+ LD x1, 9 * SIZE(X)
+
+ faddd a2, t2, $f24
+ fmov $f24,a2
+ #unop
+ fmuld x2, x2, t2
+ LD x2, 10 * SIZE(X)
+
+ faddd a3, t3, $f24
+ fmov $f24,a3
+ #unop
+ fmuld x3, x3, t3
+ LD x3, 11 * SIZE(X)
+
+ faddd a0, t0, $f24
+ fmov $f24,a0
+ #unop
+ fmuld x4, x4, t0
+ LD x4, 12 * SIZE(X)
+
+ faddd a1, t1, $f24
+ fmov $f24,a1
+ #unop
+ fmuld x5, x5, t1
+ LD x5, 13 * SIZE(X)
+
+ faddd a2, t2, $f24
+ fmov $f24,a2
+ #unop
+ fmuld x6, x6, t2
+ LD x6, 14 * SIZE(X)
+
+ faddd a3, t3, $f24
+ fmov $f24,a3
+ #unop
+ fmuld x7, x7, t3
+ LD x7, 15 * SIZE(X)
+
+ faddd a0, t0, $f24
+ fmov $f24,a0
+ #unop
+ fmuld x0, x0, t0
+ LD x0, 16 * SIZE(X)
+
+ faddd a1, t1, $f24
+ fmov $f24,a1
+ ldi X, 16 * SIZE(X)
+ fmuld x1, x1, t1
+ LD x1, 17 * SIZE(XX)
+
+ faddd a2, t2, $f24
+ fmov $f24,a2
+ #unop
+ fmuld x2, x2, t2
+ LD x2, 18 * SIZE(XX)
+
+ faddd a3, t3, $f24
+ fmov $f24,a3
+ #unop
+ fmuld x3, x3, t3
+ LD x3, 19 * SIZE(XX)
+
+ faddd a0, t0, $f24
+ fmov $f24,a0
+ unop
+ fmuld x4, x4, t0
+ LD x4, 20 * SIZE(XX)
+
+ faddd a1, t1, $f24
+ fmov $f24,a1
+ ldi I, -1(I)
+ fmuld x5, x5, t1
+ LD x5, 21 * SIZE(XX)
+
+ faddd a2, t2, $f24
+ fmov $f24,a2
+ #unop
+ fmuld x6, x6, t2
+ LD x6, 22 * SIZE(XX)
+
+ faddd a3, t3, $f24
+ fmov $f24,a3
+ fmuld x7, x7, t3
+ LD x7, 23 * SIZE(XX)
+ bgt I, $L11
+ .align 4
+
+$L12:
+ faddd a0, t0,$f24
+ fmov $f24,a0
+ mov X, XX
+ fmuld x0, x0, t0
+ LD x0, 8 * SIZE(X)
+
+ faddd a1, t1, $f24
+ fmov $f24,a1
+ #unop
+ fmuld x1, x1, t1
+ LD x1, 9 * SIZE(X)
+
+ faddd a2, t2, $f24
+ fmov $f24,a2
+ #unop
+ fmuld x2, x2, t2
+ LD x2, 10 * SIZE(X)
+
+ faddd a3, t3, $f24
+ fmov $f24,a3
+ #unop
+ fmuld x3, x3, t3
+ LD x3, 11 * SIZE(X)
+
+ faddd a0, t0, $f24
+ fmov $f24,a0
+ #unop
+ fmuld x4, x4, t0
+ LD x4, 12 * SIZE(XX)
+
+ faddd a1, t1,$f24
+ fmov $f24,a1
+ unop
+ fmuld x5, x5, t1
+ LD x5, 13 * SIZE(XX)
+
+ faddd a2, t2,$f24
+ fmov $f24,a2
+ unop
+ fmuld x6, x6, t2
+ LD x6, 14 * SIZE(XX)
+
+ faddd a3, t3, $f24
+ fmov $f24,a3
+ ldi X, 16 * SIZE(X)
+ fmuld x7, x7, t3
+ LD x7, 15 * SIZE(XX)
+
+ faddd a0, t0, $f24
+ fmov $f24,a0
+ fmuld x0, x0, t0
+ faddd a1, t1, $f24
+ fmov $f24,a1
+ fmuld x1, x1, t1
+
+ faddd a2, t2, $f24
+ fmov $f24,a2
+ fmuld x2, x2, t2
+ faddd a3, t3, $f24
+ fmov $f24,a3
+ fmuld x3, x3, t3
+
+ faddd a0, t0, $f24
+ fmov $f24,a0
+ fmuld x4, x4, t0
+ faddd a1, t1, $f24
+ fmov $f24,a1
+ fmuld x5, x5, t1
+
+ faddd a2, t2, $f24
+ fmov $f24,a2
+ fmuld x6, x6, t2
+ faddd a3, t3, $f24
+ fmov $f24,a3
+ fmuld x7, x7, t3
+
+ faddd a1, t1, $f24
+ fmov $f24,a1
+ faddd a2, t2, $f24
+ fmov $f24,a2
+ faddd a3, t3, $f24
+ fmov $f24,a3
+ .align 4
+
+$L15:
+ and N, 15, I
+ ble I, $L998
+ .align 4
+
+$L16:
+ LD x0, 0 * SIZE(X)
+ ldi X, 1 * SIZE(X)
+
+ faddd a0, t0, $f24
+ fmov $f24,a0
+ fmuld x0, x0, t0
+
+ ldi I, -1(I)
+ bgt I, $L16
+ bsr $31, $L998
+ .align 4
+
+$L20:
+ fclr t0
+ sra N, 3, I
+ fclr t1
+ ble I, $L25
+
+ fclr t2
+ fclr t3
+
+ LD x0, 0 * SIZE(X)
+ addl X, INCX, X
+ LD x1, 0 * SIZE(X)
+ addl X, INCX, X
+ LD x2, 0 * SIZE(X)
+ addl X, INCX, X
+ LD x3, 0 * SIZE(X)
+ addl X, INCX, X
+
+ LD x4, 0 * SIZE(X)
+ addl X, INCX, X
+ LD x5, 0 * SIZE(X)
+ addl X, INCX, X
+ LD x6, 0 * SIZE(X)
+ addl X, INCX, X
+
+ ldi I, -1(I)
+ ble I, $L22
+ .align 4
+
+$L21:
+ faddd a0, t0,$f24
+ fmov $f24,a0
+ LD x7, 0 * SIZE(X)
+ fmuld x0, x0, t0
+ addl X, INCX, X
+
+ faddd a1, t1, $f24
+ fmov $f24,a1
+ LD x0, 0 * SIZE(X)
+ fmuld x1, x1, t1
+ addl X, INCX, X
+
+ faddd a2, t2, $f24
+ fmov $f24,a2
+ LD x1, 0 * SIZE(X)
+ fmuld x2, x2, t2
+ addl X, INCX, X
+
+ faddd a3, t3, $f24
+ fmov $f24,a3
+ LD x2, 0 * SIZE(X)
+ fmuld x3, x3, t3
+ addl X, INCX, X
+
+ faddd a0, t0, $f24
+ fmov $f24,a0
+ LD x3, 0 * SIZE(X)
+ fmuld x4, x4, t0
+ addl X, INCX, X
+
+ faddd a1, t1, $f24
+ fmov $f24,a1
+ LD x4, 0 * SIZE(X)
+ fmuld x5, x5, t1
+ addl X, INCX, X
+
+ faddd a2, t2, $f24
+ fmov $f24,a2
+ LD x5, 0 * SIZE(X)
+ fmuld x6, x6, t2
+ addl X, INCX, X
+
+ faddd a3, t3, $f24
+ fmov $f24,a3
+ LD x6, 0 * SIZE(X)
+ fmuld x7, x7, t3
+ addl X, INCX, X
+
+ ldi I, -1(I)
+ bgt I, $L21
+ .align 4
+
+$L22:
+ faddd a0, t0, $f24
+ fmov $f24,a0
+ LD x7, 0 * SIZE(X)
+ fmuld x0, x0, t0
+ addl X, INCX, X
+
+ faddd a1, t1, $f24
+ fmov $f24,a1
+ unop
+ fmuld x1, x1, t1
+ unop
+
+ faddd a2, t2, $f24
+ fmov $f24,a2
+ fmuld x2, x2, t2
+ faddd a3, t3, $f24
+ fmov $f24,a3
+ fmuld x3, x3, t3
+
+ faddd a0, t0, $f24
+ fmov $f24,a0
+ fmuld x4, x4, t0
+ faddd a1, t1, $f24
+ fmov $f24,a1
+ fmuld x5, x5, t1
+
+ faddd a2, t2, $f24
+ fmov $f24,a2
+ fmuld x6, x6, t2
+ faddd a3, t3, $f24
+ fmov $f24,a3
+ fmuld x7, x7, t3
+
+ faddd a1, t1, $f24
+ fmov $f24,a1
+ faddd a2, t2, $f24
+ fmov $f24,a2
+ faddd a3, t3, $f24
+ fmov $f24,a3
+ .align 4
+
+$L25:
+ and N, 7, I
+ ble I, $L998
+ .align 4
+
+$L26:
+ LD x0, 0 * SIZE(X)
+ addl X, INCX, X
+
+ faddd a0, t0,$f24
+ fmov $f24,a0
+ fmuld x0, x0, t0
+
+ ldi I, -1(I)
+ bgt I, $L26
+ .align 4
+
+
+$L998:
+ faddd a0, t0, $f24
+ fmov $f24,a0
+
+ faddd a0, a1, $f24
+	fmov $f24,a0
+ faddd a2, a3, $f24
+ fmov $f24,a2
+
+#if defined(EV4) || defined(EV5)
+ faddd a0, a2, $f16
+ jsr $26, ($27), sqrt !lituse_jsr!2
+
+ ldih $29, 0($26) !gpdisp!3
+ ldi $29, 0($29) !gpdisp!3
+#else
+ faddd a0, a2, $f24
+ fsqrtd $f24, a0
+#endif
+ .align 4
+
+$L999:
+#if defined(EV4) || defined(EV5)
+ ldl $26, 0($sp)
+ ldi $sp, 16($sp)
+#endif
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/dnrm2.S.bak b/kernel/sw_64/dnrm2.S.bak
new file mode 100644
index 0000000..753c90b
--- /dev/null
+++ b/kernel/sw_64/dnrm2.S.bak
@@ -0,0 +1,431 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+#include "version.h"
+
+#define PREFETCH_SIZE 80
+
+#define N $16
+#define X $17
+#define INCX $18
+#define XX $19
+
+#define I $0
+
+#define a0 $f0
+#define a1 $f1
+#define a2 $f10
+#define a3 $f11
+#define t0 $f12
+#define t1 $f13
+#define t2 $f14
+#define t3 $f15
+
+#define x0 $f16
+#define x1 $f17
+#define x2 $f18
+#define x3 $f19
+#define x4 $f20
+#define x5 $f21
+#define x6 $f22
+#define x7 $f23
+
+ PROLOGUE
+
+#if defined(EV4) || defined(EV5)
+ .frame $30,16,$26,0
+ .mask 0x4000000,-16
+ ldih $29, 0($27) !gpdisp!1
+ ldi $29, 0($29) !gpdisp!1
+
+ ldi $sp, -16($sp)
+ ldl $27, sqrt($29) !literal!2
+ stq $26, 0($sp)
+
+ PROFCODE
+ .prologue 1
+#else
+ PROFCODE
+#endif
+
+ fclr a0
+ SXADDQ INCX, 0, INCX
+ fclr a1
+ ble N, $L999
+
+ fclr a2
+ cmpeq INCX, SIZE, $0
+ fclr a3
+ beq $0, $L20
+
+ fclr t0
+ sra N, 4, I
+ fclr t1
+ ble I, $L15
+
+ fclr t2
+ LD x0, 0 * SIZE(X)
+ fclr t3
+ LD x1, 1 * SIZE(X)
+
+ LD x2, 2 * SIZE(X)
+ LD x3, 3 * SIZE(X)
+ LD x4, 4 * SIZE(X)
+ LD x5, 5 * SIZE(X)
+ LD x6, 6 * SIZE(X)
+ LD x7, 7 * SIZE(X)
+
+ ldi I, -1(I)
+ ble I, $L12
+ .align 4
+
+$L11:
+ faddd a0, t0, a0
+ fillcs (PREFETCH_SIZE) * SIZE(X)
+ fmuld x0, x0, t0
+ LD x0, 8 * SIZE(X)
+
+ faddd a1, t1, a1
+ mov X, XX
+ fmuld x1, x1, t1
+ LD x1, 9 * SIZE(X)
+
+ faddd a2, t2, a2
+ unop
+ fmuld x2, x2, t2
+ LD x2, 10 * SIZE(X)
+
+ faddd a3, t3, a3
+ unop
+ fmuld x3, x3, t3
+ LD x3, 11 * SIZE(X)
+
+ faddd a0, t0, a0
+ unop
+ fmuld x4, x4, t0
+ LD x4, 12 * SIZE(X)
+
+ faddd a1, t1, a1
+ unop
+ fmuld x5, x5, t1
+ LD x5, 13 * SIZE(X)
+
+ faddd a2, t2, a2
+ unop
+ fmuld x6, x6, t2
+ LD x6, 14 * SIZE(X)
+
+ faddd a3, t3, a3
+ unop
+ fmuld x7, x7, t3
+ LD x7, 15 * SIZE(X)
+
+ faddd a0, t0, a0
+ unop
+ fmuld x0, x0, t0
+ LD x0, 16 * SIZE(X)
+
+ faddd a1, t1, a1
+ ldi X, 16 * SIZE(X)
+ fmuld x1, x1, t1
+ LD x1, 17 * SIZE(XX)
+
+ faddd a2, t2, a2
+ unop
+ fmuld x2, x2, t2
+ LD x2, 18 * SIZE(XX)
+
+ faddd a3, t3, a3
+ unop
+ fmuld x3, x3, t3
+ LD x3, 19 * SIZE(XX)
+
+ faddd a0, t0, a0
+ unop
+ fmuld x4, x4, t0
+ LD x4, 20 * SIZE(XX)
+
+ faddd a1, t1, a1
+ ldi I, -1(I)
+ fmuld x5, x5, t1
+ LD x5, 21 * SIZE(XX)
+
+ faddd a2, t2, a2
+ unop
+ fmuld x6, x6, t2
+ LD x6, 22 * SIZE(XX)
+
+ faddd a3, t3, a3
+ fmuld x7, x7, t3
+ LD x7, 23 * SIZE(XX)
+ bgt I, $L11
+ .align 4
+
+$L12:
+ faddd a0, t0, a0
+ mov X, XX
+ fmuld x0, x0, t0
+ LD x0, 8 * SIZE(X)
+
+ faddd a1, t1, a1
+ unop
+ fmuld x1, x1, t1
+ LD x1, 9 * SIZE(X)
+
+ faddd a2, t2, a2
+ unop
+ fmuld x2, x2, t2
+ LD x2, 10 * SIZE(X)
+
+ faddd a3, t3, a3
+ unop
+ fmuld x3, x3, t3
+ LD x3, 11 * SIZE(X)
+
+ faddd a0, t0, a0
+ unop
+ fmuld x4, x4, t0
+ LD x4, 12 * SIZE(XX)
+
+ faddd a1, t1, a1
+ unop
+ fmuld x5, x5, t1
+ LD x5, 13 * SIZE(XX)
+
+ faddd a2, t2, a2
+ unop
+ fmuld x6, x6, t2
+ LD x6, 14 * SIZE(XX)
+
+ faddd a3, t3, a3
+ ldi X, 16 * SIZE(X)
+ fmuld x7, x7, t3
+ LD x7, 15 * SIZE(XX)
+
+ faddd a0, t0, a0
+ fmuld x0, x0, t0
+ faddd a1, t1, a1
+ fmuld x1, x1, t1
+
+ faddd a2, t2, a2
+ fmuld x2, x2, t2
+ faddd a3, t3, a3
+ fmuld x3, x3, t3
+
+ faddd a0, t0, a0
+ fmuld x4, x4, t0
+ faddd a1, t1, a1
+ fmuld x5, x5, t1
+
+ faddd a2, t2, a2
+ fmuld x6, x6, t2
+ faddd a3, t3, a3
+ fmuld x7, x7, t3
+
+ faddd a1, t1, a1
+ faddd a2, t2, a2
+ faddd a3, t3, a3
+ .align 4
+
+$L15:
+ and N, 15, I
+ ble I, $L998
+ .align 4
+
+$L16:
+ LD x0, 0 * SIZE(X)
+ ldi X, 1 * SIZE(X)
+
+ faddd a0, t0, a0
+ fmuld x0, x0, t0
+
+ ldi I, -1(I)
+ bgt I, $L16
+ bsr $31, $L998
+ .align 4
+
+$L20:
+ fclr t0
+ sra N, 3, I
+ fclr t1
+ ble I, $L25
+
+ fclr t2
+ fclr t3
+
+ LD x0, 0 * SIZE(X)
+ addl X, INCX, X
+ LD x1, 0 * SIZE(X)
+ addl X, INCX, X
+ LD x2, 0 * SIZE(X)
+ addl X, INCX, X
+ LD x3, 0 * SIZE(X)
+ addl X, INCX, X
+
+ LD x4, 0 * SIZE(X)
+ addl X, INCX, X
+ LD x5, 0 * SIZE(X)
+ addl X, INCX, X
+ LD x6, 0 * SIZE(X)
+ addl X, INCX, X
+
+ ldi I, -1(I)
+ ble I, $L22
+ .align 4
+
+$L21:
+ faddd a0, t0, a0
+ LD x7, 0 * SIZE(X)
+ fmuld x0, x0, t0
+ addl X, INCX, X
+
+ faddd a1, t1, a1
+ LD x0, 0 * SIZE(X)
+ fmuld x1, x1, t1
+ addl X, INCX, X
+
+ faddd a2, t2, a2
+ LD x1, 0 * SIZE(X)
+ fmuld x2, x2, t2
+ addl X, INCX, X
+
+ faddd a3, t3, a3
+ LD x2, 0 * SIZE(X)
+ fmuld x3, x3, t3
+ addl X, INCX, X
+
+ faddd a0, t0, a0
+ LD x3, 0 * SIZE(X)
+ fmuld x4, x4, t0
+ addl X, INCX, X
+
+ faddd a1, t1, a1
+ LD x4, 0 * SIZE(X)
+ fmuld x5, x5, t1
+ addl X, INCX, X
+
+ faddd a2, t2, a2
+ LD x5, 0 * SIZE(X)
+ fmuld x6, x6, t2
+ addl X, INCX, X
+
+ faddd a3, t3, a3
+ LD x6, 0 * SIZE(X)
+ fmuld x7, x7, t3
+ addl X, INCX, X
+
+ ldi I, -1(I)
+ bgt I, $L21
+ .align 4
+
+$L22:
+ faddd a0, t0, a0
+ LD x7, 0 * SIZE(X)
+ fmuld x0, x0, t0
+ addl X, INCX, X
+
+ faddd a1, t1, a1
+ unop
+ fmuld x1, x1, t1
+ unop
+
+ faddd a2, t2, a2
+ fmuld x2, x2, t2
+ faddd a3, t3, a3
+ fmuld x3, x3, t3
+
+ faddd a0, t0, a0
+ fmuld x4, x4, t0
+ faddd a1, t1, a1
+ fmuld x5, x5, t1
+
+ faddd a2, t2, a2
+ fmuld x6, x6, t2
+ faddd a3, t3, a3
+ fmuld x7, x7, t3
+
+ faddd a1, t1, a1
+ faddd a2, t2, a2
+ faddd a3, t3, a3
+ .align 4
+
+$L25:
+ and N, 7, I
+ ble I, $L998
+ .align 4
+
+$L26:
+ LD x0, 0 * SIZE(X)
+ addl X, INCX, X
+
+ faddd a0, t0, a0
+ fmuld x0, x0, t0
+
+ ldi I, -1(I)
+ bgt I, $L26
+ .align 4
+
+
+$L998:
+ faddd a0, t0, a0
+
+ faddd a0, a1, a0
+ faddd a2, a3, a2
+
+#if defined(EV4) || defined(EV5)
+ faddd a0, a2, $f16
+ jsr $26, ($27), sqrt !lituse_jsr!2
+
+ ldih $29, 0($26) !gpdisp!3
+ ldi $29, 0($29) !gpdisp!3
+#else
+ faddd a0, a2, a0
+ fsqrtd a0, a0
+#endif
+ .align 4
+
+$L999:
+#if defined(EV4) || defined(EV5)
+ ldl $26, 0($sp)
+ ldi $sp, 16($sp)
+#endif
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/dot.S b/kernel/sw_64/dot.S
new file mode 100644
index 0000000..513eada
--- /dev/null
+++ b/kernel/sw_64/dot.S
@@ -0,0 +1,607 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define PREFETCHSIZE 88
+
+#define N $16
+#define X $17
+#define INCX $18
+#define Y $19
+#define INCY $20
+
+#define I $5
+
+#define s0 $f0
+#define s1 $f30
+#define s2 $f1
+#define s3 $f2
+
+#define a0 $f10
+#define a1 $f11
+#define a2 $f12
+#define a3 $f13
+#define a4 $f14
+#define a5 $f15
+#define a6 $f16
+#define a7 $f17
+
+#define b0 $f18
+#define b1 $f19
+#define b2 $f20
+#define b3 $f21
+#define b4 $f22
+#define b5 $f23
+#define b6 $f24
+#define b7 $f25
+
+#define t0 $f26
+#define t1 $f27
+#define t2 $f28
+#define t3 $f29
+
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, 16, $26, 0
+
+ ldi $sp, -16($sp)
+ fclr s0
+ fstd $f2, 0($sp)
+#ifndef ZYX20220111
+ fstd $f3, 8($sp)
+#endif
+ fclr s1
+
+ fclr s2
+ nop
+ fclr s3
+ ble N, $L999
+
+ fclr t0
+ cmpeq INCX, 1, $21
+ fclr t1
+ cmpeq INCY, 1, $22
+ fclr t2
+ and $21, $22, $22
+ fclr t3
+ beq $22, $L20
+
+#ifndef DOUBLE
+ srl N, 4, I
+ ble I, $L15
+
+ LD a0, 0 * SIZE(X)
+ LD a1, 1 * SIZE(X)
+ LD b0, 0 * SIZE(Y)
+ LD b1, 1 * SIZE(Y)
+
+ LD a2, 2 * SIZE(X)
+ LD a3, 3 * SIZE(X)
+ LD b2, 2 * SIZE(Y)
+ LD b3, 3 * SIZE(Y)
+
+ LD a4, 4 * SIZE(X)
+ LD a5, 5 * SIZE(X)
+ LD b4, 4 * SIZE(Y)
+ LD b5, 5 * SIZE(Y)
+
+ LD a6, 6 * SIZE(X)
+ LD a7, 7 * SIZE(X)
+ addl X, 16 * SIZE, X
+ subl I, 1, I
+
+ addl Y, 16 * SIZE, Y
+ ble I, $L13
+ .align 4
+
+$L12:
+ fillcs PREFETCHSIZE * 2 * SIZE(X)
+ subl I, 1, I
+ fillcs PREFETCHSIZE * 2 * SIZE(Y)
+ addl X, 16 * SIZE, X
+
+ ADD s0, t0, $f3
+ fmov $f3, s0
+ LD b6, -10 * SIZE(Y)
+ MUL a0, b0, t0
+ LD b7, -9 * SIZE(Y)
+
+ ADD s1, t1, $f3
+ fmov $f3, s1
+ LD a0, -24 * SIZE(X)
+ MUL a1, b1, t1
+ LD a1, -23 * SIZE(X)
+
+ ADD s2, t2, $f3
+ fmov $f3, s2
+ LD b0, -8 * SIZE(Y)
+ MUL a2, b2, $f3
+ fmov $f3, t2
+ LD b1, -7 * SIZE(Y)
+
+ ADD s3, t3, $f3
+ fmov $f3, s3
+ LD a2, -22 * SIZE(X)
+ MUL a3, b3, t3
+ LD a3, -21 * SIZE(X)
+
+ ADD s0, t0, $f3
+ fmov $f3, s0
+ LD b2, -6 * SIZE(Y)
+ MUL a4, b4, t0
+ LD b3, -5 * SIZE(Y)
+
+ ADD s1, t1, $f3
+ fmov $f3, s1
+ LD a4, -20 * SIZE(X)
+ MUL a5, b5, t1
+ LD a5, -19 * SIZE(X)
+
+ ADD s2, t2, $f3
+ fmov $f3, s2
+ LD b4, -4 * SIZE(Y)
+ MUL a6, b6, t2
+ LD b5, -3 * SIZE(Y)
+
+ ADD s3, t3, $f3
+ fmov $f3, s3
+ LD a6, -18 * SIZE(X)
+ MUL a7, b7, t3
+ LD a7, -17 * SIZE(X)
+
+ ADD s0, t0, $f3
+ fmov $f3, s0
+ LD b6, -2 * SIZE(Y)
+ MUL a0, b0, t0
+ LD b7, -1 * SIZE(Y)
+
+ ADD s1, t1, $f3
+ fmov $f3, s1
+ LD a0, -16 * SIZE(X)
+ MUL a1, b1, t1
+ LD a1, -15 * SIZE(X)
+
+ ADD s2, t2, $f3
+ fmov $f3, s2
+ LD b0, 0 * SIZE(Y)
+ MUL a2, b2, t2
+ LD b1, 1 * SIZE(Y)
+
+ ADD s3, t3, $f3
+ fmov $f3, s3
+ LD a2, -14 * SIZE(X)
+ MUL a3, b3, t3
+ LD a3, -13 * SIZE(X)
+
+ ADD s0, t0, $f3
+ fmov $f3, s0
+ LD b2, 2 * SIZE(Y)
+ MUL a4, b4, t0
+ LD b3, 3 * SIZE(Y)
+
+ ADD s1, t1, $f3
+ fmov $f3, s1
+ LD a4, -12 * SIZE(X)
+ MUL a5, b5, t1
+ LD a5, -11 * SIZE(X)
+
+ ADD s2, t2, $f3
+ fmov $f3, s2
+ LD b4, 4 * SIZE(Y)
+ MUL a6, b6, t2
+ LD b5, 5 * SIZE(Y)
+
+ ADD s3, t3, $f3
+ fmov $f3, s3
+ LD a6, -10 * SIZE(X)
+ MUL a7, b7, t3
+ LD a7, -9 * SIZE(X)
+
+ addl Y, 16 * SIZE, Y
+ bgt I, $L12
+ nop
+ fnop
+ .align 4
+
+$L13:
+ ADD s0, t0, $f3
+ fmov $f3, s0
+	LD b6, -10 * SIZE(Y)
+ MUL a0, b0, t0
+ LD b7, -9 * SIZE(Y)
+
+ ADD s1, t1, $f3
+ fmov $f3, s1
+ LD a0, -8 * SIZE(X)
+ MUL a1, b1, t1
+ LD a1, -7 * SIZE(X)
+
+ ADD s2, t2, $f3
+ fmov $f3, s2
+ LD b0, -8 * SIZE(Y)
+ MUL a2, b2, t2
+ LD b1, -7 * SIZE(Y)
+
+ ADD s3, t3, $f3
+ fmov $f3, s3
+ LD a2, -6 * SIZE(X)
+ MUL a3, b3, t3
+ LD a3, -5 * SIZE(X)
+
+ ADD s0, t0, $f3
+ fmov $f3, s0
+ LD b2, -6 * SIZE(Y)
+ MUL a4, b4, t0
+ LD b3, -5 * SIZE(Y)
+
+ ADD s1, t1, $f3
+ fmov $f3, s1
+ LD a4, -4 * SIZE(X)
+ MUL a5, b5, t1
+ LD a5, -3 * SIZE(X)
+
+ ADD s2, t2, $f3
+ fmov $f3, s2
+ LD b4, -4 * SIZE(Y)
+ MUL a6, b6, t2
+ LD b5, -3 * SIZE(Y)
+
+ ADD s3, t3, $f3
+ fmov $f3, s3
+ LD a6, -2 * SIZE(X)
+ MUL a7, b7, t3
+ LD a7, -1 * SIZE(X)
+
+ ADD s0, t0, $f3
+ fmov $f3, s0
+ LD b6, -2 * SIZE(Y)
+ MUL a0, b0, t0
+ LD b7, -1 * SIZE(Y)
+ ADD s1, t1, $f3
+ fmov $f3, s1
+ MUL a1, b1, t1
+
+ ADD s2, t2, $f3
+ fmov $f3, s2
+ MUL a2, b2, t2
+ ADD s3, t3, $f3
+ fmov $f3, s3
+ MUL a3, b3, t3
+
+ ADD s0, t0, $f3
+ fmov $f3, s0
+ MUL a4, b4, t0
+ ADD s1, t1, $f3
+ fmov $f3, s1
+ MUL a5, b5, t1
+ ADD s2, t2, $f3
+ fmov $f3, s2
+ MUL a6, b6, t2
+ ADD s3, t3, $f3
+ fmov $f3, s3
+ MUL a7, b7, t3
+ .align 4
+
+$L15:
+ ADD s0, t0, $f3
+ fmov $f3, s0
+ and N, 15, I
+ ADD s1, t1, $f3
+ fmov $f3, s1
+ ble I, $L18
+ .align 4
+
+#else
+
+ srl N, 3, I
+ ble I, $L15
+
+ LD a0, 0 * SIZE(X)
+ LD a1, 1 * SIZE(X)
+ LD b0, 0 * SIZE(Y)
+ LD b1, 1 * SIZE(Y)
+
+ LD a2, 2 * SIZE(X)
+ LD a3, 3 * SIZE(X)
+ LD b2, 2 * SIZE(Y)
+ LD b3, 3 * SIZE(Y)
+
+ LD a4, 4 * SIZE(X)
+ LD a5, 5 * SIZE(X)
+ LD b4, 4 * SIZE(Y)
+ LD b5, 5 * SIZE(Y)
+
+ LD a6, 6 * SIZE(X)
+ LD a7, 7 * SIZE(X)
+ addl X, 8 * SIZE, X
+ subl I, 1, I
+
+ addl Y, 8 * SIZE, Y
+ ble I, $L13
+ .align 4
+
+$L12:
+ fillcs PREFETCHSIZE * SIZE(X)
+ subl I, 1, I
+ fillcs PREFETCHSIZE * SIZE(Y)
+ addl X, 8 * SIZE, X
+
+ ADD s0, t0, $f3
+ fmov $f3, s0
+ LD b6, -2 * SIZE(Y)
+ MUL a0, b0, t0
+ LD b7, -1 * SIZE(Y)
+
+ ADD s1, t1, $f3
+ fmov $f3, s1
+ LD a0, -8 * SIZE(X)
+ MUL a1, b1, t1
+ LD a1, -7 * SIZE(X)
+
+ ADD s2, t2, $f3
+ fmov $f3, s2
+ LD b0, 0 * SIZE(Y)
+ MUL a2, b2, t2
+ LD b1, 1 * SIZE(Y)
+
+ ADD s3, t3, $f3
+ fmov $f3, s3
+ LD a2, -6 * SIZE(X)
+ MUL a3, b3, t3
+ LD a3, -5 * SIZE(X)
+
+ ADD s0, t0, $f3
+ fmov $f3, s0
+ LD b2, 2 * SIZE(Y)
+ MUL a4, b4, t0
+ LD b3, 3 * SIZE(Y)
+
+ ADD s1, t1, $f3
+ fmov $f3, s1
+ LD a4, -4 * SIZE(X)
+ MUL a5, b5, t1
+ LD a5, -3 * SIZE(X)
+
+ ADD s2, t2, $f3
+ fmov $f3, s2
+ LD b4, 4 * SIZE(Y)
+ MUL a6, b6, t2
+ LD b5, 5 * SIZE(Y)
+
+ ADD s3, t3, $f3
+ fmov $f3, s3
+ LD a6, -2 * SIZE(X)
+ MUL a7, b7, t3
+ LD a7, -1 * SIZE(X)
+
+ addl Y, 8 * SIZE, Y
+ bgt I, $L12
+ nop
+ fnop
+ .align 4
+
+$L13:
+ ADD s0, t0, $f3
+ fmov $f3, s0
+ LD b6, -2 * SIZE(Y)
+ MUL a0, b0, t0
+ LD b7, -1 * SIZE(Y)
+ ADD s1, t1, $f3
+ fmov $f3, s1
+ MUL a1, b1, t1
+
+ ADD s2, t2, $f3
+ fmov $f3, s2
+ MUL a2, b2, t2
+ ADD s3, t3, $f3
+ fmov $f3, s3
+ MUL a3, b3, t3
+
+ ADD s0, t0, $f3
+ fmov $f3, s0
+ MUL a4, b4, t0
+ ADD s1, t1, $f3
+ fmov $f3, s1
+ MUL a5, b5, t1
+ ADD s2, t2, $f3
+ fmov $f3, s2
+ MUL a6, b6, t2
+ ADD s3, t3, $f3
+ fmov $f3, s3
+ MUL a7, b7, t3
+ .align 4
+
+$L15:
+ ADD s0, t0, $f3
+ fmov $f3, s0
+ and N, 7, I
+ ADD s1, t1, $f3
+ fmov $f3, s1
+ ble I, $L18
+ .align 4
+
+#endif
+
+$L16:
+ LD a0, 0 * SIZE(X)
+ addl X, SIZE, X
+ LD b0, 0 * SIZE(Y)
+ addl Y, SIZE, Y
+
+ ADD s2, t2, $f3
+ fmov $f3, s2
+ MUL a0, b0, t2
+ subl I, 1, I
+ bgt I, $L16
+ .align 4
+
+$L18:
+ ADD s2, t2, $f3
+ fmov $f3, s2
+ ADD s3, t3, $f3
+ fmov $f3, s3
+ br $L999
+ .align 4
+
+$L20:
+ srl N, 2, I
+ ble I, $L25
+
+ LD a0, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD b0, 0 * SIZE(Y)
+ SXADDQ INCY, Y, Y
+ LD a1, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD b1, 0 * SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ LD a2, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD b2, 0 * SIZE(Y)
+ SXADDQ INCY, Y, Y
+ LD a3, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD b3, 0 * SIZE(Y)
+ subl I, 1, I
+
+ SXADDQ INCY, Y, Y
+ ble I, $L23
+ .align 4
+
+$L22:
+ ADD s0, t0, $f3
+ fmov $f3, s0
+ MUL a0, b0, t0
+ ADD s1, t1, $f3
+ fmov $f3, s1
+ MUL a1, b1, t1
+ ADD s2, t2, $f3
+ fmov $f3, s2
+ MUL a2, b2, t2
+ ADD s3, t3, $f3
+ fmov $f3, s3
+ MUL a3, b3, t3
+
+ LD a0, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD b0, 0 * SIZE(Y)
+ SXADDQ INCY, Y, Y
+ LD a1, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD b1, 0 * SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ LD a2, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD b2, 0 * SIZE(Y)
+ SXADDQ INCY, Y, Y
+ LD a3, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD b3, 0 * SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ subl I, 1, I
+ bgt I, $L22
+ nop
+ fnop
+ .align 4
+
+$L23:
+ ADD s0, t0, $f3
+ fmov $f3, s0
+ MUL a0, b0, t0
+ ADD s1, t1, $f3
+ fmov $f3, s1
+ MUL a1, b1, t1
+ ADD s2, t2, $f3
+ fmov $f3, s2
+ MUL a2, b2, t2
+ ADD s3, t3, $f3
+ fmov $f3, s3
+ MUL a3, b3, t3
+ .align 4
+
+$L25:
+ ADD s0, t0, $f3
+ fmov $f3, s0
+ and N, 3, I
+ ADD s1, t1, $f3
+ fmov $f3, s1
+ ble I, $L28
+ .align 4
+
+$L26:
+ LD a0, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD b0, 0 * SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, $f3
+ fmov $f3, s2
+ MUL a0, b0, t2
+ subl I, 1, I
+ bgt I, $L26
+ .align 4
+
+$L28:
+ ADD s2, t2, $f3
+ fmov $f3, s2
+ ADD s3, t3, $f3
+ fmov $f3, s3
+ .align 4
+
+$L999:
+ ADD s2, s3, $f3
+ fmov $f3, s2
+ fldd $f2, 0($sp)
+ ADD s0, s1, $f3
+ fmov $f3, s0
+ ADD s0, s2, $f3
+ fmov $f3, s0
+#ifndef ZYX20220111
+	fldd	$f3, 8($sp)
+#endif
+	ldi	$sp, 16($sp)
+ ret
+
+ EPILOGUE
diff --git a/kernel/sw_64/dot.S.bak b/kernel/sw_64/dot.S.bak
new file mode 100644
index 0000000..cd96e21
--- /dev/null
+++ b/kernel/sw_64/dot.S.bak
@@ -0,0 +1,602 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define PREFETCHSIZE 88
+
+#define N $16
+#define X $17
+#define INCX $18
+#define Y $19
+#define INCY $20
+
+#define I $5
+
+#define s0 $f0
+#define s1 $f30
+#define s2 $f1
+#define s3 $f2
+
+#define a0 $f10
+#define a1 $f11
+#define a2 $f12
+#define a3 $f13
+#define a4 $f14
+#define a5 $f15
+#define a6 $f16
+#define a7 $f17
+
+#define b0 $f18
+#define b1 $f19
+#define b2 $f20
+#define b3 $f21
+#define b4 $f22
+#define b5 $f23
+#define b6 $f24
+#define b7 $f25
+
+#define t0 $f26
+#define t1 $f27
+#define t2 $f28
+#define t3 $f29
+
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, 16, $26, 0
+
+ ldi $sp, -16($sp)
+ fclr s0
+ fstd $f2, 0($sp)
+ fclr s1
+
+ fclr s2
+ nop
+ fclr s3
+ ble N, $L999
+
+ fclr t0
+ cmpeq INCX, 1, $21
+ fclr t1
+ cmpeq INCY, 1, $22
+ fclr t2
+ and $21, $22, $22
+ fclr t3
+ beq $22, $L20
+
+#ifndef DOUBLE
+ srl N, 4, I
+ ble I, $L15
+
+ LD a0, 0 * SIZE(X)
+ LD a1, 1 * SIZE(X)
+ LD b0, 0 * SIZE(Y)
+ LD b1, 1 * SIZE(Y)
+
+ LD a2, 2 * SIZE(X)
+ LD a3, 3 * SIZE(X)
+ LD b2, 2 * SIZE(Y)
+ LD b3, 3 * SIZE(Y)
+
+ LD a4, 4 * SIZE(X)
+ LD a5, 5 * SIZE(X)
+ LD b4, 4 * SIZE(Y)
+ LD b5, 5 * SIZE(Y)
+
+ LD a6, 6 * SIZE(X)
+ LD a7, 7 * SIZE(X)
+ addl X, 16 * SIZE, X
+ subl I, 1, I
+
+ addl Y, 16 * SIZE, Y
+ ble I, $L13
+ .align 4
+
+$L12:
+ fillcs PREFETCHSIZE * 2 * SIZE(X)
+ subl I, 1, I
+ fillcs PREFETCHSIZE * 2 * SIZE(Y)
+ addl X, 16 * SIZE, X
+
+ ADD s0, t0, $f3
+ fmov $f3, s0
+ LD b6, -10 * SIZE(Y)
+ MUL a0, b0, t0
+ LD b7, -9 * SIZE(Y)
+
+ ADD s1, t1, $f3
+ fmov $f3, s1
+ LD a0, -24 * SIZE(X)
+ MUL a1, b1, t1
+ LD a1, -23 * SIZE(X)
+
+ ADD s2, t2, $f3
+ fmov $f3, s2
+ LD b0, -8 * SIZE(Y)
+ MUL a2, b2, $f3
+ fmov $f3, t2
+ LD b1, -7 * SIZE(Y)
+
+ ADD s3, t3, $f3
+ fmov $f3, s3
+ LD a2, -22 * SIZE(X)
+ MUL a3, b3, t3
+ LD a3, -21 * SIZE(X)
+
+ ADD s0, t0, $f3
+ fmov $f3, s0
+ LD b2, -6 * SIZE(Y)
+ MUL a4, b4, t0
+ LD b3, -5 * SIZE(Y)
+
+ ADD s1, t1, $f3
+ fmov $f3, s1
+ LD a4, -20 * SIZE(X)
+ MUL a5, b5, t1
+ LD a5, -19 * SIZE(X)
+
+ ADD s2, t2, $f3
+ fmov $f3, s2
+ LD b4, -4 * SIZE(Y)
+ MUL a6, b6, t2
+ LD b5, -3 * SIZE(Y)
+
+ ADD s3, t3, $f3
+ fmov $f3, s3
+ LD a6, -18 * SIZE(X)
+ MUL a7, b7, t3
+ LD a7, -17 * SIZE(X)
+
+ ADD s0, t0, $f3
+ fmov $f3, s0
+ LD b6, -2 * SIZE(Y)
+ MUL a0, b0, t0
+ LD b7, -1 * SIZE(Y)
+
+ ADD s1, t1, $f3
+ fmov $f3, s1
+ LD a0, -16 * SIZE(X)
+ MUL a1, b1, t1
+ LD a1, -15 * SIZE(X)
+
+ ADD s2, t2, $f3
+ fmov $f3, s2
+ LD b0, 0 * SIZE(Y)
+ MUL a2, b2, t2
+ LD b1, 1 * SIZE(Y)
+
+ ADD s3, t3, $f3
+ fmov $f3, s3
+ LD a2, -14 * SIZE(X)
+ MUL a3, b3, t3
+ LD a3, -13 * SIZE(X)
+
+ ADD s0, t0, $f3
+ fmov $f3, s0
+ LD b2, 2 * SIZE(Y)
+ MUL a4, b4, t0
+ LD b3, 3 * SIZE(Y)
+
+ ADD s1, t1, $f3
+ fmov $f3, s1
+ LD a4, -12 * SIZE(X)
+ MUL a5, b5, t1
+ LD a5, -11 * SIZE(X)
+
+ ADD s2, t2, $f3
+ fmov $f3, s2
+ LD b4, 4 * SIZE(Y)
+ MUL a6, b6, t2
+ LD b5, 5 * SIZE(Y)
+
+ ADD s3, t3, $f3
+ fmov $f3, s3
+ LD a6, -10 * SIZE(X)
+ MUL a7, b7, t3
+ LD a7, -9 * SIZE(X)
+
+ addl Y, 16 * SIZE, Y
+ bgt I, $L12
+ nop
+ fnop
+ .align 4
+
+$L13:
+ ADD s0, t0, $f3
+ fmov $f3, s0
+	LD b6, -10 * SIZE(Y)
+ MUL a0, b0, t0
+ LD b7, -9 * SIZE(Y)
+
+ ADD s1, t1, $f3
+ fmov $f3, s1
+ LD a0, -8 * SIZE(X)
+ MUL a1, b1, t1
+ LD a1, -7 * SIZE(X)
+
+ ADD s2, t2, $f3
+ fmov $f3, s2
+ LD b0, -8 * SIZE(Y)
+ MUL a2, b2, t2
+ LD b1, -7 * SIZE(Y)
+
+ ADD s3, t3, $f3
+ fmov $f3, s3
+ LD a2, -6 * SIZE(X)
+ MUL a3, b3, t3
+ LD a3, -5 * SIZE(X)
+
+ ADD s0, t0, $f3
+ fmov $f3, s0
+ LD b2, -6 * SIZE(Y)
+ MUL a4, b4, t0
+ LD b3, -5 * SIZE(Y)
+
+ ADD s1, t1, $f3
+ fmov $f3, s1
+ LD a4, -4 * SIZE(X)
+ MUL a5, b5, t1
+ LD a5, -3 * SIZE(X)
+
+ ADD s2, t2, $f3
+ fmov $f3, s2
+ LD b4, -4 * SIZE(Y)
+ MUL a6, b6, t2
+ LD b5, -3 * SIZE(Y)
+
+ ADD s3, t3, $f3
+ fmov $f3, s3
+ LD a6, -2 * SIZE(X)
+ MUL a7, b7, t3
+ LD a7, -1 * SIZE(X)
+
+ ADD s0, t0, $f3
+ fmov $f3, s0
+ LD b6, -2 * SIZE(Y)
+ MUL a0, b0, t0
+ LD b7, -1 * SIZE(Y)
+ ADD s1, t1, $f3
+ fmov $f3, s1
+ MUL a1, b1, t1
+
+ ADD s2, t2, $f3
+ fmov $f3, s2
+ MUL a2, b2, t2
+ ADD s3, t3, $f3
+ fmov $f3, s3
+ MUL a3, b3, t3
+
+ ADD s0, t0, $f3
+ fmov $f3, s0
+ MUL a4, b4, t0
+ ADD s1, t1, $f3
+ fmov $f3, s1
+ MUL a5, b5, t1
+ ADD s2, t2, $f3
+ fmov $f3, s2
+ MUL a6, b6, t2
+ ADD s3, t3, $f3
+ fmov $f3, s3
+ MUL a7, b7, t3
+ .align 4
+
+$L15:
+ ADD s0, t0, $f3
+ fmov $f3, s0
+ and N, 15, I
+ ADD s1, t1, $f3
+ fmov $f3, s1
+ ble I, $L18
+ .align 4
+
+#else
+
+ srl N, 3, I
+ ble I, $L15
+
+ LD a0, 0 * SIZE(X)
+ LD a1, 1 * SIZE(X)
+ LD b0, 0 * SIZE(Y)
+ LD b1, 1 * SIZE(Y)
+
+ LD a2, 2 * SIZE(X)
+ LD a3, 3 * SIZE(X)
+ LD b2, 2 * SIZE(Y)
+ LD b3, 3 * SIZE(Y)
+
+ LD a4, 4 * SIZE(X)
+ LD a5, 5 * SIZE(X)
+ LD b4, 4 * SIZE(Y)
+ LD b5, 5 * SIZE(Y)
+
+ LD a6, 6 * SIZE(X)
+ LD a7, 7 * SIZE(X)
+ addl X, 8 * SIZE, X
+ subl I, 1, I
+
+ addl Y, 8 * SIZE, Y
+ ble I, $L13
+ .align 4
+
+$L12:
+ fillcs PREFETCHSIZE * SIZE(X)
+ subl I, 1, I
+ fillcs PREFETCHSIZE * SIZE(Y)
+ addl X, 8 * SIZE, X
+
+ ADD s0, t0, $f3
+ fmov $f3, s0
+ LD b6, -2 * SIZE(Y)
+ MUL a0, b0, t0
+ LD b7, -1 * SIZE(Y)
+
+ ADD s1, t1, $f3
+ fmov $f3, s1
+ LD a0, -8 * SIZE(X)
+ MUL a1, b1, t1
+ LD a1, -7 * SIZE(X)
+
+ ADD s2, t2, $f3
+ fmov $f3, s2
+ LD b0, 0 * SIZE(Y)
+ MUL a2, b2, t2
+ LD b1, 1 * SIZE(Y)
+
+ ADD s3, t3, $f3
+ fmov $f3, s3
+ LD a2, -6 * SIZE(X)
+ MUL a3, b3, t3
+ LD a3, -5 * SIZE(X)
+
+ ADD s0, t0, $f3
+ fmov $f3, s0
+ LD b2, 2 * SIZE(Y)
+ MUL a4, b4, t0
+ LD b3, 3 * SIZE(Y)
+
+ ADD s1, t1, $f3
+ fmov $f3, s1
+ LD a4, -4 * SIZE(X)
+ MUL a5, b5, t1
+ LD a5, -3 * SIZE(X)
+
+ ADD s2, t2, $f3
+ fmov $f3, s2
+ LD b4, 4 * SIZE(Y)
+ MUL a6, b6, t2
+ LD b5, 5 * SIZE(Y)
+
+ ADD s3, t3, $f3
+ fmov $f3, s3
+ LD a6, -2 * SIZE(X)
+ MUL a7, b7, t3
+ LD a7, -1 * SIZE(X)
+
+ addl Y, 8 * SIZE, Y
+ bgt I, $L12
+ nop
+ fnop
+ .align 4
+
+$L13:
+ ADD s0, t0, $f3
+ fmov $f3, s0
+ LD b6, -2 * SIZE(Y)
+ MUL a0, b0, t0
+ LD b7, -1 * SIZE(Y)
+ ADD s1, t1, $f3
+ fmov $f3, s1
+ MUL a1, b1, t1
+
+ ADD s2, t2, $f3
+ fmov $f3, s2
+ MUL a2, b2, t2
+ ADD s3, t3, $f3
+ fmov $f3, s3
+ MUL a3, b3, t3
+
+ ADD s0, t0, $f3
+ fmov $f3, s0
+ MUL a4, b4, t0
+ ADD s1, t1, $f3
+ fmov $f3, s1
+ MUL a5, b5, t1
+ ADD s2, t2, $f3
+ fmov $f3, s2
+ MUL a6, b6, t2
+ ADD s3, t3, $f3
+ fmov $f3, s3
+ MUL a7, b7, t3
+ .align 4
+
+$L15:
+ ADD s0, t0, $f3
+ fmov $f3, s0
+ and N, 7, I
+ ADD s1, t1, $f3
+ fmov $f3, s1
+ ble I, $L18
+ .align 4
+
+#endif
+
+$L16:
+ LD a0, 0 * SIZE(X)
+ addl X, SIZE, X
+ LD b0, 0 * SIZE(Y)
+ addl Y, SIZE, Y
+
+ ADD s2, t2, $f3
+ fmov $f3, s2
+ MUL a0, b0, t2
+ subl I, 1, I
+ bgt I, $L16
+ .align 4
+
+$L18:
+ ADD s2, t2, $f3
+ fmov $f3, s2
+ ADD s3, t3, $f3
+ fmov $f3, s3
+ br $L999
+ .align 4
+
+$L20:
+ srl N, 2, I
+ ble I, $L25
+
+ LD a0, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD b0, 0 * SIZE(Y)
+ SXADDQ INCY, Y, Y
+ LD a1, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD b1, 0 * SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ LD a2, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD b2, 0 * SIZE(Y)
+ SXADDQ INCY, Y, Y
+ LD a3, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD b3, 0 * SIZE(Y)
+ subl I, 1, I
+
+ SXADDQ INCY, Y, Y
+ ble I, $L23
+ .align 4
+
+$L22:
+ ADD s0, t0, $f3
+ fmov $f3, s0
+ MUL a0, b0, t0
+ ADD s1, t1, $f3
+ fmov $f3, s1
+ MUL a1, b1, t1
+ ADD s2, t2, $f3
+ fmov $f3, s2
+ MUL a2, b2, t2
+ ADD s3, t3, $f3
+ fmov $f3, s3
+ MUL a3, b3, t3
+
+ LD a0, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD b0, 0 * SIZE(Y)
+ SXADDQ INCY, Y, Y
+ LD a1, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD b1, 0 * SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ LD a2, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD b2, 0 * SIZE(Y)
+ SXADDQ INCY, Y, Y
+ LD a3, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD b3, 0 * SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ subl I, 1, I
+ bgt I, $L22
+ nop
+ fnop
+ .align 4
+
+$L23:
+ ADD s0, t0, $f3
+ fmov $f3, s0
+ MUL a0, b0, t0
+ ADD s1, t1, $f3
+ fmov $f3, s1
+ MUL a1, b1, t1
+ ADD s2, t2, $f3
+ fmov $f3, s2
+ MUL a2, b2, t2
+ ADD s3, t3, $f3
+ fmov $f3, s3
+ MUL a3, b3, t3
+ .align 4
+
+$L25:
+ ADD s0, t0, $f3
+ fmov $f3, s0
+ and N, 3, I
+ ADD s1, t1, $f3
+ fmov $f3, s1
+ ble I, $L28
+ .align 4
+
+$L26:
+ LD a0, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD b0, 0 * SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, $f3
+ fmov $f3, s2
+ MUL a0, b0, t2
+ subl I, 1, I
+ bgt I, $L26
+ .align 4
+
+$L28:
+ ADD s2, t2, $f3
+ fmov $f3, s2
+ ADD s3, t3, $f3
+ fmov $f3, s3
+ .align 4
+
+$L999:
+ ADD s2, s3, $f3
+ fmov $f3, s2
+ fldd $f2, 0($sp)
+ ADD s0, s1, $f3
+ fmov $f3, s0
+ ldi $sp, 16($sp)
+
+ ADD s0, s2, $f3
+ fmov $f3, s0
+ ret
+
+ EPILOGUE
diff --git a/kernel/sw_64/dot_simd.S b/kernel/sw_64/dot_simd.S
new file mode 100644
index 0000000..3e2288d
--- /dev/null
+++ b/kernel/sw_64/dot_simd.S
@@ -0,0 +1,634 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define PREFETCHSIZE 80
+
+#define N $16
+#define X $17
+#define INCX $18
+#define Y $19
+#define INCY $20
+
+#define I $5
+
+#define s0 $f0
+#define s1 $f30
+#define s2 $f1
+#define s3 $f2
+
+#define a0 $f10
+#define a1 $f11
+#define a2 $f12
+#define a3 $f13
+#define a4 $f14
+#define a5 $f15
+#define a6 $f16
+#define a7 $f17
+
+#define b0 $f18
+#define b1 $f19
+#define b2 $f20
+#define b3 $f21
+#define b4 $f22
+#define b5 $f23
+#define b6 $f24
+#define b7 $f25
+
+#define t0 $f26
+#define t1 $f27
+#define t2 $f28
+#define t3 $f29
+
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, 16, $26, 0
+
+ ldi $sp, -16($sp)
+ fclr s0
+ fstd $f2, 0($sp)
+ fclr s1
+
+ fclr s2
+ nop
+ fclr s3
+ ble N, $L999
+
+ fclr t0
+ cmpeq INCX, 1, $21
+ fclr t1
+ cmpeq INCY, 1, $22
+ fclr t2
+ and $21, $22, $22
+ fclr t3
+ beq $22, $L20
+
+
+/*
+ check whether the addresses of X and Y are aligned to the vector width (VEC_LEN*SIZE)
+*/
+ and Y, (VEC_LEN*SIZE-1), $4
+ and X, (VEC_LEN*SIZE-1), $3
+ or $3, $4, $4
+ bne $4, $UnAlign_ACCESS
+
+/* Aligned access path */
+ sra N, 4, I
+ ble I, $Remain
+
+ VLD a0, 0*VEC_LEN*SIZE(X)
+ vcpys $f31, $f31, s0 #clear s0 vector
+ VLD a1, 1*VEC_LEN*SIZE(X)
+ vcpys $f31, $f31, s1
+
+ VLD a2, 2*VEC_LEN*SIZE(X)
+ vcpys $f31, $f31, s2
+ VLD a3, 3*VEC_LEN*SIZE(X)
+ vcpys $f31, $f31, s3
+
+ VLD b0, 0*VEC_LEN*SIZE(Y)
+ VLD b1, 1*VEC_LEN*SIZE(Y)
+ VLD b2, 2*VEC_LEN*SIZE(Y)
+ VLD b3, 3*VEC_LEN*SIZE(Y)
+
+ addl X, 16 * SIZE, X
+ addl Y, 16 * SIZE, Y
+ subl I, 1, I
+ ble I, $MainLoopEnd
+$MainLoop:
+ VMAD a0, b0, s0, s0
+ fillcs PREFETCHSIZE * SIZE(X)
+ VMAD a1, b1, s1, s1
+ fillcs PREFETCHSIZE * SIZE(Y)
+
+ subl I, 1, I
+ VMAD a2, b2, s2, s2
+ addl X, 16 * SIZE, X
+ VMAD a3, b3, s3, s3
+
+ VLD a0, -4*VEC_LEN*SIZE(X)
+ VLD a1, -3*VEC_LEN*SIZE(X)
+ VLD a2, -2*VEC_LEN*SIZE(X)
+ VLD a3, -1*VEC_LEN*SIZE(X)
+
+ VLD b0, 0*VEC_LEN*SIZE(Y)
+ VLD b1, 1*VEC_LEN*SIZE(Y)
+ VLD b2, 2*VEC_LEN*SIZE(Y)
+ VLD b3, 3*VEC_LEN*SIZE(Y)
+
+
+ addl Y, 16 * SIZE, Y
+ bgt I, $MainLoop
+ .align 4
+
+$MainLoopEnd:
+ VMAD a0, b0, s0, s0
+ VMAD a1, b1, s1, s1
+ VMAD a2, b2, s2, s2
+ VMAD a3, b3, s3, s3
+
+ VADD s0, s1, t0
+ VADD s2, s3, t1
+ nop
+ VADD t0, t1, s0
+
+ vextf s0, 1, s1
+ vextf s0, 2, s2
+ vextf s0, 3, s3
+ nop
+
+ ADD s0, s1, t2
+ ADD s2, s3, t3
+ nop
+ ADD t2, t3, s0
+
+ .align 4
+$Remain:
+ and N, 15, I
+ ble I, $End
+ .align 4
+$Remain_Loop:
+ LD a0, 0 * SIZE(X)
+ addl X, SIZE, X
+ LD b0, 0 * SIZE(Y)
+ addl Y, SIZE, Y
+
+ MAD a0, b0, s0, s0
+ subl I, 1, I
+ bgt I, $Remain_Loop
+ .align 4
+$End:
+
+ fldd $f2, 0($sp)
+ ldi $sp, 16($sp)
+ ret
+ .align 4
+
+/* Unaligned access path */
+$UnAlign_ACCESS:
+
+#ifndef DOUBLE
+ srl N, 4, I
+ ble I, $L15
+
+ LD a0, 0 * SIZE(X)
+ LD a1, 1 * SIZE(X)
+ LD b0, 0 * SIZE(Y)
+ LD b1, 1 * SIZE(Y)
+
+ LD a2, 2 * SIZE(X)
+ LD a3, 3 * SIZE(X)
+ LD b2, 2 * SIZE(Y)
+ LD b3, 3 * SIZE(Y)
+
+ LD a4, 4 * SIZE(X)
+ LD a5, 5 * SIZE(X)
+ LD b4, 4 * SIZE(Y)
+ LD b5, 5 * SIZE(Y)
+
+ LD a6, 6 * SIZE(X)
+ LD a7, 7 * SIZE(X)
+ addl X, 16 * SIZE, X
+ subl I, 1, I
+
+ addl Y, 16 * SIZE, Y
+ ble I, $L13
+ .align 4
+
+$L12:
+ fillcs PREFETCHSIZE * 2 * SIZE(X)
+ subl I, 1, I
+ fillcs PREFETCHSIZE * 2 * SIZE(Y)
+ addl X, 16 * SIZE, X
+
+ ADD s0, t0, s0
+ LD b6, -10 * SIZE(Y)
+ MUL a0, b0, t0
+ LD b7, -9 * SIZE(Y)
+
+ ADD s1, t1, s1
+ LD a0, -24 * SIZE(X)
+ MUL a1, b1, t1
+ LD a1, -23 * SIZE(X)
+
+ ADD s2, t2, s2
+ LD b0, -8 * SIZE(Y)
+ MUL a2, b2, t2
+ LD b1, -7 * SIZE(Y)
+
+ ADD s3, t3, s3
+ LD a2, -22 * SIZE(X)
+ MUL a3, b3, t3
+ LD a3, -21 * SIZE(X)
+
+ ADD s0, t0, s0
+ LD b2, -6 * SIZE(Y)
+ MUL a4, b4, t0
+ LD b3, -5 * SIZE(Y)
+
+ ADD s1, t1, s1
+ LD a4, -20 * SIZE(X)
+ MUL a5, b5, t1
+ LD a5, -19 * SIZE(X)
+
+ ADD s2, t2, s2
+ LD b4, -4 * SIZE(Y)
+ MUL a6, b6, t2
+ LD b5, -3 * SIZE(Y)
+
+ ADD s3, t3, s3
+ LD a6, -18 * SIZE(X)
+ MUL a7, b7, t3
+ LD a7, -17 * SIZE(X)
+
+ ADD s0, t0, s0
+ LD b6, -2 * SIZE(Y)
+ MUL a0, b0, t0
+ LD b7, -1 * SIZE(Y)
+
+ ADD s1, t1, s1
+ LD a0, -16 * SIZE(X)
+ MUL a1, b1, t1
+ LD a1, -15 * SIZE(X)
+
+ ADD s2, t2, s2
+ LD b0, 0 * SIZE(Y)
+ MUL a2, b2, t2
+ LD b1, 1 * SIZE(Y)
+
+ ADD s3, t3, s3
+ LD a2, -14 * SIZE(X)
+ MUL a3, b3, t3
+ LD a3, -13 * SIZE(X)
+
+ ADD s0, t0, s0
+ LD b2, 2 * SIZE(Y)
+ MUL a4, b4, t0
+ LD b3, 3 * SIZE(Y)
+
+ ADD s1, t1, s1
+ LD a4, -12 * SIZE(X)
+ MUL a5, b5, t1
+ LD a5, -11 * SIZE(X)
+
+ ADD s2, t2, s2
+ LD b4, 4 * SIZE(Y)
+ MUL a6, b6, t2
+ LD b5, 5 * SIZE(Y)
+
+ ADD s3, t3, s3
+ LD a6, -10 * SIZE(X)
+ MUL a7, b7, t3
+ LD a7, -9 * SIZE(X)
+
+ addl Y, 16 * SIZE, Y
+ bgt I, $L12
+ nop
+ fnop
+ .align 4
+
+$L13:
+ ADD s0, t0, s0
+	LD b6, -10 * SIZE(Y)
+ MUL a0, b0, t0
+ LD b7, -9 * SIZE(Y)
+
+ ADD s1, t1, s1
+ LD a0, -8 * SIZE(X)
+ MUL a1, b1, t1
+ LD a1, -7 * SIZE(X)
+
+ ADD s2, t2, s2
+ LD b0, -8 * SIZE(Y)
+ MUL a2, b2, t2
+ LD b1, -7 * SIZE(Y)
+
+ ADD s3, t3, s3
+ LD a2, -6 * SIZE(X)
+ MUL a3, b3, t3
+ LD a3, -5 * SIZE(X)
+
+ ADD s0, t0, s0
+ LD b2, -6 * SIZE(Y)
+ MUL a4, b4, t0
+ LD b3, -5 * SIZE(Y)
+
+ ADD s1, t1, s1
+ LD a4, -4 * SIZE(X)
+ MUL a5, b5, t1
+ LD a5, -3 * SIZE(X)
+
+ ADD s2, t2, s2
+ LD b4, -4 * SIZE(Y)
+ MUL a6, b6, t2
+ LD b5, -3 * SIZE(Y)
+
+ ADD s3, t3, s3
+ LD a6, -2 * SIZE(X)
+ MUL a7, b7, t3
+ LD a7, -1 * SIZE(X)
+
+ ADD s0, t0, s0
+ LD b6, -2 * SIZE(Y)
+ MUL a0, b0, t0
+ LD b7, -1 * SIZE(Y)
+ ADD s1, t1, s1
+ MUL a1, b1, t1
+
+ ADD s2, t2, s2
+ MUL a2, b2, t2
+ ADD s3, t3, s3
+ MUL a3, b3, t3
+
+ ADD s0, t0, s0
+ MUL a4, b4, t0
+ ADD s1, t1, s1
+ MUL a5, b5, t1
+ ADD s2, t2, s2
+ MUL a6, b6, t2
+ ADD s3, t3, s3
+ MUL a7, b7, t3
+ .align 4
+
+$L15:
+ ADD s0, t0, s0
+ and N, 15, I
+ ADD s1, t1, s1
+ ble I, $L18
+ .align 4
+
+#else
+
+ srl N, 3, I
+ ble I, $L15
+
+ LD a0, 0 * SIZE(X)
+ LD a1, 1 * SIZE(X)
+ LD b0, 0 * SIZE(Y)
+ LD b1, 1 * SIZE(Y)
+
+ LD a2, 2 * SIZE(X)
+ LD a3, 3 * SIZE(X)
+ LD b2, 2 * SIZE(Y)
+ LD b3, 3 * SIZE(Y)
+
+ LD a4, 4 * SIZE(X)
+ LD a5, 5 * SIZE(X)
+ LD b4, 4 * SIZE(Y)
+ LD b5, 5 * SIZE(Y)
+
+ LD a6, 6 * SIZE(X)
+ LD a7, 7 * SIZE(X)
+ addl X, 8 * SIZE, X
+ subl I, 1, I
+
+ addl Y, 8 * SIZE, Y
+ ble I, $L13
+ .align 4
+
+$L12:
+ fillcs PREFETCHSIZE * SIZE(X)
+ subl I, 1, I
+ fillcs PREFETCHSIZE * SIZE(Y)
+ addl X, 8 * SIZE, X
+
+ ADD s0, t0, s0
+ LD b6, -2 * SIZE(Y)
+ MUL a0, b0, t0
+ LD b7, -1 * SIZE(Y)
+
+ ADD s1, t1, s1
+ LD a0, -8 * SIZE(X)
+ MUL a1, b1, t1
+ LD a1, -7 * SIZE(X)
+
+ ADD s2, t2, s2
+ LD b0, 0 * SIZE(Y)
+ MUL a2, b2, t2
+ LD b1, 1 * SIZE(Y)
+
+ ADD s3, t3, s3
+ LD a2, -6 * SIZE(X)
+ MUL a3, b3, t3
+ LD a3, -5 * SIZE(X)
+
+ ADD s0, t0, s0
+ LD b2, 2 * SIZE(Y)
+ MUL a4, b4, t0
+ LD b3, 3 * SIZE(Y)
+
+ ADD s1, t1, s1
+ LD a4, -4 * SIZE(X)
+ MUL a5, b5, t1
+ LD a5, -3 * SIZE(X)
+
+ ADD s2, t2, s2
+ LD b4, 4 * SIZE(Y)
+ MUL a6, b6, t2
+ LD b5, 5 * SIZE(Y)
+
+ ADD s3, t3, s3
+ LD a6, -2 * SIZE(X)
+ MUL a7, b7, t3
+ LD a7, -1 * SIZE(X)
+
+ addl Y, 8 * SIZE, Y
+ bgt I, $L12
+ nop
+ fnop
+ .align 4
+
+$L13:
+ ADD s0, t0, s0
+ LD b6, -2 * SIZE(Y)
+ MUL a0, b0, t0
+ LD b7, -1 * SIZE(Y)
+ ADD s1, t1, s1
+ MUL a1, b1, t1
+
+ ADD s2, t2, s2
+ MUL a2, b2, t2
+ ADD s3, t3, s3
+ MUL a3, b3, t3
+
+ ADD s0, t0, s0
+ MUL a4, b4, t0
+ ADD s1, t1, s1
+ MUL a5, b5, t1
+ ADD s2, t2, s2
+ MUL a6, b6, t2
+ ADD s3, t3, s3
+ MUL a7, b7, t3
+ .align 4
+
+$L15:
+ ADD s0, t0, s0
+ and N, 7, I
+ ADD s1, t1, s1
+ ble I, $L18
+ .align 4
+
+#endif
+
+$L16:
+ LD a0, 0 * SIZE(X)
+ addl X, SIZE, X
+ LD b0, 0 * SIZE(Y)
+ addl Y, SIZE, Y
+
+ ADD s2, t2, s2
+ MUL a0, b0, t2
+ subl I, 1, I
+ bgt I, $L16
+ .align 4
+
+$L18:
+ ADD s2, t2, s2
+ ADD s3, t3, s3
+ br $L999
+ .align 4
+
+$L20:
+ srl N, 2, I
+ ble I, $L25
+
+ LD a0, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD b0, 0 * SIZE(Y)
+ SXADDQ INCY, Y, Y
+ LD a1, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD b1, 0 * SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ LD a2, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD b2, 0 * SIZE(Y)
+ SXADDQ INCY, Y, Y
+ LD a3, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD b3, 0 * SIZE(Y)
+ subl I, 1, I
+
+ SXADDQ INCY, Y, Y
+ ble I, $L23
+ .align 4
+
+$L22:
+ ADD s0, t0, s0
+ MUL a0, b0, t0
+ ADD s1, t1, s1
+ MUL a1, b1, t1
+ ADD s2, t2, s2
+ MUL a2, b2, t2
+ ADD s3, t3, s3
+ MUL a3, b3, t3
+
+ LD a0, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD b0, 0 * SIZE(Y)
+ SXADDQ INCY, Y, Y
+ LD a1, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD b1, 0 * SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ LD a2, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD b2, 0 * SIZE(Y)
+ SXADDQ INCY, Y, Y
+ LD a3, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD b3, 0 * SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ subl I, 1, I
+ bgt I, $L22
+ nop
+ fnop
+ .align 4
+
+$L23:
+ ADD s0, t0, s0
+ MUL a0, b0, t0
+ ADD s1, t1, s1
+ MUL a1, b1, t1
+ ADD s2, t2, s2
+ MUL a2, b2, t2
+ ADD s3, t3, s3
+ MUL a3, b3, t3
+ .align 4
+
+$L25:
+ ADD s0, t0, s0
+ and N, 3, I
+ ADD s1, t1, s1
+ ble I, $L28
+ .align 4
+
+$L26:
+ LD a0, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD b0, 0 * SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s2
+ MUL a0, b0, t2
+ subl I, 1, I
+ bgt I, $L26
+ .align 4
+
+$L28:
+ ADD s2, t2, s2
+ ADD s3, t3, s3
+ .align 4
+
+$L999:
+ ADD s2, s3, s2
+ fldd $f2, 0($sp)
+ ADD s0, s1, s0
+ ldi $sp, 16($sp)
+
+ ADD s0, s2, s0
+ ret
+
+ EPILOGUE
diff --git a/kernel/sw_64/gemm_beta.S b/kernel/sw_64/gemm_beta.S
new file mode 100644
index 0000000..d9ea890
--- /dev/null
+++ b/kernel/sw_64/gemm_beta.S
@@ -0,0 +1,179 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+ .set noat
+ .set noreorder
+.text
+ .align 5
+ .globl CNAME
+ .ent CNAME
+CNAME:
+ .frame $sp, 0, $26, 0
+
+#ifdef PROFILE
+ ldgp $gp, 0($27)
+ ldi $28, _mcount
+ jsr $28, ($28), _mcount
+#endif
+
+ ldl $18, 16($sp)
+ ble $16, $End
+ ldl $19, 24($sp)
+ ble $17, $End
+#ifndef PROFILE
+ .prologue 0
+#else
+ .prologue 1
+#endif
+
+ fbeq $f19, $BETA_EQ_ZERO # if (beta == ZERO)
+ .align 4
+
+$BETA_NE_ZERO:
+ sra $16, 3, $2 # i = (m >> 3)
+ mov $18, $1 # c_offset = c
+ ldi $17, -1($17) # j --
+ ble $2,$L52
+ .align 4
+
+$L51:
+ fillcs 64($1)
+ ldi $2, -1($2)
+
+ LD $f14, 0*SIZE($1)
+ LD $f15, 1*SIZE($1)
+ LD $f16, 2*SIZE($1)
+ LD $f17, 3*SIZE($1)
+ LD $f18, 4*SIZE($1)
+ LD $f11, 5*SIZE($1)
+ LD $f21, 6*SIZE($1)
+ LD $f22, 7*SIZE($1)
+
+ MUL $f19, $f14, $f23
+ MUL $f19, $f15, $f24
+ MUL $f19, $f16, $f25
+ MUL $f19, $f17, $f26
+ MUL $f19, $f18, $f27
+ MUL $f19, $f11, $f28
+ MUL $f19, $f21, $f29
+ MUL $f19, $f22, $f30
+
+ ST $f23, 0*SIZE($1)
+ ST $f24, 1*SIZE($1)
+ ST $f25, 2*SIZE($1)
+ ST $f26, 3*SIZE($1)
+ ST $f27, 4*SIZE($1)
+ ST $f28, 5*SIZE($1)
+ ST $f29, 6*SIZE($1)
+ ST $f30, 7*SIZE($1)
+
+ ldi $1,8*SIZE($1)
+ bgt $2,$L51
+ .align 4
+
+$L52:
+ and $16, 7, $2
+ ble $2,$L54
+ .align 4
+
+$L53:
+ LD $f12, 0($1)
+ ldi $2, -1($2)
+ MUL $f19, $f12, $f23
+ ST $f23, 0($1)
+ ldi $1, SIZE($1)
+ bgt $2,$L53
+ .align 4
+
+$L54:
+ SXADDQ $19, $18, $18 # c += ldc
+ bgt $17,$BETA_NE_ZERO
+ clr $0
+ ret
+ .align 4
+
+$BETA_EQ_ZERO:
+ sra $16, 3, $2 # i = (m >> 3)
+ ldi $4, 8*SIZE($18)
+ mov $18, $1 # c_offset = c
+ ldi $17, -1($17) # j --
+ ble $2,$L42
+ .align 4
+
+$L41:
+ ST $f31, 0*SIZE($1)
+ ST $f31, 1*SIZE($1)
+ ST $f31, 2*SIZE($1)
+ ST $f31, 3*SIZE($1)
+ ST $f31, 4*SIZE($1)
+ ST $f31, 5*SIZE($1)
+ ST $f31, 6*SIZE($1)
+ ST $f31, 7*SIZE($1)
+ ldi $2, -1($2)
+
+ ldi $4, 8*SIZE($4)
+ ldi $1, 8*SIZE($1)
+ bgt $2,$L41
+ .align 4
+
+$L42:
+ and $16, 7, $2
+ ble $2,$L44
+ .align 4
+
+$L43:
+ ldi $2, -1($2)
+ ST $f31, 0($1)
+ ldi $1, SIZE($1)
+ bgt $2, $L43
+ .align 4
+
+$L44:
+ SXADDQ $19, $18, $18 # c += ldc
+ bgt $17,$BETA_EQ_ZERO
+ clr $0
+ .align 4
+
+$End:
+ ret
+ .ident VERSION
+ .end CNAME
diff --git a/kernel/sw_64/gemm_kernel_4x4.S b/kernel/sw_64/gemm_kernel_4x4.S
new file mode 100644
index 0000000..dd17554
--- /dev/null
+++ b/kernel/sw_64/gemm_kernel_4x4.S
@@ -0,0 +1,3244 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#if !defined(EV4) && !defined(EV5) && !defined(SW6)
+#error "Architecture is not specified."
+#endif
+
+#ifdef SW6
+#define PREFETCHSIZE 56
+#define UNOP unop
+#endif
+
+#ifdef EV5
+#define PREFETCHSIZE 56
+#define UNOP
+#endif
+
+#ifdef EV4
+#define UNOP
+#endif
+
+#define STACKSIZE 96
+
+#define M $16
+#define N $17
+#define K $18
+#define A $20
+#define B $21
+#define C $22
+#define LDC $23
+
+#define C1 $19
+#define C2 $24
+#define C3 $25
+#define C4 $27
+
+#define AO $at
+#define BO $5
+#define I $6
+#define J $7
+#define L $8
+
+#define a1 $f16
+#define a2 $f17
+#define a3 $f18
+#define a4 $f19
+
+#define b1 $f20
+#define b2 $f21
+#define b3 $f22
+#define b4 $f23
+
+#define t1 $f24
+#define t2 $f25
+#define t3 $f26
+#define t4 $f27
+
+#define a5 $f28
+#define a6 $f30
+#define b5 $f29
+
+#define alpha $f30
+
+#define c01 $f0
+#define c02 $f1
+#define c03 $f2
+#define c04 $f3
+
+#define c05 $f4
+#define c06 $f5
+#define c07 $f6
+#define c08 $f7
+
+#define c09 $f8
+#define c10 $f9
+#define c11 $f10
+#define c12 $f11
+
+
+#define c13 $f12
+#define c14 $f13
+#define c15 $f14
+#define c16 $f15
+
+#define TMP1 $0
+#define TMP2 $1
+#define KK $2
+#define BB $3
+#define OFFSET $4
+
+#define tmp $9
+
+#define ALPHA 64($sp)
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, STACKSIZE, $26, 0
+
+ ldi $sp, -STACKSIZE($sp)
+
+ ldl C, 0 + STACKSIZE($sp)
+ ldl LDC, 8 + STACKSIZE($sp)
+#ifdef TRMMKERNEL
+ ldl OFFSET, 16 + STACKSIZE($sp)
+#endif
+
+ SXADDQ LDC, 0, LDC
+
+ fstd $f2, 0($sp)
+ fstd $f3, 8($sp)
+ fstd $f4, 16($sp)
+ fstd $f5, 24($sp)
+ fstd $f6, 32($sp)
+ fstd $f7, 40($sp)
+ fstd $f8, 48($sp)
+ fstd $f9, 56($sp)
+ stl $9, 80($sp)
+ fstd $f19, ALPHA
+
+ cmple M, 0, $0
+ cmple N, 0, $1
+ cmple K, 0, $2
+
+ or $0, $1, $0
+ or $0, $2, $0
+ bne $0, $L999
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ subl $31, OFFSET, KK
+#endif
+
+ sra N, 2, J
+ ble J, $L40
+ .align 4
+
+$L01:
+ mov C, C1
+ addl C, LDC, C2
+ mov A, AO
+ s4addl K, 0, BB
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mov OFFSET, KK
+#endif
+
+ addl C2, LDC, C3
+ s4addl LDC, C, C
+
+ SXADDQ BB, B, BB
+ fclr t1
+ addl C3, LDC, C4
+ fclr t2
+
+ sra M, 2, I
+ fclr t3
+ fclr t4
+ ble I, $L20
+ .align 4
+
+$L11:
+#if defined(EV5) || defined(SW6)
+ fillcs 0 * SIZE(BB)
+ fillcs 8 * SIZE(BB)
+ unop
+ ldi BB, 16 * SIZE(BB)
+#endif
+
+#if !defined(TRMMKERNEL) || \
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+
+#ifdef TRMMKERNEL
+#ifdef LEFT
+ addl KK, 4, TMP1
+#else
+ addl KK, 4, TMP1
+#endif
+#endif
+
+ LD a1, 0 * SIZE(AO)
+ fclr c11
+ LD a2, 1 * SIZE(AO)
+ fclr c12
+
+ LD a3, 2 * SIZE(AO)
+ fclr c16
+ LD a4, 3 * SIZE(AO)
+ fclr c15
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c02
+
+ LD b3, 2 * SIZE(B)
+ fclr c06
+ LD b4, 3 * SIZE(B)
+ fclr c05
+
+ fillcs 4 * SIZE(C1)
+ fclr c03
+#ifndef TRMMKERNEL
+ ldi L, -2(K)
+#else
+ ldi L, -2(TMP1)
+#endif
+ fclr c04
+
+ fillcs 7 * SIZE(C2)
+ fclr c08
+ ldi BO, 4 * SIZE(B)
+ fclr c13
+
+ fillcs 4 * SIZE(C3)
+ fclr c09
+ ldi AO, 4 * SIZE(AO)
+ fclr c10
+
+#else
+ sll KK, BASE_SHIFT + 2, TMP1
+ addl AO, TMP1, AO
+ addl B, TMP1, BO
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c11
+ LD a2, 1 * SIZE(AO)
+ fclr c12
+
+ LD a3, 2 * SIZE(AO)
+ fclr c16
+ LD a4, 3 * SIZE(AO)
+ fclr c15
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c02
+
+ LD b3, 2 * SIZE(BO)
+ fclr c06
+ LD b4, 3 * SIZE(BO)
+ fclr c05
+
+ fillcs 4 * SIZE(C1)
+ fclr c03
+ ldi L, -2(TMP1)
+ fclr c04
+
+ fillcs 7 * SIZE(C2)
+ fclr c08
+ ldi BO, 4 * SIZE(BO)
+ fclr c13
+
+ fillcs 4 * SIZE(C3)
+ fclr c09
+ ldi AO, 4 * SIZE(AO)
+ fclr c10
+#endif
+
+ fillcs 7 * SIZE(C4)
+ fclr c14
+ fclr c07
+ ble L, $L15
+ .align 5
+
+$L12:
+/* 1 */
+ ADD c11, t1, b5
+ fmov b5, c11
+#ifndef EV4
+ fillcs PREFETCHSIZE * SIZE(AO)
+#else
+ unop
+#endif
+ MUL b1, a1, t1
+#ifndef EV4
+ fillcs PREFETCHSIZE * SIZE(BO)
+#else
+ unop
+#endif
+
+ ADD c12, t2, b5
+ fmov b5, c12
+ unop
+ MUL b1, a2, t2
+ unop
+
+	ADD c16, t3, b5
+ fmov b5, c16
+ unop
+ MUL b2, a2, t3
+ LD a5, 0 * SIZE(AO)
+
+	ADD c15, t4, b5
+ fmov b5, c15
+ unop
+ MUL b2, a1, t4
+ LD b5, 0 * SIZE(BO)
+ FIMOVD b5, tmp
+/* 2 */
+	ADD c01, t1, b5
+ fmov b5, c01
+ UNOP
+ MUL b1, a3, t1
+ UNOP
+
+	ADD c02, t2, b5
+ fmov b5, c02
+ UNOP
+ MUL b1, a4, t2
+ UNOP
+
+	ADD c06, t3, b5
+ fmov b5, c06
+ unop
+ MUL b2, a4, t3
+ unop
+
+ ADD c05, t4, b5
+ fmov b5, c05
+ unop
+ MUL b4, a1, t4
+ unop
+
+/* 3 */
+	ADD c03, t1, b5
+ fmov b5, c03
+ unop
+ MUL b3, a1, t1
+ unop
+
+	ADD c04, t2, b5
+ fmov b5, c04
+ unop
+ MUL b3, a2, t2
+ unop
+
+	ADD c08, t3, b5
+ fmov b5, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, 1 * SIZE(AO)
+
+	ADD c13, t4, b5
+ fmov b5, c13
+ unop
+ MUL b2, a3, t4
+ LD b2, 1 * SIZE(BO)
+
+/* 4 */
+	ADD c09, t1, b5
+ fmov b5, c09
+ unop
+ MUL b3, a3, t1
+ LD a6, 2 * SIZE(AO)
+
+ ADD c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD c14, t3, b5
+ fmov b5, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, 3 * SIZE(AO)
+
+ ADD c07, t4, b5
+ fmov b5, c07
+ unop
+ MUL b4, a3, t4
+ LD b4, 3 * SIZE(BO)
+
+/* 5 */
+ ADD c11, t1, b5
+ fmov b5, c11
+ unop
+ IFMOVD tmp, b5
+ MUL b5, a5, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD c12, t2, b5
+ fmov b5, c12
+ ldi L, -2(L)
+ IFMOVD tmp, b5
+ MUL b5, a2, t2
+ LD b1, 4 * SIZE(BO)
+
+ ADD c16, t3, b5
+ fmov b5, c16
+ unop
+ MUL b2, a2, t3
+ unop
+
+ ADD c15, t4, b5
+ fmov b5, c15
+ unop
+ MUL b2, a5, t4
+ unop
+
+/* 6 */
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ IFMOVD tmp, b5
+ MUL b5, a6, t1
+ unop
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ IFMOVD tmp, b5
+ MUL b5, a4, t2
+ unop
+
+ ADD c06, t3, b5
+ fmov b5, c06
+ unop
+ MUL b2, a4, t3
+ unop
+
+ ADD c05, t4, b5
+ fmov b5, c05
+ unop
+ MUL b4, a5, t4
+ unop
+
+/* 7 */
+ ADD c03, t1, b5
+ fmov b5, c03
+ ldi AO, 8 * SIZE(AO)
+ MUL b3, a5, t1
+ unop
+
+ ADD c04, t2, b5
+ fmov b5, c04
+ ldi BO, 8 * SIZE(BO)
+ MUL b3, a2, t2
+ unop
+
+ ADD c08, t3, b5
+ fmov b5, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, -3 * SIZE(AO)
+
+ ADD c13, t4, b5
+ fmov b5, c13
+ unop
+ MUL b2, a6, t4
+ LD b2, -3 * SIZE(BO)
+
+/* 8 */
+ ADD c09, t1, b5
+ fmov b5, c09
+ unop
+ MUL b3, a6, t1
+ LD a3, -2 * SIZE(AO)
+
+ ADD c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, -2 * SIZE(BO)
+
+ ADD c14, t3, b5
+ fmov b5, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, -1 * SIZE(AO)
+
+ ADD c07, t4, b5
+ fmov b5, c07
+ MUL b4, a6, t4
+ LD b4, -1 * SIZE(BO)
+ bgt L, $L12
+ .align 4
+
+$L15:
+ ADD c11, t1, b5
+ fmov b5, c11
+ fldd alpha, ALPHA
+ MUL b1, a1, t1
+#ifndef TRMMKERNEL
+ blbs K, $L18
+#else
+ blbs TMP1, $L18
+#endif
+ .align 4
+
+ ADD c12, t2, b5
+ fmov b5, c12
+ MUL b1, a2, t2
+ ADD c16, t3, b5
+ fmov b5, c16
+ MUL b2, a2, t3
+
+ ADD c15, t4, b5
+ fmov b5, c15
+ MUL b2, a1, t4
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL b1, a3, t1
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL b1, a4, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD c06, t3, b5
+ fmov b5, c06
+ MUL b2, a4, t3
+ ADD c05, t4, b5
+ fmov b5, c05
+ MUL b4, a1, t4
+
+ ADD c03, t1, b5
+ fmov b5, c03
+ unop
+ MUL b3, a1, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD c04, t2, b5
+ fmov b5, c04
+ unop
+ MUL b3, a2, t2
+ unop
+
+ ADD c08, t3, b5
+ fmov b5, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, 1 * SIZE(AO)
+
+ ADD c13, t4, b5
+ fmov b5, c13
+ unop
+ MUL b2, a3, t4
+ LD b2, 1 * SIZE(BO)
+
+ ADD c09, t1, b5
+ fmov b5, c09
+ unop
+ MUL b3, a3, t1
+ ldi AO, 4 * SIZE(AO)
+
+ ADD c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD c14, t3, b5
+ fmov b5, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, -1 * SIZE(AO)
+
+ ADD c07, t4, b5
+ fmov b5, c07
+ unop
+ MUL b4, a3, t4
+ LD a3, -2 * SIZE(AO)
+
+ ADD c11, t1, b5
+ fmov b5, c11
+ LD b4, 3 * SIZE(BO)
+ MUL b1, a1, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L18:
+ ADD c12, t2, b5
+ fmov b5, c12
+ unop
+ MUL b1, a2, t2
+#ifndef TRMMKERNEL
+ LD a5, 0 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD c16, t3, b5
+ fmov b5, c16
+ unop
+ MUL b2, a2, t3
+ unop
+
+ ADD c15, t4, b5
+ fmov b5, c15
+ unop
+ MUL b2, a1, t4
+#ifndef TRMMKERNEL
+ LD b5, 1 * SIZE(C1)
+ FIMOVD b5, tmp
+#else
+ unop
+#endif
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL b1, a3, t1
+ unop
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL b1, a4, t2
+#ifndef TRMMKERNEL
+ LD b1, 0 * SIZE(C2)
+#else
+ unop
+#endif
+
+ ADD c06, t3, b5
+ fmov b5, c06
+ unop
+ MUL b2, a4, t3
+ unop
+
+ ADD c05, t4, b5
+ fmov b5, c05
+ unop
+ MUL b4, a1, t4
+ unop
+
+ ADD c03, t1, b5
+ fmov b5, c03
+ unop
+ MUL b3, a1, t1
+ unop
+
+ ADD c04, t2, b5
+ fmov b5, c04
+ unop
+ MUL b3, a2, t2
+#ifndef TRMMKERNEL
+ LD a1, 0 * SIZE(C3)
+#else
+ unop
+#endif
+
+ ADD c08, t3, b5
+ fmov b5, c08
+ unop
+ MUL b4, a2, t3
+#ifndef TRMMKERNEL
+ LD a2, 2 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD c13, t4, b5
+ fmov b5, c13
+ unop
+ MUL b2, a3, t4
+#ifndef TRMMKERNEL
+ LD b2, 3 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD c09, t1, b5
+ fmov b5, c09
+ ldi I, -1(I)
+ MUL b3, a3, t1
+ unop
+
+ ADD c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL b3, a4, t2
+#ifndef TRMMKERNEL
+ LD b3, 0 * SIZE(C4)
+#else
+ unop
+#endif
+
+ ADD c14, t3, b5
+ fmov b5, c14
+ unop
+ MUL b4, a4, t3
+#ifndef TRMMKERNEL
+ LD a4, 1 * SIZE(C2)
+#else
+ unop
+#endif
+
+ ADD c07, t4, b5
+ fmov b5, c07
+ unop
+ MUL b4, a3, t4
+#ifndef TRMMKERNEL
+ LD a3, 2 * SIZE(C2)
+#else
+ unop
+#endif
+
+ ADD c11, t1, b5
+ fmov b5, c11
+ unop
+ MUL alpha, c01, b5
+ fmov b5, c01
+#ifndef TRMMKERNEL
+ LD b4, 3 * SIZE(C2)
+#else
+ unop
+#endif
+
+ ADD c12, t2, b5
+ fmov b5, c12
+ unop
+ MUL alpha, c02, b5
+ fmov b5, c02
+#ifndef TRMMKERNEL
+ LD t1, 1 * SIZE(C3)
+#else
+ unop
+#endif
+
+ ADD c16, t3, b5
+ fmov b5, c16
+ unop
+ MUL alpha, c03, b5
+ fmov b5, c03
+#ifndef TRMMKERNEL
+ LD t2, 2 * SIZE(C3)
+#else
+ unop
+#endif
+
+ ADD c15, t4, b5
+ fmov b5, c15
+ unop
+ MUL alpha, c04, b5
+ fmov b5, c04
+#ifndef TRMMKERNEL
+ LD t3, 3 * SIZE(C3)
+#else
+ unop
+#endif
+
+ MUL alpha, c05, b5
+ fmov b5, c05
+ unop
+#ifndef TRMMKERNEL
+ ADD c01, a5, b5
+ fmov b5, c01
+ LD t4, 1 * SIZE(C4)
+#else
+ unop
+ unop
+#endif
+
+ MUL alpha, c06, b5
+ fmov b5, c06
+#ifndef TRMMKERNEL
+ unop
+ IFMOVD tmp, b5
+ fstd b1, 88($sp)
+# FIMOVD b1, tmp
+ ADD c02, b5, b1
+ fmov b1, c02
+ fldd b1, 88($sp)
+# IFMOVD tmp, b1
+ LD a5, 2 * SIZE(C4)
+#endif
+
+ MUL alpha, c07, b5
+ fmov b5, c07
+#ifndef TRMMKERNEL
+ unop
+ ADD c03, a2, b5
+ fmov b5, c03
+ LD b5, 3 * SIZE(C4)
+ FIMOVD b5, tmp
+#endif
+
+ MUL alpha, c08, b5
+ fmov b5, c08
+#ifndef TRMMKERNEL
+ unop
+ ADD c04, b2, b5
+ fmov b5, c04
+ unop
+#endif
+
+ MUL alpha, c09, b5
+ fmov b5, c09
+ ST c01, 0 * SIZE(C1)
+#ifndef TRMMKERNEL
+ ADD c05, b1, b5
+ fmov b5, c05
+ unop
+#endif
+
+ MUL alpha, c10, b5
+ fmov b5, c10
+ ST c02, 1 * SIZE(C1)
+#ifndef TRMMKERNEL
+ ADD c06, a4, b5
+ fmov b5, c06
+ unop
+#endif
+
+ MUL alpha, c11, b5
+ fmov b5, c11
+ ST c03, 2 * SIZE(C1)
+#ifndef TRMMKERNEL
+ ADD c07, a3, b5
+ fmov b5, c07
+ unop
+#endif
+
+ MUL alpha, c12, b5
+ fmov b5, c12
+ ST c04, 3 * SIZE(C1)
+#ifndef TRMMKERNEL
+ ADD c08, b4, b5
+ fmov b5, c08
+#else
+ unop
+#endif
+ ldi C1, 4 * SIZE(C1)
+
+ MUL alpha, c13, b5
+ fmov b5, c13
+ ST c05, 0 * SIZE(C2)
+#ifndef TRMMKERNEL
+ ADD c09, a1, b5
+ fmov b5, c09
+ unop
+#endif
+
+ MUL alpha, c14, b5
+ fmov b5, c14
+ ST c06, 1 * SIZE(C2)
+#ifndef TRMMKERNEL
+ ADD c10, t1, b5
+ fmov b5, c10
+ unop
+#endif
+
+ MUL alpha, c15, b5
+ fmov b5, c15
+ ST c07, 2 * SIZE(C2)
+#ifndef TRMMKERNEL
+ ADD c11, t2, b5
+ fmov b5, c11
+ unop
+#endif
+
+ MUL alpha, c16, b5
+ fmov b5, c16
+ ST c08, 3 * SIZE(C2)
+#ifndef TRMMKERNEL
+ ADD c12, t3, b5
+ fmov b5, c12
+#else
+ unop
+#endif
+ ldi C2, 4 * SIZE(C2)
+
+#ifndef TRMMKERNEL
+ ADD c13, b3, b5
+ fmov b5, c13
+#else
+ unop
+#endif
+ ST c09, 0 * SIZE(C3)
+ fclr t1
+ ldi C4, 4 * SIZE(C4)
+
+#ifndef TRMMKERNEL
+ ADD c14, t4, b5
+ fmov b5, c14
+#else
+ unop
+#endif
+ ST c10, 1 * SIZE(C3)
+ fclr t2
+ unop
+
+#ifndef TRMMKERNEL
+ ADD c15, a5, b5
+ fmov b5, c15
+#else
+ unop
+#endif
+ ST c11, 2 * SIZE(C3)
+ fclr t3
+ unop
+
+#ifndef TRMMKERNEL
+ IFMOVD tmp, b5
+# FIMOVD b1, tmp
+ fstd b1, 88($sp)
+ ADD c16, b5, b1
+ fmov b1, c16
+ fldd b1, 88($sp)
+# IFMOVD tmp, b1
+#else
+ unop
+#endif
+ ST c12, 3 * SIZE(C3)
+ fclr t4
+ ldi C3, 4 * SIZE(C3)
+
+ ST c13, -4 * SIZE(C4)
+ ST c14, -3 * SIZE(C4)
+ ST c15, -2 * SIZE(C4)
+ ST c16, -1 * SIZE(C4)
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+ subl K, KK, TMP1
+#ifdef LEFT
+ subl TMP1, 4, TMP1
+#else
+ subl TMP1, 4, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 2, TMP1
+ addl AO, TMP1, AO
+ addl BO, TMP1, BO
+#endif
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ addl KK, 4, KK
+#endif
+
+ bgt I, $L11
+ .align 4
+
+$L20:
+ and M, 2, I
+ ble I, $L30
+
+#if !defined(TRMMKERNEL) || \
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+
+#ifdef TRMMKERNEL
+#ifdef LEFT
+ addl KK, 2, TMP1
+#else
+ addl KK, 4, TMP1
+#endif
+#endif
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c10
+ LD a4, 3 * SIZE(AO)
+ fclr c14
+
+ LD b1, 0 * SIZE(B)
+#ifndef TRMMKERNEL
+ ldi L, -2(K)
+#else
+ ldi L, -2(TMP1)
+#endif
+ LD b2, 1 * SIZE(B)
+ ldi AO, 2 * SIZE(AO)
+
+ LD b3, 2 * SIZE(B)
+ fclr c01
+ LD b4, 3 * SIZE(B)
+ fclr c05
+
+ ldi BO, 4 * SIZE(B)
+ fclr c02
+ fclr c06
+ ble L, $L25
+
+#else
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl AO, TMP1, AO
+ sll KK, BASE_SHIFT + 2, TMP2
+ addl B, TMP2, BO
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c10
+ LD a4, 3 * SIZE(AO)
+ fclr c14
+
+ LD b1, 0 * SIZE(BO)
+ ldi L, -2(TMP1)
+ LD b2, 1 * SIZE(BO)
+ ldi AO, 2 * SIZE(AO)
+
+ LD b3, 2 * SIZE(BO)
+ fclr c01
+ LD b4, 3 * SIZE(BO)
+ fclr c05
+
+ ldi BO, 4 * SIZE(BO)
+ fclr c02
+ fclr c06
+ ble L, $L25
+#endif
+ .align 4
+
+$L22:
+ ADD c09, t1, b5
+ fmov b5, c09
+ unop
+ MUL a1, b1, t1
+ unop
+
+ ADD c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD c13, t3, b5
+ fmov b5, c13
+ unop
+ MUL a1, b2, t3
+ ldi BO, 8 * SIZE(BO)
+
+ ADD c14, t4, b5
+ fmov b5, c14
+ unop
+ MUL a2, b2, t4
+ LD b2, -7 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b3, t1
+ unop
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b3, t2
+ LD b3, -6 * SIZE(BO)
+
+ ADD c05, t3, b5
+ fmov b5, c05
+ unop
+ MUL a1, b4, t3
+ LD a1, 2 * SIZE(AO)
+
+ ADD c06, t4, b5
+ fmov b5, c06
+ MUL a2, b4, t4
+ LD b5, -5 * SIZE(BO)
+ FIMOVD b5, tmp
+
+ ADD c09, t1, b5
+ fmov b5, c09
+ unop
+ MUL a3, b1, t1
+ LD a2, 3 * SIZE(AO)
+
+ ADD c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL a4, b1, t2
+ LD b1, -4 * SIZE(BO)
+
+ ADD c13, t3, b5
+ fmov b5, c13
+ unop
+ MUL a3, b2, t3
+ ldi AO, 4 * SIZE(AO)
+
+ ADD c14, t4, b5
+ fmov b5, c14
+ MUL a4, b2, t4
+ LD b2, -3 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ ldi L, -2(L)
+ MUL a3, b3, t1
+ LD b4, -1 * SIZE(BO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a4, b3, t2
+ LD b3, -2 * SIZE(BO)
+
+ ADD c05, t3, b5
+ fmov b5, c05
+ unop
+ IFMOVD tmp, b5
+ MUL a3, b5, t3
+ LD a3, 0 * SIZE(AO)
+
+ ADD c06, t4, b5
+ fmov b5, c06
+ IFMOVD tmp, b5
+ MUL a4, b5, t4
+ LD a4, 1 * SIZE(AO)
+ bgt L, $L22
+ .align 4
+
+$L25:
+ ADD c09, t1, b5
+ fmov b5, c09
+ fldd alpha, ALPHA
+ MUL a1, b1, t1
+#ifndef TRMMKERNEL
+ blbs K, $L28
+#else
+ blbs TMP1, $L28
+#endif
+
+ ADD c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD c13, t3, b5
+ fmov b5, c13
+ unop
+ MUL a1, b2, t3
+ unop
+
+ ADD c14, t4, b5
+ fmov b5, c14
+ unop
+ MUL a2, b2, t4
+ LD b2, 1 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b3, t1
+ ldi AO, 2 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b3, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD c05, t3, b5
+ fmov b5, c05
+ unop
+ MUL a1, b4, t3
+ LD a1, -2 * SIZE(AO)
+
+ ADD c06, t4, b5
+ fmov b5, c06
+ unop
+ MUL a2, b4, t4
+ LD a2, -1 * SIZE(AO)
+
+ ADD c09, t1, b5
+ fmov b5, c09
+ LD b4, 3 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L28:
+ ADD c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL a2, b1, t2
+#ifndef TRMMKERNEL
+ LD a3, 0 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD c13, t3, b5
+ fmov b5, c13
+ unop
+ MUL a1, b2, t3
+#ifndef TRMMKERNEL
+ LD a4, 1 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD c14, t4, b5
+ fmov b5, c14
+ unop
+ MUL a2, b2, t4
+#ifndef TRMMKERNEL
+ LD a5, 0 * SIZE(C2)
+#else
+ unop
+#endif
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b3, t1
+#ifndef TRMMKERNEL
+ LD b5, 1 * SIZE(C2)
+ FIMOVD b5, tmp
+#else
+ unop
+#endif
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b3, t2
+#ifndef TRMMKERNEL
+ LD b1, 0 * SIZE(C3)
+#else
+ unop
+#endif
+
+ ADD c05, t3, b5
+ fmov b5, c05
+ unop
+ MUL a1, b4, t3
+#ifndef TRMMKERNEL
+ LD b2, 1 * SIZE(C3)
+#else
+ unop
+#endif
+
+ ADD c06, t4, b5
+ fmov b5, c06
+ unop
+ MUL a2, b4, t4
+#ifndef TRMMKERNEL
+ LD b3, 0 * SIZE(C4)
+#else
+ unop
+#endif
+
+ ADD c09, t1, b5
+ fmov b5, c09
+ unop
+ MUL alpha, c01, b5
+ fmov b5, c01
+#ifndef TRMMKERNEL
+ LD b4, 1 * SIZE(C4)
+#else
+ unop
+#endif
+
+ ADD c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL alpha, c02, b5
+ fmov b5, c02
+ unop
+
+ ADD c13, t3, b5
+ fmov b5, c13
+ MUL alpha, c05, b5
+ fmov b5, c05
+ ADD c14, t4, b5
+ fmov b5, c14
+ MUL alpha, c06, b5
+ fmov b5, c06
+
+ MUL alpha, c09, b5
+ fmov b5, c09
+#ifndef TRMMKERNEL
+ ADD c01, a3, b5
+ fmov b5, c01
+#endif
+ MUL alpha, c10, b5
+ fmov b5, c10
+#ifndef TRMMKERNEL
+ ADD c02, a4, b5
+ fmov b5, c02
+#endif
+
+ MUL alpha, c13, b5
+ fmov b5, c13
+#ifndef TRMMKERNEL
+ ADD c05, a5, b5
+ fmov b5, c05
+#endif
+ MUL alpha, c14, b5
+ fmov b5, c14
+#ifndef TRMMKERNEL
+ IFMOVD tmp, b5
+ fstd b1, 88($sp)
+# FIMOVD b1, tmp
+ ADD c06, b5, b1
+ fmov b1, c06
+ fldd b1, 88($sp)
+# IFMOVD tmp, b1
+#endif
+
+#ifndef TRMMKERNEL
+ ADD c09, b1, b5
+ fmov b5, c09
+ unop
+#endif
+ ST c01, 0 * SIZE(C1)
+ fclr t1
+
+#ifndef TRMMKERNEL
+ ADD c10, b2, b5
+ fmov b5, c10
+ unop
+#endif
+ ST c02, 1 * SIZE(C1)
+ fclr t2
+
+#ifndef TRMMKERNEL
+ ADD c13, b3, b5
+ fmov b5, c13
+ unop
+#endif
+ ST c05, 0 * SIZE(C2)
+ fclr t3
+
+#ifndef TRMMKERNEL
+ ADD c14, b4, b5
+ fmov b5, c14
+ unop
+#endif
+ ST c06, 1 * SIZE(C2)
+ fclr t4
+
+ ST c09, 0 * SIZE(C3)
+ ldi C1, 2 * SIZE(C1)
+ ST c10, 1 * SIZE(C3)
+ ldi C2, 2 * SIZE(C2)
+
+ ST c13, 0 * SIZE(C4)
+ ldi C3, 2 * SIZE(C3)
+ ST c14, 1 * SIZE(C4)
+ ldi C4, 2 * SIZE(C4)
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+ subl K, KK, TMP1
+#ifdef LEFT
+ subl TMP1, 2, TMP1
+#else
+ subl TMP1, 4, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ addl KK, 2, KK
+#endif
+ .align 4
+
+$L30:
+ and M, 1, I
+ ble I, $L39
+
+#if !defined(TRMMKERNEL) || \
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+
+#ifdef TRMMKERNEL
+#ifdef LEFT
+ addl KK, 1, TMP1
+#else
+ addl KK, 4, TMP1
+#endif
+#endif
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+
+ LD b1, 0 * SIZE(B)
+#ifndef TRMMKERNEL
+ ldi L, -2(K)
+#else
+ ldi L, -2(TMP1)
+#endif
+ LD b2, 1 * SIZE(B)
+ ldi AO, 1 * SIZE(AO)
+
+ LD b3, 2 * SIZE(B)
+ fclr c09
+ LD b4, 3 * SIZE(B)
+ fclr c13
+
+ ldi BO, 4 * SIZE(B)
+ ble L, $L35
+#else
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl AO, TMP1, AO
+ sll KK, BASE_SHIFT + 2, TMP2
+ addl B, TMP2, BO
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+
+ LD b1, 0 * SIZE(BO)
+ ldi L, -2(TMP1)
+ LD b2, 1 * SIZE(BO)
+ ldi AO, 1 * SIZE(AO)
+
+ LD b3, 2 * SIZE(BO)
+ fclr c09
+ LD b4, 3 * SIZE(BO)
+ fclr c13
+
+ ldi BO, 4 * SIZE(BO)
+ ble L, $L35
+#endif
+ .align 4
+
+$L32:
+ ADD c01, t1, b5
+ fmov b5, c01
+ ldi L, -2(L)
+ MUL a1, b1, t1
+ LD b1, 0 * SIZE(BO)
+
+ ADD c05, t2, b5
+ fmov b5, c05
+ ldi AO, 2 * SIZE(AO)
+ MUL a1, b2, t2
+ LD b2, 1 * SIZE(BO)
+
+ ADD c09, t3, b5
+ fmov b5, c09
+ LD b5, 3 * SIZE(BO)
+ FIMOVD b5, tmp
+ MUL a1, b3, t3
+ LD b3, 2 * SIZE(BO)
+
+ ADD c13, t4, b5
+ fmov b5, c13
+ MUL a1, b4, t4
+ LD a1, -1 * SIZE(AO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL a2, b1, t1
+ LD b1, 4 * SIZE(BO)
+ ldi BO, 8 * SIZE(BO)
+
+ ADD c05, t2, b5
+ fmov b5, c05
+ MUL a2, b2, t2
+ LD b2, -3 * SIZE(BO)
+
+ ADD c09, t3, b5
+ fmov b5, c09
+ LD b4, -1 * SIZE(BO)
+ MUL a2, b3, t3
+ LD b3, -2 * SIZE(BO)
+
+ ADD c13, t4, b5
+ fmov b5, c13
+ IFMOVD tmp, b5
+ MUL a2, b5, t4
+ LD a2, 0 * SIZE(AO)
+ bgt L, $L32
+ .align 4
+
+$L35:
+ ADD c01, t1, b5
+ fmov b5, c01
+ fldd alpha, ALPHA
+ MUL a1, b1, t1
+#ifndef TRMMKERNEL
+ blbs K, $L38
+#else
+ blbs TMP1, $L38
+#endif
+ .align 4
+
+ ADD c05, t2, b5
+ fmov b5, c05
+ LD b1, 0 * SIZE(BO)
+ MUL a1, b2, t2
+ LD b2, 1 * SIZE(BO)
+
+ ADD c09, t3, b5
+ fmov b5, c09
+ MUL a1, b3, t3
+ LD b3, 2 * SIZE(BO)
+
+ ADD c13, t4, b5
+ fmov b5, c13
+ MUL a1, b4, t4
+ LD a1, 0 * SIZE(AO)
+ ldi AO, 1 * SIZE(AO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ LD b4, 3 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L38:
+ ADD c05, t2, b5
+ fmov b5, c05
+ unop
+ MUL a1, b2, t2
+#ifndef TRMMKERNEL
+ LD a5, 0 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD c09, t3, b5
+ fmov b5, c09
+ unop
+ MUL a1, b3, t3
+#ifndef TRMMKERNEL
+ LD b5, 0 * SIZE(C2)
+ FIMOVD b5, tmp
+#else
+ unop
+#endif
+
+ ADD c13, t4, b5
+ fmov b5, c13
+ unop
+ MUL a1, b4, t4
+#ifndef TRMMKERNEL
+ LD a2, 0 * SIZE(C3)
+#else
+ unop
+#endif
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL alpha, c01, b5
+ fmov b5, c01
+#ifndef TRMMKERNEL
+ LD a3, 0 * SIZE(C4)
+#else
+ unop
+#endif
+
+ ADD c05, t2, b5
+ fmov b5, c05
+ unop
+ MUL alpha, c05, b5
+ fmov b5, c05
+ unop
+
+ ADD c09, t3, b5
+ fmov b5, c09
+ MUL alpha, c09, b5
+ fmov b5, c09
+ ADD c13, t4, b5
+ fmov b5, c13
+ MUL alpha, c13, b5
+ fmov b5, c13
+
+#ifndef TRMMKERNEL
+ IFMOVD tmp, b5
+ fstd b1, 88($sp)
+# FIMOVD b1, tmp
+ ADD c01, a5, b1
+ fmov b1, c01
+ ADD c05, b5, b1
+ fmov b1, c05
+ ADD c09, a2, b1
+ fmov b1, c09
+ ADD c13, a3, b1
+ fmov b1, c13
+ fldd b1, 88($sp)
+# IFMOVD tmp, b1
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c05, 0 * SIZE(C2)
+ ST c09, 0 * SIZE(C3)
+ ST c13, 0 * SIZE(C4)
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+ subl K, KK, TMP1
+#ifdef LEFT
+ subl TMP1, 1, TMP1
+#else
+ subl TMP1, 4, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ addl KK, 1, KK
+#endif
+ .align 4
+
+$L39:
+ mov BO, B
+ ldi J, -1(J)
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addl KK, 4, KK
+#else
+ unop
+#endif
+ bgt J, $L01
+ .align 4
+
+$L40:
+ and N, 2, J
+ ble J, $L80
+
+ mov C, C1
+ addl C, LDC, C2
+ mov A, AO
+ fclr t1
+ addl C2, LDC, C
+ fclr t2
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mov OFFSET, KK
+#endif
+
+ sra M, 2, I
+ fclr t3
+ fclr t4
+ ble I, $L60
+ .align 4
+
+$L51:
+#if !defined(TRMMKERNEL) || \
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+
+#ifdef TRMMKERNEL
+#ifdef LEFT
+ addl KK, 4, TMP1
+#else
+ addl KK, 2, TMP1
+#endif
+#endif
+
+ LD a1, 0 * SIZE(AO)
+ fclr c03
+ LD a2, 1 * SIZE(AO)
+ fclr c07
+ LD a3, 2 * SIZE(AO)
+ fclr c04
+ LD a4, 3 * SIZE(AO)
+ fclr c08
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c05
+ LD b3, 2 * SIZE(B)
+ fclr c02
+ LD b4, 3 * SIZE(B)
+ fclr c06
+
+#ifndef TRMMKERNEL
+ ldi L, -2(K)
+#else
+ ldi L, -2(TMP1)
+#endif
+ ldi BO, 2 * SIZE(B)
+ ldi AO, 4 * SIZE(AO)
+ ble L, $L55
+#else
+ sll KK, BASE_SHIFT + 2, TMP1
+ addl AO, TMP1, AO
+ sll KK, BASE_SHIFT + 1, TMP2
+ addl B, TMP2, BO
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c03
+ LD a2, 1 * SIZE(AO)
+ fclr c07
+ LD a3, 2 * SIZE(AO)
+ fclr c04
+ LD a4, 3 * SIZE(AO)
+ fclr c08
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c05
+ LD b3, 2 * SIZE(BO)
+ fclr c02
+ LD b4, 3 * SIZE(BO)
+ fclr c06
+
+ ldi L, -2(TMP1)
+ ldi BO, 2 * SIZE(BO)
+ ldi AO, 4 * SIZE(AO)
+ ble L, $L55
+#endif
+ .align 4
+
+$L52:
+ ADD c05, t1, b5
+ fmov b5, c05
+ unop
+ MUL a1, b1, t1
+ unop
+
+ ADD c06, t2, b5
+ fmov b5, c06
+ ldi L, -2(L)
+ MUL a2, b1, t2
+ unop
+
+ ADD c07, t3, b5
+ fmov b5, c07
+ unop
+ MUL a3, b1, t3
+ unop
+
+ ADD c08, t4, b5
+ fmov b5, c08
+ unop
+ MUL a4, b1, t4
+ LD b1, 2 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b2, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ ldi BO, 4 * SIZE(BO)
+ MUL a2, b2, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ unop
+ MUL a3, b2, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ unop
+ MUL a4, b2, t4
+ LD a5, 3 * SIZE(AO)
+
+ ADD c05, t1, b5
+ fmov b5, c05
+ unop
+ MUL a1, b3, t1
+ LD b2, -1 * SIZE(BO)
+
+ ADD c06, t2, b5
+ fmov b5, c06
+ unop
+ MUL a2, b3, t2
+ unop
+
+ ADD c07, t3, b5
+ fmov b5, c07
+ unop
+ MUL a3, b3, t3
+ ldi AO, 8 * SIZE(AO)
+
+ ADD c08, t4, b5
+ fmov b5, c08
+ unop
+ MUL a5, b3, t4
+ LD b3, 0 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b4, t1
+ LD a1, -4 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b4, t2
+ LD a2, -3 * SIZE(AO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ LD a4, -1 * SIZE(AO)
+ MUL a3, b4, t3
+ LD a3, -2 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a5, b4, t4
+ LD b4, 1 * SIZE(BO)
+ bgt L, $L52
+ .align 4
+
+$L55:
+ ADD c05, t1, b5
+ fmov b5, c05
+ fldd alpha, ALPHA
+ MUL a1, b1, t1
+#ifndef TRMMKERNEL
+ blbs K, $L58
+#else
+ blbs TMP1, $L58
+#endif
+ .align 4
+
+ ADD c06, t2, b5
+ fmov b5, c06
+ MUL a2, b1, t2
+ ADD c07, t3, b5
+ fmov b5, c07
+ MUL a3, b1, t3
+
+ ADD c08, t4, b5
+ fmov b5, c08
+ unop
+ MUL a4, b1, t4
+ LD b1, 0 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b2, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b2, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ unop
+ MUL a3, b2, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a4, b2, t4
+ LD a4, 3 * SIZE(AO)
+ ldi AO, 4 * SIZE(AO)
+
+ ADD c05, t1, b5
+ fmov b5, c05
+ LD b2, 1 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi BO, 2 * SIZE(BO)
+ .align 4
+
+$L58:
+ ADD c06, t2, b5
+ fmov b5, c06
+ unop
+ MUL a2, b1, t2
+#ifndef TRMMKERNEL
+ LD c09, 0 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD c07, t3, b5
+ fmov b5, c07
+ unop
+ MUL a3, b1, t3
+#ifndef TRMMKERNEL
+ LD c10, 1 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD c08, t4, b5
+ fmov b5, c08
+ unop
+ MUL a4, b1, t4
+#ifndef TRMMKERNEL
+ LD c11, 2 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b2, t1
+#ifndef TRMMKERNEL
+ LD c12, 3 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b2, t2
+#ifndef TRMMKERNEL
+ LD c13, 0 * SIZE(C2)
+ unop
+#endif
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ unop
+ MUL a3, b2, t3
+#ifndef TRMMKERNEL
+ LD c14, 1 * SIZE(C2)
+#else
+ unop
+#endif
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ unop
+ MUL a4, b2, t4
+#ifndef TRMMKERNEL
+ LD c15, 2 * SIZE(C2)
+#else
+ unop
+#endif
+
+ ADD c05, t1, b5
+ fmov b5, c05
+ unop
+ MUL alpha, c01, b5
+ fmov b5, c01
+#ifndef TRMMKERNEL
+ LD c16, 3 * SIZE(C2)
+#else
+ unop
+#endif
+
+ ADD c06, t2, b5
+ fmov b5, c06
+ ldi I, -1(I)
+ MUL alpha, c02, b5
+ fmov b5, c02
+ unop
+
+ ADD c07, t3, b5
+ fmov b5, c07
+ MUL alpha, c03, b5
+ fmov b5, c03
+ ADD c08, t4, b5
+ fmov b5, c08
+ MUL alpha, c04, b5
+ fmov b5, c04
+
+ MUL alpha, c05, b5
+ fmov b5, c05
+#ifndef TRMMKERNEL
+ ADD c01, c09, b5
+ fmov b5, c01
+#endif
+ MUL alpha, c06, b5
+ fmov b5, c06
+#ifndef TRMMKERNEL
+ ADD c02, c10, b5
+ fmov b5, c02
+#endif
+
+ MUL alpha, c07, b5
+ fmov b5, c07
+#ifndef TRMMKERNEL
+ ADD c03, c11, b5
+ fmov b5, c03
+#endif
+ MUL alpha, c08, b5
+ fmov b5, c08
+#ifndef TRMMKERNEL
+ ADD c04, c12, b5
+ fmov b5, c04
+#endif
+
+#ifndef TRMMKERNEL
+ ADD c05, c13, b5
+ fmov b5, c05
+#endif
+ ST c01, 0 * SIZE(C1)
+#ifndef TRMMKERNEL
+ ADD c06, c14, b5
+ fmov b5, c06
+#endif
+ ST c02, 1 * SIZE(C1)
+
+#ifndef TRMMKERNEL
+ ADD c07, c15, b5
+ fmov b5, c07
+#endif
+ ST c03, 2 * SIZE(C1)
+#ifndef TRMMKERNEL
+ ADD c08, c16, b5
+ fmov b5, c08
+#endif
+ ST c04, 3 * SIZE(C1)
+
+ ST c05, 0 * SIZE(C2)
+ fclr t1
+ ST c06, 1 * SIZE(C2)
+ fclr t2
+ ST c07, 2 * SIZE(C2)
+ fclr t3
+ ST c08, 3 * SIZE(C2)
+ fclr t4
+
+ ldi C1, 4 * SIZE(C1)
+ ldi C2, 4 * SIZE(C2)
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+ subl K, KK, TMP1
+#ifdef LEFT
+ subl TMP1, 4, TMP1
+#else
+ subl TMP1, 2, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ addl KK, 4, KK
+#endif
+ bgt I, $L51
+ .align 4
+
+$L60:
+ and M, 2, I
+ ble I, $L70
+
+#if !defined(TRMMKERNEL) || \
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+
+#ifdef TRMMKERNEL
+#ifdef LEFT
+ addl KK, 2, TMP1
+#else
+ addl KK, 2, TMP1
+#endif
+#endif
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(B)
+#ifndef TRMMKERNEL
+ ldi L, -2(K)
+#else
+ ldi L, -2(TMP1)
+#endif
+ LD b2, 1 * SIZE(B)
+ ldi AO, 2 * SIZE(AO)
+
+ LD b3, 2 * SIZE(B)
+ LD b4, 3 * SIZE(B)
+ ldi BO, 2 * SIZE(B)
+ ble L, $L65
+#else
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl AO, TMP1, AO
+ sll KK, BASE_SHIFT + 1, TMP2
+ addl B, TMP2, BO
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(BO)
+ ldi L, -2(TMP1)
+ LD b2, 1 * SIZE(BO)
+ ldi AO, 2 * SIZE(AO)
+
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+ ldi BO, 2 * SIZE(BO)
+ ble L, $L65
+#endif
+ .align 4
+
+$L62:
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b1, t1
+ unop
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ ldi AO, 4 * SIZE(AO)
+ MUL a2, b1, t2
+ LD b1, 2 * SIZE(BO)
+
+ ADD c05, t3, b5
+ fmov b5, c05
+ ldi L, -2(L)
+ MUL a1, b2, t3
+ LD a1, -2 * SIZE(AO)
+
+ ADD c06, t4, b5
+ fmov b5, c06
+ unop
+ MUL a2, b2, t4
+ LD a2, -1 * SIZE(AO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ LD b2, 3 * SIZE(BO)
+ MUL a3, b3, t1
+ ldi BO, 4 * SIZE(BO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a4, b3, t2
+ LD b3, 0 * SIZE(BO)
+
+ ADD c05, t3, b5
+ fmov b5, c05
+ unop
+ MUL a3, b4, t3
+ LD a3, 0 * SIZE(AO)
+
+ ADD c06, t4, b5
+ fmov b5, c06
+ MUL a4, b4, t4
+ LD b4, 1 * SIZE(BO)
+ unop
+
+ LD a4, 1 * SIZE(AO)
+ unop
+ unop
+ bgt L, $L62
+ .align 4
+
+$L65:
+ ADD c01, t1, b5
+ fmov b5, c01
+ fldd alpha, ALPHA
+ MUL a1, b1, t1
+#ifndef TRMMKERNEL
+ blbs K, $L68
+#else
+ blbs TMP1, $L68
+#endif
+ .align 4
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD c05, t3, b5
+ fmov b5, c05
+ ldi BO, 2 * SIZE(BO)
+ MUL a1, b2, t3
+ LD a1, 0 * SIZE(AO)
+
+ ADD c06, t4, b5
+ fmov b5, c06
+ unop
+ MUL a2, b2, t4
+ LD a2, 1 * SIZE(AO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ LD b2, -1 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi AO, 2 * SIZE(AO)
+ .align 4
+
+$L68:
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b1, t2
+#ifndef TRMMKERNEL
+ LD c09, 0 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD c05, t3, b5
+ fmov b5, c05
+ unop
+ MUL a1, b2, t3
+#ifndef TRMMKERNEL
+ LD c10, 1 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD c06, t4, b5
+ fmov b5, c06
+ unop
+ MUL a2, b2, t4
+#ifndef TRMMKERNEL
+ LD c11, 0 * SIZE(C2)
+#else
+ unop
+#endif
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL alpha, c01, b5
+ fmov b5, c01
+#ifndef TRMMKERNEL
+ LD c12, 1 * SIZE(C2)
+#else
+ unop
+#endif
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ ldi C1, 2 * SIZE(C1)
+ MUL alpha, c02, b5
+ fmov b5, c02
+ ldi C2, 2 * SIZE(C2)
+
+ ADD c05, t3, b5
+ fmov b5, c05
+ MUL alpha, c05, b5
+ fmov b5, c05
+ ADD c06, t4, b5
+ fmov b5, c06
+ MUL alpha, c06, b5
+ fmov b5, c06
+
+#ifndef TRMMKERNEL
+ ADD c01, c09, b5
+ fmov b5, c01
+ ADD c02, c10, b5
+ fmov b5, c02
+ ADD c05, c11, b5
+ fmov b5, c05
+ ADD c06, c12, b5
+ fmov b5, c06
+#endif
+
+ ST c01, -2 * SIZE(C1)
+ fclr t1
+ ST c02, -1 * SIZE(C1)
+ fclr t2
+ ST c05, -2 * SIZE(C2)
+ fclr t3
+ ST c06, -1 * SIZE(C2)
+ fclr t4
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+ subl K, KK, TMP1
+#ifdef LEFT
+ subl TMP1, 2, TMP1
+#else
+ subl TMP1, 2, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ addl KK, 2, KK
+#endif
+ .align 4
+
+$L70:
+ and M, 1, I
+ ble I, $L79
+
+#if !defined(TRMMKERNEL) || \
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+
+#ifdef TRMMKERNEL
+#ifdef LEFT
+ addl KK, 1, TMP1
+#else
+ addl KK, 2, TMP1
+#endif
+#endif
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+
+ LD b1, 0 * SIZE(B)
+ fclr c02
+ LD b2, 1 * SIZE(B)
+ fclr c06
+
+#ifndef TRMMKERNEL
+ ldi L, -2(K)
+#else
+ ldi L, -2(TMP1)
+#endif
+
+ LD b3, 2 * SIZE(B)
+ ldi AO, 1 * SIZE(AO)
+ LD b4, 3 * SIZE(B)
+ ldi BO, 2 * SIZE(B)
+ ble L, $L75
+#else
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl AO, TMP1, AO
+ sll KK, BASE_SHIFT + 1, TMP2
+ addl B, TMP2, BO
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+
+ LD b1, 0 * SIZE(BO)
+ fclr c02
+ LD b2, 1 * SIZE(BO)
+ fclr c06
+
+#ifndef TRMMKERNEL
+ ldi L, -2(K)
+#else
+ ldi L, -2(TMP1)
+#endif
+
+ LD b3, 2 * SIZE(BO)
+ ldi AO, 1 * SIZE(AO)
+ LD b4, 3 * SIZE(BO)
+ ldi BO, 2 * SIZE(BO)
+ ble L, $L75
+#endif
+ .align 4
+
+$L72:
+ ADD c01, t1, b5
+ fmov b5, c01
+ ldi L, -2(L)
+ MUL a1, b1, t1
+ LD b1, 2 * SIZE(BO)
+
+ ADD c05, t2, b5
+ fmov b5, c05
+ MUL a1, b2, t2
+ LD a1, 1 * SIZE(AO)
+ LD b2, 3 * SIZE(BO)
+
+ ADD c02, t3, b5
+ fmov b5, c02
+ ldi AO, 2 * SIZE(AO)
+ MUL a2, b3, t3
+ LD b3, 4 * SIZE(BO)
+
+ ADD c06, t4, b5
+ fmov b5, c06
+ MUL a2, b4, t4
+ LD a2, 0 * SIZE(AO)
+ LD b4, 5 * SIZE(BO)
+
+ ldi BO, 4 * SIZE(BO)
+ unop
+ unop
+ bgt L, $L72
+ .align 4
+
+$L75:
+ ADD c01, t1, b5
+ fmov b5, c01
+ fldd alpha, ALPHA
+ MUL a1, b1, t1
+#ifndef TRMMKERNEL
+ blbs K, $L78
+#else
+ blbs TMP1, $L78
+#endif
+ .align 4
+
+ ADD c05, t2, b5
+ fmov b5, c05
+ MUL a1, b2, t2
+ LD a1, 0 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ LD b2, 1 * SIZE(BO)
+ ldi AO, 1 * SIZE(AO)
+ MUL a1, b1, t1
+ ldi BO, 2 * SIZE(BO)
+ .align 4
+
+$L78:
+ ADD c05, t2, b5
+ fmov b5, c05
+ MUL a1, b2, t2
+#ifndef TRMMKERNEL
+ LD a5, 0 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD c02, t3, b5
+ fmov b5, c02
+ ADD c06, t4, b5
+ fmov b5, c06
+#ifndef TRMMKERNEL
+ LD b5, 0 * SIZE(C2)
+ FIMOVD b5, tmp
+#else
+ unop
+#endif
+
+ ADD c01, c02, b5
+ fmov b5, c01
+ ADD c05, c06, b5
+ fmov b5, c05
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ ADD c05, t2, b5
+ fmov b5, c05
+
+ MUL alpha, c01, b5
+ fmov b5, c01
+ MUL alpha, c05, b5
+ fmov b5, c05
+
+#ifndef TRMMKERNEL
+	IFMOVD	tmp, b5
+ fstd b1, 88($sp)
+# FIMOVD b1, tmp
+ ADD c01, a5, b1
+ fmov b1, c01
+ ADD c05, b5, b1
+ fmov b1, c05
+ fldd b1, 88($sp)
+#	IFMOVD	tmp, b1
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c05, 0 * SIZE(C2)
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+ subl K, KK, TMP1
+#ifdef LEFT
+ subl TMP1, 1, TMP1
+#else
+ subl TMP1, 2, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ addl KK, 1, KK
+#endif
+ .align 4
+
+$L79:
+ mov BO, B
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addl KK, 2, KK
+#else
+ unop
+#endif
+ unop
+ unop
+ .align 4
+
+$L80:
+ and N, 1, J
+ ble J, $L999
+
+ mov C, C1
+ mov A, AO
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mov OFFSET, KK
+#endif
+
+ sra M, 2, I
+ ble I, $L100
+ .align 4
+
+$L91:
+#if !defined(TRMMKERNEL) || \
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+
+#ifdef TRMMKERNEL
+#ifdef LEFT
+ addl KK, 4, TMP1
+#else
+ addl KK, 1, TMP1
+#endif
+#endif
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c02
+ LD b3, 2 * SIZE(B)
+ fclr c03
+ LD b4, 3 * SIZE(B)
+ fclr c04
+
+#ifndef TRMMKERNEL
+ sra K, 2, L
+#else
+ sra TMP1, 2, L
+#endif
+ mov B, BO
+ unop
+ ble L, $L95
+#else
+ sll KK, BASE_SHIFT + 2, TMP1
+ addl AO, TMP1, AO
+ sll KK, BASE_SHIFT + 0, TMP2
+ addl B, TMP2, BO
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c02
+ LD b3, 2 * SIZE(BO)
+ fclr c03
+ LD b4, 3 * SIZE(BO)
+ fclr c04
+
+#ifndef TRMMKERNEL
+ sra K, 2, L
+#else
+ sra TMP1, 2, L
+#endif
+ unop
+ ble L, $L95
+#endif
+ .align 5
+
+$L92:
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b1, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ ldi L, -1(L)
+ MUL a2, b1, t2
+ LD a2, 5 * SIZE(AO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ unop
+ MUL a3, b1, t3
+ LD a3, 6 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a4, b1, t4
+ LD a4, 7 * SIZE(AO)
+ LD b1, 4 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b2, t1
+ LD a1, 8 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b2, t2
+ LD a2, 9 * SIZE(AO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ unop
+ MUL a3, b2, t3
+ LD a3, 10 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a4, b2, t4
+ LD a4, 11 * SIZE(AO)
+ LD b2, 5 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b3, t1
+ LD a1, 12 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b3, t2
+ LD a2, 13 * SIZE(AO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ unop
+ MUL a3, b3, t3
+ LD a3, 14 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a4, b3, t4
+ LD a5, 15 * SIZE(AO)
+ LD b3, 6 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL a1, b4, t1
+ LD a1, 16 * SIZE(AO)
+ ldi AO, 16 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ ldi BO, 4 * SIZE(BO)
+ MUL a2, b4, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ LD a4, 3 * SIZE(AO)
+ MUL a3, b4, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a5, b4, t4
+ LD b4, 3 * SIZE(BO)
+ bgt L, $L92
+ .align 4
+
+$L95:
+#ifndef TRMMKERNEL
+ and K, 3, L
+#else
+ and TMP1, 3, L
+#endif
+ fldd alpha, ALPHA
+ unop
+ ble L, $L98
+ .align 4
+
+$L96:
+ ADD c01, t1, b5
+ fmov b5, c01
+ ldi L, -1(L)
+ MUL a1, b1, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ ldi BO, 1 * SIZE(BO)
+ MUL a2, b1, t2
+ LD a2, 5 * SIZE(AO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ unop
+ MUL a3, b1, t3
+ LD a3, 6 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a4, b1, t4
+ LD a4, 7 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+
+ ldi AO, 4 * SIZE(AO)
+ bgt L, $L96
+ .align 4
+
+$L98:
+#ifndef TRMMKERNEL
+ ADD c01, t1, b5
+ fmov b5, c01
+ LD c05, 0 * SIZE(C1)
+ ADD c02, t2, b5
+ fmov b5, c02
+ LD c06, 1 * SIZE(C1)
+ ADD c03, t3, b5
+ fmov b5, c03
+ LD c07, 2 * SIZE(C1)
+ ADD c04, t4, b5
+ fmov b5, c04
+ LD c08, 3 * SIZE(C1)
+#else
+ ADD c01, t1, b5
+ fmov b5, c01
+ ADD c02, t2, b5
+ fmov b5, c02
+ ADD c03, t3, b5
+ fmov b5, c03
+ ADD c04, t4, b5
+ fmov b5, c04
+#endif
+
+ MUL alpha, c01, b5
+ fmov b5, c01
+ MUL alpha, c02, b5
+ fmov b5, c02
+ MUL alpha, c03, b5
+ fmov b5, c03
+ MUL alpha, c04, b5
+ fmov b5, c04
+
+#ifndef TRMMKERNEL
+ ADD c01, c05, b5
+ fmov b5, c01
+ ADD c02, c06, b5
+ fmov b5, c02
+ ADD c03, c07, b5
+ fmov b5, c03
+ ADD c04, c08, b5
+ fmov b5, c04
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c03, 2 * SIZE(C1)
+ ST c04, 3 * SIZE(C1)
+
+ ldi C1, 4 * SIZE(C1)
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+ subl K, KK, TMP1
+#ifdef LEFT
+ subl TMP1, 4, TMP1
+#else
+ subl TMP1, 1, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ addl KK, 4, KK
+#endif
+
+ ldi I, -1(I)
+ bgt I, $L91
+ .align 4
+
+$L100:
+ and M, 2, I
+ unop
+ unop
+ ble I, $L110
+ .align 4
+
+$L101:
+#if !defined(TRMMKERNEL) || \
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+
+#ifdef TRMMKERNEL
+#ifdef LEFT
+ addl KK, 2, TMP1
+#else
+ addl KK, 1, TMP1
+#endif
+#endif
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c02
+ LD b3, 2 * SIZE(B)
+ fclr c03
+ LD b4, 3 * SIZE(B)
+ fclr c04
+
+#ifndef TRMMKERNEL
+ sra K, 2, L
+#else
+ sra TMP1, 2, L
+#endif
+ mov B, BO
+ unop
+ ble L, $L105
+#else
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl AO, TMP1, AO
+ sll KK, BASE_SHIFT + 0, TMP2
+ addl B, TMP2, BO
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c02
+ LD b3, 2 * SIZE(BO)
+ fclr c03
+ LD b4, 3 * SIZE(BO)
+ fclr c04
+
+#ifndef TRMMKERNEL
+ sra K, 2, L
+#else
+ sra TMP1, 2, L
+#endif
+ unop
+ ble L, $L105
+#endif
+ .align 5
+
+$L102:
+ ADD c01, t1, b5
+ fmov b5, c01
+ ldi L, -1(L)
+ MUL a1, b1, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ MUL a2, b1, t2
+ LD a2, 5 * SIZE(AO)
+ LD b1, 4 * SIZE(BO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ ldi BO, 4 * SIZE(BO)
+ MUL a3, b2, t3
+ LD a3, 6 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a4, b2, t4
+ LD a5, 7 * SIZE(AO)
+ LD b2, 1 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL a1, b3, t1
+ LD a1, 8 * SIZE(AO)
+ ldi AO, 8 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ MUL a2, b3, t2
+ LD b3, 2 * SIZE(BO)
+ LD a2, 1 * SIZE(AO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ LD a4, 3 * SIZE(AO)
+ MUL a3, b4, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a5, b4, t4
+ LD b4, 3 * SIZE(BO)
+ bgt L, $L102
+ .align 4
+
+$L105:
+#ifndef TRMMKERNEL
+ and K, 3, L
+#else
+ and TMP1, 3, L
+#endif
+ fldd alpha, ALPHA
+#ifndef TRMMKERNEL
+ LD a3, 0 * SIZE(C1)
+ LD a4, 1 * SIZE(C1)
+#endif
+ ble L, $L108
+ .align 4
+
+$L106:
+ ADD c01, t1, b5
+ fmov b5, c01
+ ldi L, -1(L)
+ MUL a1, b1, t1
+ LD a1, 2 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ MUL a2, b1, t2
+ LD a2, 3 * SIZE(AO)
+ LD b1, 1 * SIZE(BO)
+
+ ldi AO, 2 * SIZE(AO)
+ unop
+ ldi BO, 1 * SIZE(BO)
+ bgt L, $L106
+ .align 4
+
+$L108:
+ ADD c01, t1, b5
+ fmov b5, c01
+ fclr t1
+ ADD c02, t2, b5
+ fmov b5, c02
+ fclr t2
+ ADD c03, t3, b5
+ fmov b5, c03
+ fclr t3
+ ADD c04, t4, b5
+ fmov b5, c04
+ fclr t4
+
+ ADD c01, c03, b5
+ fmov b5, c01
+ ADD c02, c04, b5
+ fmov b5, c02
+
+ MUL alpha, c01, b5
+ fmov b5, c01
+ MUL alpha, c02, b5
+ fmov b5, c02
+
+#ifndef TRMMKERNEL
+ ADD c01, a3, b5
+ fmov b5, c01
+ ADD c02, a4, b5
+ fmov b5, c02
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ldi C1, 2 * SIZE(C1)
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+ subl K, KK, TMP1
+#ifdef LEFT
+ subl TMP1, 2, TMP1
+#else
+ subl TMP1, 1, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ addl KK, 2, KK
+#endif
+ .align 4
+
+$L110:
+ and M, 1, I
+ ble I, $L999
+ .align 4
+
+$L111:
+#if !defined(TRMMKERNEL) || \
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+
+#ifdef TRMMKERNEL
+#ifdef LEFT
+ addl KK, 1, TMP1
+#else
+ addl KK, 1, TMP1
+#endif
+#endif
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c02
+ LD b3, 2 * SIZE(B)
+ fclr c03
+ LD b4, 3 * SIZE(B)
+ fclr c04
+
+#ifndef TRMMKERNEL
+ sra K, 2, L
+#else
+ sra TMP1, 2, L
+#endif
+ mov B, BO
+ unop
+ ble L, $L115
+#else
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl AO, TMP1, AO
+ sll KK, BASE_SHIFT + 0, TMP2
+ addl B, TMP2, BO
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c02
+ LD b3, 2 * SIZE(BO)
+ fclr c03
+ LD b4, 3 * SIZE(BO)
+ fclr c04
+
+#ifndef TRMMKERNEL
+ sra K, 2, L
+#else
+ sra TMP1, 2, L
+#endif
+ unop
+ ble L, $L115
+#endif
+ .align 4
+
+$L112:
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL a1, b1, t1
+ LD a1, 4 * SIZE(AO)
+ LD b1, 4 * SIZE(BO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ MUL a2, b2, t2
+ LD a2, 5 * SIZE(AO)
+ LD b2, 5 * SIZE(BO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ MUL a3, b3, t3
+ LD a3, 6 * SIZE(AO)
+ LD b3, 6 * SIZE(BO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a4, b4, t4
+ LD a4, 7 * SIZE(AO)
+ LD b4, 7 * SIZE(BO)
+
+ ldi L, -1(L)
+ ldi AO, 4 * SIZE(AO)
+ ldi BO, 4 * SIZE(BO)
+ bgt L, $L112
+ .align 4
+
+$L115:
+#ifndef TRMMKERNEL
+ and K, 3, L
+#else
+ and TMP1, 3, L
+#endif
+ fldd alpha, ALPHA
+#ifndef TRMMKERNEL
+ LD a2, 0 * SIZE(C1)
+#endif
+ ble L, $L118
+ .align 4
+
+$L116:
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL a1, b1, t1
+ LD a1, 1 * SIZE(AO)
+ LD b1, 1 * SIZE(BO)
+
+ ldi L, -1(L)
+ ldi AO, 1 * SIZE(AO)
+ ldi BO, 1 * SIZE(BO)
+ bgt L, $L116
+ .align 4
+
+$L118:
+ ADD c01, t1, b5
+ fmov b5, c01
+ ADD c02, t2, b5
+ fmov b5, c02
+ ADD c03, t3, b5
+ fmov b5, c03
+ ADD c04, t4, b5
+ fmov b5, c04
+
+ ADD c01, c02, b5
+ fmov b5, c01
+ ADD c03, c04, b5
+ fmov b5, c03
+ ADD c01, c03, b5
+ fmov b5, c01
+
+ MUL alpha, c01, b5
+ fmov b5, c01
+#ifndef TRMMKERNEL
+ ADD c01, a2, b5
+ fmov b5, c01
+#endif
+ ST c01, 0 * SIZE(C1)
+ .align 4
+
+$L999:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+ ldl $9, 80($sp)
+ clr $0
+ ldi $sp, STACKSIZE($sp)
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/gemm_kernel_4x4.S.bak b/kernel/sw_64/gemm_kernel_4x4.S.bak
new file mode 100644
index 0000000..10dc98d
--- /dev/null
+++ b/kernel/sw_64/gemm_kernel_4x4.S.bak
@@ -0,0 +1,2844 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#if !defined(SW2B)
+#error "Architecture is not specified."
+#endif
+
+#ifdef SW2B
+#define PREFETCHSIZE 56
+#define UNOP nop
+#endif
+
+
+#define STACKSIZE 80
+
+#define M $16
+#define N $17
+#define K $18
+#define A $20
+#define B $21
+#define C $22
+#define LDC $23
+
+#define C1 $19
+#define C2 $24
+#define C3 $25
+#define C4 $27
+
+#define AO $at
+#define BO $5
+#define I $6
+#define J $7
+#define L $8
+
+#define a1 $f16
+#define a2 $f17
+#define a3 $f18
+#define a4 $f19
+
+#define b1 $f20
+#define b2 $f21
+#define b3 $f22
+#define b4 $f23
+
+#define t1 $f24
+#define t2 $f25
+#define t3 $f26
+#define t4 $f27
+
+#define a5 $f28
+#define a6 $f30
+#define b5 $f29
+
+#define alpha $f30
+
+#define c01 $f0
+#define c02 $f1
+#define c03 $f2
+#define c04 $f3
+
+#define c05 $f4
+#define c06 $f5
+#define c07 $f6
+#define c08 $f7
+
+#define c09 $f8
+#define c10 $f9
+#define c11 $f10
+#define c12 $f11
+
+#define c13 $f12
+#define c14 $f13
+#define c15 $f14
+#define c16 $f15
+
+#define TMP1 $0
+#define TMP2 $1
+#define KK $2
+#define BB $3
+#define OFFSET $4
+
+#define ALPHA 64($sp)
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, STACKSIZE, $26, 0
+
+ ldi $sp, -STACKSIZE($sp)
+
+ ldl C, 0 + STACKSIZE($sp)
+ ldl LDC, 8 + STACKSIZE($sp)
+#ifdef TRMMKERNEL
+ ldl OFFSET, 16 + STACKSIZE($sp)
+#endif
+
+ SXADDQ LDC, 0, LDC
+
+ fstd $f2, 0($sp)
+ fstd $f3, 8($sp)
+ fstd $f4, 16($sp)
+ fstd $f5, 24($sp)
+ fstd $f6, 32($sp)
+ fstd $f7, 40($sp)
+ fstd $f8, 48($sp)
+ fstd $f9, 56($sp)
+ fstd $f19, ALPHA
+
+ cmple M, 0, $0
+ cmple N, 0, $1
+ cmple K, 0, $2
+
+ or $0, $1, $0
+ or $0, $2, $0
+ bne $0, $L999
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ subl $31, OFFSET, KK
+#endif
+
+ sra N, 2, J
+ ble J, $L40
+ .align 4
+
+$L01:
+ mov C, C1
+ addl C, LDC, C2
+ mov A, AO
+ s4addl K, 0, BB
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mov OFFSET, KK
+#endif
+
+ addl C2, LDC, C3
+ s4addl LDC, C, C
+
+ SXADDQ BB, B, BB
+ fclr t1
+ addl C3, LDC, C4
+ fclr t2
+
+ sra M, 2, I
+ fclr t3
+ fclr t4
+ ble I, $L20
+ .align 4
+
+$L11:
+#if defined(EV5) || defined(EV6) || defined(SW2B)
+ fillcs 0 * SIZE(BB)
+ fillcs 8 * SIZE(BB)
+ unop
+ ldi BB, 16 * SIZE(BB)
+#endif
+
+#if !defined(TRMMKERNEL) || \
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+
+#ifdef TRMMKERNEL
+#ifdef LEFT
+ addl KK, 4, TMP1
+#else
+ addl KK, 4, TMP1
+#endif
+#endif
+
+ LD a1, 0 * SIZE(AO)
+ fclr c11
+ LD a2, 1 * SIZE(AO)
+ fclr c12
+
+ LD a3, 2 * SIZE(AO)
+ fclr c16
+ LD a4, 3 * SIZE(AO)
+ fclr c15
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c02
+
+ LD b3, 2 * SIZE(B)
+ fclr c06
+ LD b4, 3 * SIZE(B)
+ fclr c05
+
+ fillcs 4 * SIZE(C1)
+ fclr c03
+#ifndef TRMMKERNEL
+ ldi L, -2(K)
+#else
+ ldi L, -2(TMP1)
+#endif
+ fclr c04
+
+ fillcs 7 * SIZE(C2)
+ fclr c08
+ ldi BO, 4 * SIZE(B)
+ fclr c13
+
+ fillcs 4 * SIZE(C3)
+ fclr c09
+ ldi AO, 4 * SIZE(AO)
+ fclr c10
+
+#else
+ sll KK, BASE_SHIFT + 2, TMP1
+ addl AO, TMP1, AO
+ addl B, TMP1, BO
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c11
+ LD a2, 1 * SIZE(AO)
+ fclr c12
+
+ LD a3, 2 * SIZE(AO)
+ fclr c16
+ LD a4, 3 * SIZE(AO)
+ fclr c15
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c02
+
+ LD b3, 2 * SIZE(BO)
+ fclr c06
+ LD b4, 3 * SIZE(BO)
+ fclr c05
+
+ fillcs 4 * SIZE(C1)
+ fclr c03
+ ldi L, -2(TMP1)
+ fclr c04
+
+ fillcs 7 * SIZE(C2)
+ fclr c08
+ ldi BO, 4 * SIZE(BO)
+ fclr c13
+
+ fillcs 4 * SIZE(C3)
+ fclr c09
+ ldi AO, 4 * SIZE(AO)
+ fclr c10
+#endif
+
+ fillcs 7 * SIZE(C4)
+ fclr c14
+ fclr c07
+ ble L, $L15
+ .align 5
+
+$L12:
+/* 1 */
+ ADD c11, t1, c11
+#ifndef EV4
+ fillcs PREFETCHSIZE * SIZE(AO)
+#else
+ unop
+#endif
+ MUL b1, a1, t1
+#ifndef EV4
+ fillcs PREFETCHSIZE * SIZE(BO)
+#else
+ unop
+#endif
+
+ ADD c12, t2, c12
+ unop
+ MUL b1, a2, t2
+ unop
+
+ ADD c16, t3, c16
+ unop
+ MUL b2, a2, t3
+ LD a5, 0 * SIZE(AO)
+
+ ADD c15, t4, c15
+ unop
+ MUL b2, a1, t4
+ LD b5, 0 * SIZE(BO)
+
+/* 2 */
+ ADD c01, t1, c01
+ UNOP
+ MUL b1, a3, t1
+ UNOP
+
+ ADD c02, t2, c02
+ UNOP
+ MUL b1, a4, t2
+ UNOP
+
+ ADD c06, t3, c06
+ unop
+ MUL b2, a4, t3
+ unop
+
+ ADD c05, t4, c05
+ unop
+ MUL b4, a1, t4
+ unop
+
+/* 3 */
+ ADD c03, t1, c03
+ unop
+ MUL b3, a1, t1
+ unop
+
+ ADD c04, t2, c04
+ unop
+ MUL b3, a2, t2
+ unop
+
+ ADD c08, t3, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, 1 * SIZE(AO)
+
+ ADD c13, t4, c13
+ unop
+ MUL b2, a3, t4
+ LD b2, 1 * SIZE(BO)
+
+/* 4 */
+ ADD c09, t1, c09
+ unop
+ MUL b3, a3, t1
+ LD a6, 2 * SIZE(AO)
+
+ ADD c10, t2, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD c14, t3, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, 3 * SIZE(AO)
+
+ ADD c07, t4, c07
+ unop
+ MUL b4, a3, t4
+ LD b4, 3 * SIZE(BO)
+
+/* 5 */
+ ADD c11, t1, c11
+ unop
+ MUL b5, a5, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD c12, t2, c12
+ ldi L, -2(L)
+ MUL b5, a2, t2
+ LD b1, 4 * SIZE(BO)
+
+ ADD c16, t3, c16
+ unop
+ MUL b2, a2, t3
+ unop
+
+ ADD c15, t4, c15
+ unop
+ MUL b2, a5, t4
+ unop
+
+/* 6 */
+ ADD c01, t1, c01
+ unop
+ MUL b5, a6, t1
+ unop
+
+ ADD c02, t2, c02
+ unop
+ MUL b5, a4, t2
+ unop
+
+ ADD c06, t3, c06
+ unop
+ MUL b2, a4, t3
+ unop
+
+ ADD c05, t4, c05
+ unop
+ MUL b4, a5, t4
+ unop
+
+/* 7 */
+ ADD c03, t1, c03
+ ldi AO, 8 * SIZE(AO)
+ MUL b3, a5, t1
+ unop
+
+ ADD c04, t2, c04
+ ldi BO, 8 * SIZE(BO)
+ MUL b3, a2, t2
+ unop
+
+ ADD c08, t3, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, -3 * SIZE(AO)
+
+ ADD c13, t4, c13
+ unop
+ MUL b2, a6, t4
+ LD b2, -3 * SIZE(BO)
+
+/* 8 */
+ ADD c09, t1, c09
+ unop
+ MUL b3, a6, t1
+ LD a3, -2 * SIZE(AO)
+
+ ADD c10, t2, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, -2 * SIZE(BO)
+
+ ADD c14, t3, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, -1 * SIZE(AO)
+
+ ADD c07, t4, c07
+ MUL b4, a6, t4
+ LD b4, -1 * SIZE(BO)
+ bgt L, $L12
+ .align 4
+
+$L15:
+ ADD c11, t1, c11
+ fldd alpha, ALPHA
+ MUL b1, a1, t1
+#ifndef TRMMKERNEL
+ blbs K, $L18
+#else
+ blbs TMP1, $L18
+#endif
+ .align 4
+
+ ADD c12, t2, c12
+ MUL b1, a2, t2
+ ADD c16, t3, c16
+ MUL b2, a2, t3
+
+ ADD c15, t4, c15
+ MUL b2, a1, t4
+ ADD c01, t1, c01
+ MUL b1, a3, t1
+
+ ADD c02, t2, c02
+ unop
+ MUL b1, a4, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD c06, t3, c06
+ MUL b2, a4, t3
+ ADD c05, t4, c05
+ MUL b4, a1, t4
+
+ ADD c03, t1, c03
+ unop
+ MUL b3, a1, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD c04, t2, c04
+ unop
+ MUL b3, a2, t2
+ unop
+
+ ADD c08, t3, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, 1 * SIZE(AO)
+
+ ADD c13, t4, c13
+ unop
+ MUL b2, a3, t4
+ LD b2, 1 * SIZE(BO)
+
+ ADD c09, t1, c09
+ unop
+ MUL b3, a3, t1
+ ldi AO, 4 * SIZE(AO)
+
+ ADD c10, t2, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD c14, t3, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, -1 * SIZE(AO)
+
+ ADD c07, t4, c07
+ unop
+ MUL b4, a3, t4
+ LD a3, -2 * SIZE(AO)
+
+ ADD c11, t1, c11
+ LD b4, 3 * SIZE(BO)
+ MUL b1, a1, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L18:
+ ADD c12, t2, c12
+ unop
+ MUL b1, a2, t2
+#ifndef TRMMKERNEL
+ LD a5, 0 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD c16, t3, c16
+ unop
+ MUL b2, a2, t3
+ unop
+
+ ADD c15, t4, c15
+ unop
+ MUL b2, a1, t4
+#ifndef TRMMKERNEL
+ LD b5, 1 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD c01, t1, c01
+ unop
+ MUL b1, a3, t1
+ unop
+
+ ADD c02, t2, c02
+ unop
+ MUL b1, a4, t2
+#ifndef TRMMKERNEL
+ LD b1, 0 * SIZE(C2)
+#else
+ unop
+#endif
+
+ ADD c06, t3, c06
+ unop
+ MUL b2, a4, t3
+ unop
+
+ ADD c05, t4, c05
+ unop
+ MUL b4, a1, t4
+ unop
+
+ ADD c03, t1, c03
+ unop
+ MUL b3, a1, t1
+ unop
+
+ ADD c04, t2, c04
+ unop
+ MUL b3, a2, t2
+#ifndef TRMMKERNEL
+ LD a1, 0 * SIZE(C3)
+#else
+ unop
+#endif
+
+ ADD c08, t3, c08
+ unop
+ MUL b4, a2, t3
+#ifndef TRMMKERNEL
+ LD a2, 2 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD c13, t4, c13
+ unop
+ MUL b2, a3, t4
+#ifndef TRMMKERNEL
+ LD b2, 3 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD c09, t1, c09
+ ldi I, -1(I)
+ MUL b3, a3, t1
+ unop
+
+ ADD c10, t2, c10
+ unop
+ MUL b3, a4, t2
+#ifndef TRMMKERNEL
+ LD b3, 0 * SIZE(C4)
+#else
+ unop
+#endif
+
+ ADD c14, t3, c14
+ unop
+ MUL b4, a4, t3
+#ifndef TRMMKERNEL
+ LD a4, 1 * SIZE(C2)
+#else
+ unop
+#endif
+
+ ADD c07, t4, c07
+ unop
+ MUL b4, a3, t4
+#ifndef TRMMKERNEL
+ LD a3, 2 * SIZE(C2)
+#else
+ unop
+#endif
+
+ ADD c11, t1, c11
+ unop
+ MUL alpha, c01, c01
+#ifndef TRMMKERNEL
+ LD b4, 3 * SIZE(C2)
+#else
+ unop
+#endif
+
+ ADD c12, t2, c12
+ unop
+ MUL alpha, c02, c02
+#ifndef TRMMKERNEL
+ LD t1, 1 * SIZE(C3)
+#else
+ unop
+#endif
+
+ ADD c16, t3, c16
+ unop
+ MUL alpha, c03, c03
+#ifndef TRMMKERNEL
+ LD t2, 2 * SIZE(C3)
+#else
+ unop
+#endif
+
+ ADD c15, t4, c15
+ unop
+ MUL alpha, c04, c04
+#ifndef TRMMKERNEL
+ LD t3, 3 * SIZE(C3)
+#else
+ unop
+#endif
+
+ MUL alpha, c05, c05
+ unop
+#ifndef TRMMKERNEL
+ ADD c01, a5, c01
+ LD t4, 1 * SIZE(C4)
+#else
+ unop
+ unop
+#endif
+
+ MUL alpha, c06, c06
+#ifndef TRMMKERNEL
+ unop
+ ADD c02, b5, c02
+ LD a5, 2 * SIZE(C4)
+#endif
+
+ MUL alpha, c07, c07
+#ifndef TRMMKERNEL
+ unop
+ ADD c03, a2, c03
+ LD b5, 3 * SIZE(C4)
+#endif
+
+ MUL alpha, c08, c08
+#ifndef TRMMKERNEL
+ unop
+ ADD c04, b2, c04
+ unop
+#endif
+
+ MUL alpha, c09, c09
+ ST c01, 0 * SIZE(C1)
+#ifndef TRMMKERNEL
+ ADD c05, b1, c05
+ unop
+#endif
+
+ MUL alpha, c10, c10
+ ST c02, 1 * SIZE(C1)
+#ifndef TRMMKERNEL
+ ADD c06, a4, c06
+ unop
+#endif
+
+ MUL alpha, c11, c11
+ ST c03, 2 * SIZE(C1)
+#ifndef TRMMKERNEL
+ ADD c07, a3, c07
+ unop
+#endif
+
+ MUL alpha, c12, c12
+ ST c04, 3 * SIZE(C1)
+#ifndef TRMMKERNEL
+ ADD c08, b4, c08
+#else
+ unop
+#endif
+ ldi C1, 4 * SIZE(C1)
+
+ MUL alpha, c13, c13
+ ST c05, 0 * SIZE(C2)
+#ifndef TRMMKERNEL
+ ADD c09, a1, c09
+ unop
+#endif
+
+ MUL alpha, c14, c14
+ ST c06, 1 * SIZE(C2)
+#ifndef TRMMKERNEL
+ ADD c10, t1, c10
+ unop
+#endif
+
+ MUL alpha, c15, c15
+ ST c07, 2 * SIZE(C2)
+#ifndef TRMMKERNEL
+ ADD c11, t2, c11
+ unop
+#endif
+
+ MUL alpha, c16, c16
+ ST c08, 3 * SIZE(C2)
+#ifndef TRMMKERNEL
+ ADD c12, t3, c12
+#else
+ unop
+#endif
+ ldi C2, 4 * SIZE(C2)
+
+#ifndef TRMMKERNEL
+ ADD c13, b3, c13
+#else
+ unop
+#endif
+ ST c09, 0 * SIZE(C3)
+ fclr t1
+ ldi C4, 4 * SIZE(C4)
+
+#ifndef TRMMKERNEL
+ ADD c14, t4, c14
+#else
+ unop
+#endif
+ ST c10, 1 * SIZE(C3)
+ fclr t2
+ unop
+
+#ifndef TRMMKERNEL
+ ADD c15, a5, c15
+#else
+ unop
+#endif
+ ST c11, 2 * SIZE(C3)
+ fclr t3
+ unop
+
+#ifndef TRMMKERNEL
+ ADD c16, b5, c16
+#else
+ unop
+#endif
+ ST c12, 3 * SIZE(C3)
+ fclr t4
+ ldi C3, 4 * SIZE(C3)
+
+ ST c13, -4 * SIZE(C4)
+ ST c14, -3 * SIZE(C4)
+ ST c15, -2 * SIZE(C4)
+ ST c16, -1 * SIZE(C4)
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+ subl K, KK, TMP1
+#ifdef LEFT
+ subl TMP1, 4, TMP1
+#else
+ subl TMP1, 4, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 2, TMP1
+ addl AO, TMP1, AO
+ addl BO, TMP1, BO
+#endif
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ addl KK, 4, KK
+#endif
+
+ bgt I, $L11
+ .align 4
+
+$L20:
+ and M, 2, I
+ ble I, $L30
+
+#if !defined(TRMMKERNEL) || \
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+
+#ifdef TRMMKERNEL
+#ifdef LEFT
+ addl KK, 2, TMP1
+#else
+ addl KK, 4, TMP1
+#endif
+#endif
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c10
+ LD a4, 3 * SIZE(AO)
+ fclr c14
+
+ LD b1, 0 * SIZE(B)
+#ifndef TRMMKERNEL
+ ldi L, -2(K)
+#else
+ ldi L, -2(TMP1)
+#endif
+ LD b2, 1 * SIZE(B)
+ ldi AO, 2 * SIZE(AO)
+
+ LD b3, 2 * SIZE(B)
+ fclr c01
+ LD b4, 3 * SIZE(B)
+ fclr c05
+
+ ldi BO, 4 * SIZE(B)
+ fclr c02
+ fclr c06
+ ble L, $L25
+
+#else
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl AO, TMP1, AO
+ sll KK, BASE_SHIFT + 2, TMP2
+ addl B, TMP2, BO
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c10
+ LD a4, 3 * SIZE(AO)
+ fclr c14
+
+ LD b1, 0 * SIZE(BO)
+ ldi L, -2(TMP1)
+ LD b2, 1 * SIZE(BO)
+ ldi AO, 2 * SIZE(AO)
+
+ LD b3, 2 * SIZE(BO)
+ fclr c01
+ LD b4, 3 * SIZE(BO)
+ fclr c05
+
+ ldi BO, 4 * SIZE(BO)
+ fclr c02
+ fclr c06
+ ble L, $L25
+#endif
+ .align 4
+
+$L22:
+ ADD c09, t1, c09
+ unop
+ MUL a1, b1, t1
+ unop
+
+ ADD c10, t2, c10
+ unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD c13, t3, c13
+ unop
+ MUL a1, b2, t3
+ ldi BO, 8 * SIZE(BO)
+
+ ADD c14, t4, c14
+ unop
+ MUL a2, b2, t4
+ LD b2, -7 * SIZE(BO)
+
+ ADD c01, t1, c01
+ unop
+ MUL a1, b3, t1
+ unop
+
+ ADD c02, t2, c02
+ unop
+ MUL a2, b3, t2
+ LD b3, -6 * SIZE(BO)
+
+ ADD c05, t3, c05
+ unop
+ MUL a1, b4, t3
+ LD a1, 2 * SIZE(AO)
+
+ ADD c06, t4, c06
+ MUL a2, b4, t4
+ LD b5, -5 * SIZE(BO)
+
+ ADD c09, t1, c09
+ unop
+ MUL a3, b1, t1
+ LD a2, 3 * SIZE(AO)
+
+ ADD c10, t2, c10
+ unop
+ MUL a4, b1, t2
+ LD b1, -4 * SIZE(BO)
+
+ ADD c13, t3, c13
+ unop
+ MUL a3, b2, t3
+ ldi AO, 4 * SIZE(AO)
+
+ ADD c14, t4, c14
+ MUL a4, b2, t4
+ LD b2, -3 * SIZE(BO)
+
+ ADD c01, t1, c01
+ ldi L, -2(L)
+ MUL a3, b3, t1
+ LD b4, -1 * SIZE(BO)
+
+ ADD c02, t2, c02
+ unop
+ MUL a4, b3, t2
+ LD b3, -2 * SIZE(BO)
+
+ ADD c05, t3, c05
+ unop
+ MUL a3, b5, t3
+ LD a3, 0 * SIZE(AO)
+
+ ADD c06, t4, c06
+ MUL a4, b5, t4
+ LD a4, 1 * SIZE(AO)
+ bgt L, $L22
+ .align 4
+
+$L25:
+ ADD c09, t1, c09
+ fldd alpha, ALPHA
+ MUL a1, b1, t1
+#ifndef TRMMKERNEL
+ blbs K, $L28
+#else
+ blbs TMP1, $L28
+#endif
+
+ ADD c10, t2, c10
+ unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD c13, t3, c13
+ unop
+ MUL a1, b2, t3
+ unop
+
+ ADD c14, t4, c14
+ unop
+ MUL a2, b2, t4
+ LD b2, 1 * SIZE(BO)
+
+ ADD c01, t1, c01
+ unop
+ MUL a1, b3, t1
+ ldi AO, 2 * SIZE(AO)
+
+ ADD c02, t2, c02
+ unop
+ MUL a2, b3, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD c05, t3, c05
+ unop
+ MUL a1, b4, t3
+ LD a1, -2 * SIZE(AO)
+
+ ADD c06, t4, c06
+ unop
+ MUL a2, b4, t4
+ LD a2, -1 * SIZE(AO)
+
+ ADD c09, t1, c09
+ LD b4, 3 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L28:
+ ADD c10, t2, c10
+ unop
+ MUL a2, b1, t2
+#ifndef TRMMKERNEL
+ LD a3, 0 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD c13, t3, c13
+ unop
+ MUL a1, b2, t3
+#ifndef TRMMKERNEL
+ LD a4, 1 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD c14, t4, c14
+ unop
+ MUL a2, b2, t4
+#ifndef TRMMKERNEL
+ LD a5, 0 * SIZE(C2)
+#else
+ unop
+#endif
+
+ ADD c01, t1, c01
+ unop
+ MUL a1, b3, t1
+#ifndef TRMMKERNEL
+ LD b5, 1 * SIZE(C2)
+#else
+ unop
+#endif
+
+ ADD c02, t2, c02
+ unop
+ MUL a2, b3, t2
+#ifndef TRMMKERNEL
+ LD b1, 0 * SIZE(C3)
+#else
+ unop
+#endif
+
+ ADD c05, t3, c05
+ unop
+ MUL a1, b4, t3
+#ifndef TRMMKERNEL
+ LD b2, 1 * SIZE(C3)
+#else
+ unop
+#endif
+
+ ADD c06, t4, c06
+ unop
+ MUL a2, b4, t4
+#ifndef TRMMKERNEL
+ LD b3, 0 * SIZE(C4)
+#else
+ unop
+#endif
+
+ ADD c09, t1, c09
+ unop
+ MUL alpha, c01, c01
+#ifndef TRMMKERNEL
+ LD b4, 1 * SIZE(C4)
+#else
+ unop
+#endif
+
+ ADD c10, t2, c10
+ unop
+ MUL alpha, c02, c02
+ unop
+
+ ADD c13, t3, c13
+ MUL alpha, c05, c05
+ ADD c14, t4, c14
+ MUL alpha, c06, c06
+
+ MUL alpha, c09, c09
+#ifndef TRMMKERNEL
+ ADD c01, a3, c01
+#endif
+ MUL alpha, c10, c10
+#ifndef TRMMKERNEL
+ ADD c02, a4, c02
+#endif
+
+ MUL alpha, c13, c13
+#ifndef TRMMKERNEL
+ ADD c05, a5, c05
+#endif
+ MUL alpha, c14, c14
+#ifndef TRMMKERNEL
+ ADD c06, b5, c06
+#endif
+
+#ifndef TRMMKERNEL
+ ADD c09, b1, c09
+ unop
+#endif
+ ST c01, 0 * SIZE(C1)
+ fclr t1
+
+#ifndef TRMMKERNEL
+ ADD c10, b2, c10
+ unop
+#endif
+ ST c02, 1 * SIZE(C1)
+ fclr t2
+
+#ifndef TRMMKERNEL
+ ADD c13, b3, c13
+ unop
+#endif
+ ST c05, 0 * SIZE(C2)
+ fclr t3
+
+#ifndef TRMMKERNEL
+ ADD c14, b4, c14
+ unop
+#endif
+ ST c06, 1 * SIZE(C2)
+ fclr t4
+
+ ST c09, 0 * SIZE(C3)
+ ldi C1, 2 * SIZE(C1)
+ ST c10, 1 * SIZE(C3)
+ ldi C2, 2 * SIZE(C2)
+
+ ST c13, 0 * SIZE(C4)
+ ldi C3, 2 * SIZE(C3)
+ ST c14, 1 * SIZE(C4)
+ ldi C4, 2 * SIZE(C4)
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+ subl K, KK, TMP1
+#ifdef LEFT
+ subl TMP1, 2, TMP1
+#else
+ subl TMP1, 4, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ addl KK, 2, KK
+#endif
+ .align 4
+
+$L30:
+ and M, 1, I
+ ble I, $L39
+
+#if !defined(TRMMKERNEL) || \
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+
+#ifdef TRMMKERNEL
+#ifdef LEFT
+ addl KK, 1, TMP1
+#else
+ addl KK, 4, TMP1
+#endif
+#endif
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+
+ LD b1, 0 * SIZE(B)
+#ifndef TRMMKERNEL
+ ldi L, -2(K)
+#else
+ ldi L, -2(TMP1)
+#endif
+ LD b2, 1 * SIZE(B)
+ ldi AO, 1 * SIZE(AO)
+
+ LD b3, 2 * SIZE(B)
+ fclr c09
+ LD b4, 3 * SIZE(B)
+ fclr c13
+
+ ldi BO, 4 * SIZE(B)
+ ble L, $L35
+#else
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl AO, TMP1, AO
+ sll KK, BASE_SHIFT + 2, TMP2
+ addl B, TMP2, BO
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+
+ LD b1, 0 * SIZE(BO)
+ ldi L, -2(TMP1)
+ LD b2, 1 * SIZE(BO)
+ ldi AO, 1 * SIZE(AO)
+
+ LD b3, 2 * SIZE(BO)
+ fclr c09
+ LD b4, 3 * SIZE(BO)
+ fclr c13
+
+ ldi BO, 4 * SIZE(BO)
+ ble L, $L35
+#endif
+ .align 4
+
+$L32:
+ ADD c01, t1, c01
+ ldi L, -2(L)
+ MUL a1, b1, t1
+ LD b1, 0 * SIZE(BO)
+
+ ADD c05, t2, c05
+ ldi AO, 2 * SIZE(AO)
+ MUL a1, b2, t2
+ LD b2, 1 * SIZE(BO)
+
+ ADD c09, t3, c09
+ LD b5, 3 * SIZE(BO)
+ MUL a1, b3, t3
+ LD b3, 2 * SIZE(BO)
+
+ ADD c13, t4, c13
+ MUL a1, b4, t4
+ LD a1, -1 * SIZE(AO)
+
+ ADD c01, t1, c01
+ MUL a2, b1, t1
+ LD b1, 4 * SIZE(BO)
+ ldi BO, 8 * SIZE(BO)
+
+ ADD c05, t2, c05
+ MUL a2, b2, t2
+ LD b2, -3 * SIZE(BO)
+
+ ADD c09, t3, c09
+ LD b4, -1 * SIZE(BO)
+ MUL a2, b3, t3
+ LD b3, -2 * SIZE(BO)
+
+ ADD c13, t4, c13
+ MUL a2, b5, t4
+ LD a2, 0 * SIZE(AO)
+ bgt L, $L32
+ .align 4
+
+$L35:
+ ADD c01, t1, c01
+ fldd alpha, ALPHA
+ MUL a1, b1, t1
+#ifndef TRMMKERNEL
+ blbs K, $L38
+#else
+ blbs TMP1, $L38
+#endif
+ .align 4
+
+ ADD c05, t2, c05
+ LD b1, 0 * SIZE(BO)
+ MUL a1, b2, t2
+ LD b2, 1 * SIZE(BO)
+
+ ADD c09, t3, c09
+ MUL a1, b3, t3
+ LD b3, 2 * SIZE(BO)
+
+ ADD c13, t4, c13
+ MUL a1, b4, t4
+ LD a1, 0 * SIZE(AO)
+ ldi AO, 1 * SIZE(AO)
+
+ ADD c01, t1, c01
+ LD b4, 3 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L38:
+ ADD c05, t2, c05
+ unop
+ MUL a1, b2, t2
+#ifndef TRMMKERNEL
+ LD a5, 0 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD c09, t3, c09
+ unop
+ MUL a1, b3, t3
+#ifndef TRMMKERNEL
+ LD b5, 0 * SIZE(C2)
+#else
+ unop
+#endif
+
+ ADD c13, t4, c13
+ unop
+ MUL a1, b4, t4
+#ifndef TRMMKERNEL
+ LD a2, 0 * SIZE(C3)
+#else
+ unop
+#endif
+
+ ADD c01, t1, c01
+ unop
+ MUL alpha, c01, c01
+#ifndef TRMMKERNEL
+ LD a3, 0 * SIZE(C4)
+#else
+ unop
+#endif
+
+ ADD c05, t2, c05
+ unop
+ MUL alpha, c05, c05
+ unop
+
+ ADD c09, t3, c09
+ MUL alpha, c09, c09
+ ADD c13, t4, c13
+ MUL alpha, c13, c13
+
+#ifndef TRMMKERNEL
+ ADD c01, a5, c01
+ ADD c05, b5, c05
+ ADD c09, a2, c09
+ ADD c13, a3, c13
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c05, 0 * SIZE(C2)
+ ST c09, 0 * SIZE(C3)
+ ST c13, 0 * SIZE(C4)
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+ subl K, KK, TMP1
+#ifdef LEFT
+ subl TMP1, 1, TMP1
+#else
+ subl TMP1, 4, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ addl KK, 1, KK
+#endif
+ .align 4
+
+$L39:
+ mov BO, B
+ ldi J, -1(J)
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addl KK, 4, KK
+#else
+ unop
+#endif
+ bgt J, $L01
+ .align 4
+
+$L40:
+ and N, 2, J
+ ble J, $L80
+
+ mov C, C1
+ addl C, LDC, C2
+ mov A, AO
+ fclr t1
+ addl C2, LDC, C
+ fclr t2
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mov OFFSET, KK
+#endif
+
+ sra M, 2, I
+ fclr t3
+ fclr t4
+ ble I, $L60
+ .align 4
+
+$L51:
+#if !defined(TRMMKERNEL) || \
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+
+#ifdef TRMMKERNEL
+#ifdef LEFT
+ addl KK, 4, TMP1
+#else
+ addl KK, 2, TMP1
+#endif
+#endif
+
+ LD a1, 0 * SIZE(AO)
+ fclr c03
+ LD a2, 1 * SIZE(AO)
+ fclr c07
+ LD a3, 2 * SIZE(AO)
+ fclr c04
+ LD a4, 3 * SIZE(AO)
+ fclr c08
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c05
+ LD b3, 2 * SIZE(B)
+ fclr c02
+ LD b4, 3 * SIZE(B)
+ fclr c06
+
+#ifndef TRMMKERNEL
+ ldi L, -2(K)
+#else
+ ldi L, -2(TMP1)
+#endif
+ ldi BO, 2 * SIZE(B)
+ ldi AO, 4 * SIZE(AO)
+ ble L, $L55
+#else
+ sll KK, BASE_SHIFT + 2, TMP1
+ addl AO, TMP1, AO
+ sll KK, BASE_SHIFT + 1, TMP2
+ addl B, TMP2, BO
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c03
+ LD a2, 1 * SIZE(AO)
+ fclr c07
+ LD a3, 2 * SIZE(AO)
+ fclr c04
+ LD a4, 3 * SIZE(AO)
+ fclr c08
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c05
+ LD b3, 2 * SIZE(BO)
+ fclr c02
+ LD b4, 3 * SIZE(BO)
+ fclr c06
+
+ ldi L, -2(TMP1)
+ ldi BO, 2 * SIZE(BO)
+ ldi AO, 4 * SIZE(AO)
+ ble L, $L55
+#endif
+ .align 4
+
+$L52:
+ ADD c05, t1, c05
+ unop
+ MUL a1, b1, t1
+ unop
+
+ ADD c06, t2, c06
+ ldi L, -2(L)
+ MUL a2, b1, t2
+ unop
+
+ ADD c07, t3, c07
+ unop
+ MUL a3, b1, t3
+ unop
+
+ ADD c08, t4, c08
+ unop
+ MUL a4, b1, t4
+ LD b1, 2 * SIZE(BO)
+
+ ADD c01, t1, c01
+ unop
+ MUL a1, b2, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD c02, t2, c02
+ ldi BO, 4 * SIZE(BO)
+ MUL a2, b2, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD c03, t3, c03
+ unop
+ MUL a3, b2, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD c04, t4, c04
+ unop
+ MUL a4, b2, t4
+ LD a5, 3 * SIZE(AO)
+
+ ADD c05, t1, c05
+ unop
+ MUL a1, b3, t1
+ LD b2, -1 * SIZE(BO)
+
+ ADD c06, t2, c06
+ unop
+ MUL a2, b3, t2
+ unop
+
+ ADD c07, t3, c07
+ unop
+ MUL a3, b3, t3
+ ldi AO, 8 * SIZE(AO)
+
+ ADD c08, t4, c08
+ unop
+ MUL a5, b3, t4
+ LD b3, 0 * SIZE(BO)
+
+ ADD c01, t1, c01
+ unop
+ MUL a1, b4, t1
+ LD a1, -4 * SIZE(AO)
+
+ ADD c02, t2, c02
+ unop
+ MUL a2, b4, t2
+ LD a2, -3 * SIZE(AO)
+
+ ADD c03, t3, c03
+ LD a4, -1 * SIZE(AO)
+ MUL a3, b4, t3
+ LD a3, -2 * SIZE(AO)
+
+ ADD c04, t4, c04
+ MUL a5, b4, t4
+ LD b4, 1 * SIZE(BO)
+ bgt L, $L52
+ .align 4
+
+$L55:
+ ADD c05, t1, c05
+ fldd alpha, ALPHA
+ MUL a1, b1, t1
+#ifndef TRMMKERNEL
+ blbs K, $L58
+#else
+ blbs TMP1, $L58
+#endif
+ .align 4
+
+ ADD c06, t2, c06
+ MUL a2, b1, t2
+ ADD c07, t3, c07
+ MUL a3, b1, t3
+
+ ADD c08, t4, c08
+ unop
+ MUL a4, b1, t4
+ LD b1, 0 * SIZE(BO)
+
+ ADD c01, t1, c01
+ unop
+ MUL a1, b2, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD c02, t2, c02
+ unop
+ MUL a2, b2, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD c03, t3, c03
+ unop
+ MUL a3, b2, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD c04, t4, c04
+ MUL a4, b2, t4
+ LD a4, 3 * SIZE(AO)
+ ldi AO, 4 * SIZE(AO)
+
+ ADD c05, t1, c05
+ LD b2, 1 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi BO, 2 * SIZE(BO)
+ .align 4
+
+$L58:
+ ADD c06, t2, c06
+ unop
+ MUL a2, b1, t2
+#ifndef TRMMKERNEL
+ LD c09, 0 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD c07, t3, c07
+ unop
+ MUL a3, b1, t3
+#ifndef TRMMKERNEL
+ LD c10, 1 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD c08, t4, c08
+ unop
+ MUL a4, b1, t4
+#ifndef TRMMKERNEL
+ LD c11, 2 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD c01, t1, c01
+ unop
+ MUL a1, b2, t1
+#ifndef TRMMKERNEL
+ LD c12, 3 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD c02, t2, c02
+ unop
+ MUL a2, b2, t2
+#ifndef TRMMKERNEL
+ LD c13, 0 * SIZE(C2)
+ unop
+#endif
+
+ ADD c03, t3, c03
+ unop
+ MUL a3, b2, t3
+#ifndef TRMMKERNEL
+ LD c14, 1 * SIZE(C2)
+#else
+ unop
+#endif
+
+ ADD c04, t4, c04
+ unop
+ MUL a4, b2, t4
+#ifndef TRMMKERNEL
+ LD c15, 2 * SIZE(C2)
+#else
+ unop
+#endif
+
+ ADD c05, t1, c05
+ unop
+ MUL alpha, c01, c01
+#ifndef TRMMKERNEL
+ LD c16, 3 * SIZE(C2)
+#else
+ unop
+#endif
+
+ ADD c06, t2, c06
+ ldi I, -1(I)
+ MUL alpha, c02, c02
+ unop
+
+ ADD c07, t3, c07
+ MUL alpha, c03, c03
+ ADD c08, t4, c08
+ MUL alpha, c04, c04
+
+ MUL alpha, c05, c05
+#ifndef TRMMKERNEL
+ ADD c01, c09, c01
+#endif
+ MUL alpha, c06, c06
+#ifndef TRMMKERNEL
+ ADD c02, c10, c02
+#endif
+
+ MUL alpha, c07, c07
+#ifndef TRMMKERNEL
+ ADD c03, c11, c03
+#endif
+ MUL alpha, c08, c08
+#ifndef TRMMKERNEL
+ ADD c04, c12, c04
+#endif
+
+#ifndef TRMMKERNEL
+ ADD c05, c13, c05
+#endif
+ ST c01, 0 * SIZE(C1)
+#ifndef TRMMKERNEL
+ ADD c06, c14, c06
+#endif
+ ST c02, 1 * SIZE(C1)
+
+#ifndef TRMMKERNEL
+ ADD c07, c15, c07
+#endif
+ ST c03, 2 * SIZE(C1)
+#ifndef TRMMKERNEL
+ ADD c08, c16, c08
+#endif
+ ST c04, 3 * SIZE(C1)
+
+ ST c05, 0 * SIZE(C2)
+ fclr t1
+ ST c06, 1 * SIZE(C2)
+ fclr t2
+ ST c07, 2 * SIZE(C2)
+ fclr t3
+ ST c08, 3 * SIZE(C2)
+ fclr t4
+
+ ldi C1, 4 * SIZE(C1)
+ ldi C2, 4 * SIZE(C2)
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+ subl K, KK, TMP1
+#ifdef LEFT
+ subl TMP1, 4, TMP1
+#else
+ subl TMP1, 2, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ addl KK, 4, KK
+#endif
+ bgt I, $L51
+ .align 4
+
+$L60:
+ and M, 2, I
+ ble I, $L70
+
+#if !defined(TRMMKERNEL) || \
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+
+#ifdef TRMMKERNEL
+#ifdef LEFT
+ addl KK, 2, TMP1
+#else
+ addl KK, 2, TMP1
+#endif
+#endif
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(B)
+#ifndef TRMMKERNEL
+ ldi L, -2(K)
+#else
+ ldi L, -2(TMP1)
+#endif
+ LD b2, 1 * SIZE(B)
+ ldi AO, 2 * SIZE(AO)
+
+ LD b3, 2 * SIZE(B)
+ LD b4, 3 * SIZE(B)
+ ldi BO, 2 * SIZE(B)
+ ble L, $L65
+#else
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl AO, TMP1, AO
+ sll KK, BASE_SHIFT + 1, TMP2
+ addl B, TMP2, BO
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(BO)
+ ldi L, -2(TMP1)
+ LD b2, 1 * SIZE(BO)
+ ldi AO, 2 * SIZE(AO)
+
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+ ldi BO, 2 * SIZE(BO)
+ ble L, $L65
+#endif
+ .align 4
+
+$L62:
+ ADD c01, t1, c01
+ unop
+ MUL a1, b1, t1
+ unop
+
+ ADD c02, t2, c02
+ ldi AO, 4 * SIZE(AO)
+ MUL a2, b1, t2
+ LD b1, 2 * SIZE(BO)
+
+ ADD c05, t3, c05
+ ldi L, -2(L)
+ MUL a1, b2, t3
+ LD a1, -2 * SIZE(AO)
+
+ ADD c06, t4, c06
+ unop
+ MUL a2, b2, t4
+ LD a2, -1 * SIZE(AO)
+
+ ADD c01, t1, c01
+ LD b2, 3 * SIZE(BO)
+ MUL a3, b3, t1
+ ldi BO, 4 * SIZE(BO)
+
+ ADD c02, t2, c02
+ unop
+ MUL a4, b3, t2
+ LD b3, 0 * SIZE(BO)
+
+ ADD c05, t3, c05
+ unop
+ MUL a3, b4, t3
+ LD a3, 0 * SIZE(AO)
+
+ ADD c06, t4, c06
+ MUL a4, b4, t4
+ LD b4, 1 * SIZE(BO)
+ unop
+
+ LD a4, 1 * SIZE(AO)
+ unop
+ unop
+ bgt L, $L62
+ .align 4
+
+$L65:
+ ADD c01, t1, c01
+ fldd alpha, ALPHA
+ MUL a1, b1, t1
+#ifndef TRMMKERNEL
+ blbs K, $L68
+#else
+ blbs TMP1, $L68
+#endif
+ .align 4
+
+ ADD c02, t2, c02
+ unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD c05, t3, c05
+ ldi BO, 2 * SIZE(BO)
+ MUL a1, b2, t3
+ LD a1, 0 * SIZE(AO)
+
+ ADD c06, t4, c06
+ unop
+ MUL a2, b2, t4
+ LD a2, 1 * SIZE(AO)
+
+ ADD c01, t1, c01
+ LD b2, -1 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi AO, 2 * SIZE(AO)
+ .align 4
+
+$L68:
+ ADD c02, t2, c02
+ unop
+ MUL a2, b1, t2
+#ifndef TRMMKERNEL
+ LD c09, 0 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD c05, t3, c05
+ unop
+ MUL a1, b2, t3
+#ifndef TRMMKERNEL
+ LD c10, 1 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD c06, t4, c06
+ unop
+ MUL a2, b2, t4
+#ifndef TRMMKERNEL
+ LD c11, 0 * SIZE(C2)
+#else
+ unop
+#endif
+
+ ADD c01, t1, c01
+ unop
+ MUL alpha, c01, c01
+#ifndef TRMMKERNEL
+ LD c12, 1 * SIZE(C2)
+#else
+ unop
+#endif
+
+ ADD c02, t2, c02
+ ldi C1, 2 * SIZE(C1)
+ MUL alpha, c02, c02
+ ldi C2, 2 * SIZE(C2)
+
+ ADD c05, t3, c05
+ MUL alpha, c05, c05
+ ADD c06, t4, c06
+ MUL alpha, c06, c06
+
+#ifndef TRMMKERNEL
+ ADD c01, c09, c01
+ ADD c02, c10, c02
+ ADD c05, c11, c05
+ ADD c06, c12, c06
+#endif
+
+ ST c01, -2 * SIZE(C1)
+ fclr t1
+ ST c02, -1 * SIZE(C1)
+ fclr t2
+ ST c05, -2 * SIZE(C2)
+ fclr t3
+ ST c06, -1 * SIZE(C2)
+ fclr t4
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+ subl K, KK, TMP1
+#ifdef LEFT
+ subl TMP1, 2, TMP1
+#else
+ subl TMP1, 2, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ addl KK, 2, KK
+#endif
+ .align 4
+
+$L70:
+ and M, 1, I
+ ble I, $L79
+
+#if !defined(TRMMKERNEL) || \
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+
+#ifdef TRMMKERNEL
+#ifdef LEFT
+ addl KK, 1, TMP1
+#else
+ addl KK, 2, TMP1
+#endif
+#endif
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+
+ LD b1, 0 * SIZE(B)
+ fclr c02
+ LD b2, 1 * SIZE(B)
+ fclr c06
+
+#ifndef TRMMKERNEL
+ ldi L, -2(K)
+#else
+ ldi L, -2(TMP1)
+#endif
+
+ LD b3, 2 * SIZE(B)
+ ldi AO, 1 * SIZE(AO)
+ LD b4, 3 * SIZE(B)
+ ldi BO, 2 * SIZE(B)
+ ble L, $L75
+#else
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl AO, TMP1, AO
+ sll KK, BASE_SHIFT + 1, TMP2
+ addl B, TMP2, BO
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+
+ LD b1, 0 * SIZE(BO)
+ fclr c02
+ LD b2, 1 * SIZE(BO)
+ fclr c06
+
+#ifndef TRMMKERNEL
+ ldi L, -2(K)
+#else
+ ldi L, -2(TMP1)
+#endif
+
+ LD b3, 2 * SIZE(BO)
+ ldi AO, 1 * SIZE(AO)
+ LD b4, 3 * SIZE(BO)
+ ldi BO, 2 * SIZE(BO)
+ ble L, $L75
+#endif
+ .align 4
+
+$L72:
+ ADD c01, t1, c01
+ ldi L, -2(L)
+ MUL a1, b1, t1
+ LD b1, 2 * SIZE(BO)
+
+ ADD c05, t2, c05
+ MUL a1, b2, t2
+ LD a1, 1 * SIZE(AO)
+ LD b2, 3 * SIZE(BO)
+
+ ADD c02, t3, c02
+ ldi AO, 2 * SIZE(AO)
+ MUL a2, b3, t3
+ LD b3, 4 * SIZE(BO)
+
+ ADD c06, t4, c06
+ MUL a2, b4, t4
+ LD a2, 0 * SIZE(AO)
+ LD b4, 5 * SIZE(BO)
+
+ ldi BO, 4 * SIZE(BO)
+ unop
+ unop
+ bgt L, $L72
+ .align 4
+
+$L75:
+ ADD c01, t1, c01
+ fldd alpha, ALPHA
+ MUL a1, b1, t1
+#ifndef TRMMKERNEL
+ blbs K, $L78
+#else
+ blbs TMP1, $L78
+#endif
+ .align 4
+
+ ADD c05, t2, c05
+ MUL a1, b2, t2
+ LD a1, 0 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+
+ ADD c01, t1, c01
+ LD b2, 1 * SIZE(BO)
+ ldi AO, 1 * SIZE(AO)
+ MUL a1, b1, t1
+ ldi BO, 2 * SIZE(BO)
+ .align 4
+
+$L78:
+ ADD c05, t2, c05
+ MUL a1, b2, t2
+#ifndef TRMMKERNEL
+ LD a5, 0 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD c02, t3, c02
+ ADD c06, t4, c06
+#ifndef TRMMKERNEL
+ LD b5, 0 * SIZE(C2)
+#else
+ unop
+#endif
+
+ ADD c01, c02, c01
+ ADD c05, c06, c05
+
+ ADD c01, t1, c01
+ ADD c05, t2, c05
+
+ MUL alpha, c01, c01
+ MUL alpha, c05, c05
+
+#ifndef TRMMKERNEL
+ ADD c01, a5, c01
+ ADD c05, b5, c05
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c05, 0 * SIZE(C2)
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+ subl K, KK, TMP1
+#ifdef LEFT
+ subl TMP1, 1, TMP1
+#else
+ subl TMP1, 2, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ addl KK, 1, KK
+#endif
+ .align 4
+
+$L79:
+ mov BO, B
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addl KK, 2, KK
+#else
+ unop
+#endif
+ unop
+ unop
+ .align 4
+
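+# N remainder: the last single column (N & 1).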
+$L80:
+ and N, 1, J
+ ble J, $L999
+
+ mov C, C1
+ mov A, AO
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mov OFFSET, KK
+#endif
+
+ sra M, 2, I
+ ble I, $L100
+ .align 4
+
+$L91:
+#if !defined(TRMMKERNEL) || \
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+
+#ifdef TRMMKERNEL
+#ifdef LEFT
+ addl KK, 4, TMP1
+#else
+ addl KK, 1, TMP1
+#endif
+#endif
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c02
+ LD b3, 2 * SIZE(B)
+ fclr c03
+ LD b4, 3 * SIZE(B)
+ fclr c04
+
+#ifndef TRMMKERNEL
+ sra K, 2, L
+#else
+ sra TMP1, 2, L
+#endif
+ mov B, BO
+ unop
+ ble L, $L95
+#else
+ sll KK, BASE_SHIFT + 2, TMP1
+ addl AO, TMP1, AO
+ sll KK, BASE_SHIFT + 0, TMP2
+ addl B, TMP2, BO
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c02
+ LD b3, 2 * SIZE(BO)
+ fclr c03
+ LD b4, 3 * SIZE(BO)
+ fclr c04
+
+#ifndef TRMMKERNEL
+ sra K, 2, L
+#else
+ sra TMP1, 2, L
+#endif
+ unop
+ ble L, $L95
+#endif
+ .align 5
+
+$L92:
+ ADD c01, t1, c01
+ unop
+ MUL a1, b1, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD c02, t2, c02
+ ldi L, -1(L)
+ MUL a2, b1, t2
+ LD a2, 5 * SIZE(AO)
+
+ ADD c03, t3, c03
+ unop
+ MUL a3, b1, t3
+ LD a3, 6 * SIZE(AO)
+
+ ADD c04, t4, c04
+ MUL a4, b1, t4
+ LD a4, 7 * SIZE(AO)
+ LD b1, 4 * SIZE(BO)
+
+ ADD c01, t1, c01
+ unop
+ MUL a1, b2, t1
+ LD a1, 8 * SIZE(AO)
+
+ ADD c02, t2, c02
+ unop
+ MUL a2, b2, t2
+ LD a2, 9 * SIZE(AO)
+
+ ADD c03, t3, c03
+ unop
+ MUL a3, b2, t3
+ LD a3, 10 * SIZE(AO)
+
+ ADD c04, t4, c04
+ MUL a4, b2, t4
+ LD a4, 11 * SIZE(AO)
+ LD b2, 5 * SIZE(BO)
+
+ ADD c01, t1, c01
+ unop
+ MUL a1, b3, t1
+ LD a1, 12 * SIZE(AO)
+
+ ADD c02, t2, c02
+ unop
+ MUL a2, b3, t2
+ LD a2, 13 * SIZE(AO)
+
+ ADD c03, t3, c03
+ unop
+ MUL a3, b3, t3
+ LD a3, 14 * SIZE(AO)
+
+ ADD c04, t4, c04
+ MUL a4, b3, t4
+ LD a5, 15 * SIZE(AO)
+ LD b3, 6 * SIZE(BO)
+
+ ADD c01, t1, c01
+ MUL a1, b4, t1
+ LD a1, 16 * SIZE(AO)
+ ldi AO, 16 * SIZE(AO)
+
+ ADD c02, t2, c02
+ ldi BO, 4 * SIZE(BO)
+ MUL a2, b4, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD c03, t3, c03
+ LD a4, 3 * SIZE(AO)
+ MUL a3, b4, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD c04, t4, c04
+ MUL a5, b4, t4
+ LD b4, 3 * SIZE(BO)
+ bgt L, $L92
+ .align 4
+
+$L95:
+#ifndef TRMMKERNEL
+ and K, 3, L
+#else
+ and TMP1, 3, L
+#endif
+ fldd alpha, ALPHA
+ unop
+ ble L, $L98
+ .align 4
+
+$L96:
+ ADD c01, t1, c01
+ ldi L, -1(L)
+ MUL a1, b1, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD c02, t2, c02
+ ldi BO, 1 * SIZE(BO)
+ MUL a2, b1, t2
+ LD a2, 5 * SIZE(AO)
+
+ ADD c03, t3, c03
+ unop
+ MUL a3, b1, t3
+ LD a3, 6 * SIZE(AO)
+
+ ADD c04, t4, c04
+ MUL a4, b1, t4
+ LD a4, 7 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+
+ ldi AO, 4 * SIZE(AO)
+ bgt L, $L96
+ .align 4
+
+$L98:
+#ifndef TRMMKERNEL
+ ADD c01, t1, c01
+ LD c05, 0 * SIZE(C1)
+ ADD c02, t2, c02
+ LD c06, 1 * SIZE(C1)
+ ADD c03, t3, c03
+ LD c07, 2 * SIZE(C1)
+ ADD c04, t4, c04
+ LD c08, 3 * SIZE(C1)
+#else
+ ADD c01, t1, c01
+ ADD c02, t2, c02
+ ADD c03, t3, c03
+ ADD c04, t4, c04
+#endif
+
+ MUL alpha, c01, c01
+ MUL alpha, c02, c02
+ MUL alpha, c03, c03
+ MUL alpha, c04, c04
+
+#ifndef TRMMKERNEL
+ ADD c01, c05, c01
+ ADD c02, c06, c02
+ ADD c03, c07, c03
+ ADD c04, c08, c04
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c03, 2 * SIZE(C1)
+ ST c04, 3 * SIZE(C1)
+
+ ldi C1, 4 * SIZE(C1)
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+ subl K, KK, TMP1
+#ifdef LEFT
+ subl TMP1, 4, TMP1
+#else
+ subl TMP1, 1, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ addl KK, 4, KK
+#endif
+
+ ldi I, -1(I)
+ bgt I, $L91
+ .align 4
+
+$L100:
+ and M, 2, I
+ unop
+ unop
+ ble I, $L110
+ .align 4
+
+$L101:
+#if !defined(TRMMKERNEL) || \
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+
+#ifdef TRMMKERNEL
+#ifdef LEFT
+ addl KK, 2, TMP1
+#else
+ addl KK, 1, TMP1
+#endif
+#endif
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c02
+ LD b3, 2 * SIZE(B)
+ fclr c03
+ LD b4, 3 * SIZE(B)
+ fclr c04
+
+#ifndef TRMMKERNEL
+ sra K, 2, L
+#else
+ sra TMP1, 2, L
+#endif
+ mov B, BO
+ unop
+ ble L, $L105
+#else
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl AO, TMP1, AO
+ sll KK, BASE_SHIFT + 0, TMP2
+ addl B, TMP2, BO
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c02
+ LD b3, 2 * SIZE(BO)
+ fclr c03
+ LD b4, 3 * SIZE(BO)
+ fclr c04
+
+#ifndef TRMMKERNEL
+ sra K, 2, L
+#else
+ sra TMP1, 2, L
+#endif
+ unop
+ ble L, $L105
+#endif
+ .align 5
+
+$L102:
+ ADD c01, t1, c01
+ ldi L, -1(L)
+ MUL a1, b1, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD c02, t2, c02
+ MUL a2, b1, t2
+ LD a2, 5 * SIZE(AO)
+ LD b1, 4 * SIZE(BO)
+
+ ADD c03, t3, c03
+ ldi BO, 4 * SIZE(BO)
+ MUL a3, b2, t3
+ LD a3, 6 * SIZE(AO)
+
+ ADD c04, t4, c04
+ MUL a4, b2, t4
+ LD a5, 7 * SIZE(AO)
+ LD b2, 1 * SIZE(BO)
+
+ ADD c01, t1, c01
+ MUL a1, b3, t1
+ LD a1, 8 * SIZE(AO)
+ ldi AO, 8 * SIZE(AO)
+
+ ADD c02, t2, c02
+ MUL a2, b3, t2
+ LD b3, 2 * SIZE(BO)
+ LD a2, 1 * SIZE(AO)
+
+ ADD c03, t3, c03
+ LD a4, 3 * SIZE(AO)
+ MUL a3, b4, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD c04, t4, c04
+ MUL a5, b4, t4
+ LD b4, 3 * SIZE(BO)
+ bgt L, $L102
+ .align 4
+
+$L105:
+#ifndef TRMMKERNEL
+ and K, 3, L
+#else
+ and TMP1, 3, L
+#endif
+ fldd alpha, ALPHA
+#ifndef TRMMKERNEL
+ LD a3, 0 * SIZE(C1)
+ LD a4, 1 * SIZE(C1)
+#endif
+ ble L, $L108
+ .align 4
+
+$L106:
+ ADD c01, t1, c01
+ ldi L, -1(L)
+ MUL a1, b1, t1
+ LD a1, 2 * SIZE(AO)
+
+ ADD c02, t2, c02
+ MUL a2, b1, t2
+ LD a2, 3 * SIZE(AO)
+ LD b1, 1 * SIZE(BO)
+
+ ldi AO, 2 * SIZE(AO)
+ unop
+ ldi BO, 1 * SIZE(BO)
+ bgt L, $L106
+ .align 4
+
+$L108:
+ ADD c01, t1, c01
+ fclr t1
+ ADD c02, t2, c02
+ fclr t2
+ ADD c03, t3, c03
+ fclr t3
+ ADD c04, t4, c04
+ fclr t4
+
+ ADD c01, c03, c01
+ ADD c02, c04, c02
+
+ MUL alpha, c01, c01
+ MUL alpha, c02, c02
+
+#ifndef TRMMKERNEL
+ ADD c01, a3, c01
+ ADD c02, a4, c02
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ldi C1, 2 * SIZE(C1)
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+ subl K, KK, TMP1
+#ifdef LEFT
+ subl TMP1, 2, TMP1
+#else
+ subl TMP1, 1, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ addl KK, 2, KK
+#endif
+ .align 4
+
+$L110:
+ and M, 1, I
+ ble I, $L999
+ .align 4
+
+$L111:
+#if !defined(TRMMKERNEL) || \
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+
+#ifdef TRMMKERNEL
+#ifdef LEFT
+ addl KK, 1, TMP1
+#else
+ addl KK, 1, TMP1
+#endif
+#endif
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c02
+ LD b3, 2 * SIZE(B)
+ fclr c03
+ LD b4, 3 * SIZE(B)
+ fclr c04
+
+#ifndef TRMMKERNEL
+ sra K, 2, L
+#else
+ sra TMP1, 2, L
+#endif
+ mov B, BO
+ unop
+ ble L, $L115
+#else
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl AO, TMP1, AO
+ sll KK, BASE_SHIFT + 0, TMP2
+ addl B, TMP2, BO
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c02
+ LD b3, 2 * SIZE(BO)
+ fclr c03
+ LD b4, 3 * SIZE(BO)
+ fclr c04
+
+#ifndef TRMMKERNEL
+ sra K, 2, L
+#else
+ sra TMP1, 2, L
+#endif
+ unop
+ ble L, $L115
+#endif
+ .align 4
+
+$L112:
+ ADD c01, t1, c01
+ MUL a1, b1, t1
+ LD a1, 4 * SIZE(AO)
+ LD b1, 4 * SIZE(BO)
+
+ ADD c02, t2, c02
+ MUL a2, b2, t2
+ LD a2, 5 * SIZE(AO)
+ LD b2, 5 * SIZE(BO)
+
+ ADD c03, t3, c03
+ MUL a3, b3, t3
+ LD a3, 6 * SIZE(AO)
+ LD b3, 6 * SIZE(BO)
+
+ ADD c04, t4, c04
+ MUL a4, b4, t4
+ LD a4, 7 * SIZE(AO)
+ LD b4, 7 * SIZE(BO)
+
+ ldi L, -1(L)
+ ldi AO, 4 * SIZE(AO)
+ ldi BO, 4 * SIZE(BO)
+ bgt L, $L112
+ .align 4
+
+$L115:
+#ifndef TRMMKERNEL
+ and K, 3, L
+#else
+ and TMP1, 3, L
+#endif
+ fldd alpha, ALPHA
+#ifndef TRMMKERNEL
+ LD a2, 0 * SIZE(C1)
+#endif
+ ble L, $L118
+ .align 4
+
+$L116:
+ ADD c01, t1, c01
+ MUL a1, b1, t1
+ LD a1, 1 * SIZE(AO)
+ LD b1, 1 * SIZE(BO)
+
+ ldi L, -1(L)
+ ldi AO, 1 * SIZE(AO)
+ ldi BO, 1 * SIZE(BO)
+ bgt L, $L116
+ .align 4
+
+$L118:
+ ADD c01, t1, c01
+ ADD c02, t2, c02
+ ADD c03, t3, c03
+ ADD c04, t4, c04
+
+ ADD c01, c02, c01
+ ADD c03, c04, c03
+ ADD c01, c03, c01
+
+ MUL alpha, c01, c01
+#ifndef TRMMKERNEL
+ ADD c01, a2, c01
+#endif
+ ST c01, 0 * SIZE(C1)
+ .align 4
+
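+# Epilogue: restore the callee-saved floating-point registers and return.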
+$L999:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+ clr $0
+ ldi $sp, STACKSIZE($sp)
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/gemm_kernel_simd_16x4.S b/kernel/sw_64/gemm_kernel_simd_16x4.S
new file mode 100644
index 0000000..1acf679
--- /dev/null
+++ b/kernel/sw_64/gemm_kernel_simd_16x4.S
@@ -0,0 +1,4054 @@
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#if !defined(SW2B)
+#error "Architecture is not specified."
+#endif
+
+
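+# SIMD GEMM/TRMM kernel: C = alpha*A*B (+ C in the GEMM path), computed in
+# 16x4 blocks with smaller edge cases. N is unrolled by 4/2/1, M by 16/8/4/2/1,
+# and K by 2 in the panel loops. C is accessed with aligned (VLD/VST) or
+# unaligned (VLD_UL/VLD_UH, VST_UL/VST_UH) sequences depending on alignment.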
+#define STACKSIZE 336
+
+#define CO $1
+#define C1 $2
+#define C2 $3
+#define C3 $4
+
+#define LDM $5
+
+#define PREB $7
+#define SPANA $8
+#define SPANB $9
+#define NC1 $10
+#define KC1 $11
+#define MC1 $12
+#define PREA $13
+
+#define A $20
+#define B $21
+#define C $19
+#define MC $16
+#define NC $17
+#define KC $18
+
+#define A1 $22
+#define B1 $23
+
+#define ALPHA $f8
+
+#define a0 $f0
+#define a4 $f1
+#define a8 $f2
+#define a12 $f3
+
+#define b0 $f4
+#define b1 $f5
+#define b2 $f6
+#define b3 $f7
+
+#define na0 $f0
+#define na4 $f8
+#define na8 $f9
+#define na12 $f10
+
+#define nb0 $f11
+#define nb1 $f12
+#define nb2 $f13
+#define nb3 $f14
+
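+# t00..t15: vector accumulators holding the 16x4 result tile (4 elements each).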
+#define t00 $f15
+#define t01 $f16
+#define t02 $f17
+#define t03 $f18
+#define t04 $f19
+#define t05 $f20
+#define t06 $f21
+#define t07 $f22
+#define t08 $f23
+#define t09 $f24
+#define t10 $f25
+#define t11 $f26
+#define t12 $f27
+#define t13 $f28
+#define t14 $f29
+#define t15 $f30
+
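+# c00..c15: temporaries for the C tile during write-back; they reuse the
+# a*/b* registers (and c12..c15 reuse c00..c03), so they are only live there.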
+#define c00 $f1
+#define c01 $f2
+#define c02 $f3
+#define c03 $f4
+
+#define c04 $f5
+#define c05 $f6
+#define c06 $f7
+#define c07 $f9
+
+#define c08 $f10
+#define c09 $f11
+#define c10 $f12
+#define c11 $f13
+
+#define c12 $f1
+#define c13 $f2
+#define c14 $f3
+#define c15 $f4
+
+#if defined(TRMMKERNEL)
+#define TEMP $14
+#define KK $24
+#define OFFSET $25
+#endif
+
+ PROLOGUE
+ PROFCODE
+
+ .frame $30,STACKSIZE,$26,0
+ ldi $sp,-STACKSIZE($sp) # allocate the stack frame
+
+ stl $9,328($sp) # save integer callee-saved registers
+ stl $10,320($sp)
+ stl $11,312($sp)
+ stl $12,304($sp)
+ stl $13,296($sp)
+ stl $14,288($sp)
+
+
+ ST $f2,280($sp) # save floating-point callee-saved registers
+ ST $f3,272($sp)
+ ST $f4,264($sp)
+ ST $f5,256($sp)
+ ST $f6,248($sp)
+ ST $f7,240($sp)
+ ST $f8,232($sp)
+ ST $f9,224($sp)
+
+
+
+ .align 5
+
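+# Outer loop over N in blocks of 4 columns (NC = NC1/4). CO/C1/C2/C3 address
+# the four output columns; PREA/PREB are software-prefetch cursors into A/B.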
+$Begin_NC_Unroll4:
+ ldl C, 0 + STACKSIZE($sp) # load C
+ ldl LDM, 8 + STACKSIZE($sp) # load ldm
+
+#ifdef TRMMKERNEL
+ ldl OFFSET, 16 + STACKSIZE($sp) # load offset
+ nop
+#endif
+
+ ST $f19, 192($sp) # store alpha
+ SXADDQ LDM, 0, LDM # LDM = LDM*SIZE (leading dimension in bytes)
+
+ mov NC, NC1 # backup nc
+ mov KC, KC1 # backup kc
+ mov MC, MC1 # backup mc
+
+ mov B, B1 # backup the initial address of b
+ sra NC1,2,NC # NC=NC1/4 Unroll N 4
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ subl $31, OFFSET, KK # KK = -OFFSET when multiplying from the right (trmm)
+ nop
+#endif
+
+ mov A, A1 # backup the initial address of a
+ sll KC1,1+BASE_SHIFT,SPANB # kc*2nr
+
+ sll KC1,4+BASE_SHIFT,SPANA # kc*16mr
+ beq NC,$Begin_NC_Unroll2
+
+
+ .align 5
+
+.L0:
+ sra MC1,4,MC # MC=MC1/16
+ mov C, CO # compute c pointer
+
+ addl B1,SPANB,PREB # prefetch B
+ addl A1,SPANA,PREA # prefetch A
+
+ addl C, LDM, C1
+ addl C1,LDM, C2
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mov OFFSET,KK # Reset the left offset
+ nop
+#endif
+
+ subl PREA,16*SIZE,PREA # prea=kc1*mc-mc
+ addl C2,LDM, C3
+
+ s4addl LDM,C,C # C=ldm*4+C
+ beq MC,.L15 # MC=0:MC1<16
+
+
+ .align 5 # nr=4,mr=16----------------------------
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov B1, B # LL && RU reset B
+ nop
+#else
+ sll KK, 4 + BASE_SHIFT, KC # KK*16
+ sll KK, 2 + BASE_SHIFT, TEMP # KK*4
+
+ addl A, KC, A # advance A to the start of the data part
+ addl B1,TEMP,B # advance B to the start of the data part
+#endif
+
+ vcpys $f31,$f31,t00 # CLEAR Results Register
+ fillcs 0(CO) # prefetch C
+ fillcs 0(C1)
+
+ vcpys $f31,$f31,t01 # 64 results
+ fillcs 0(C2)
+ fillcs 0(C3)
+
+ vcpys $f31,$f31,t02
+ LDDE b0,0*SIZE(B)
+ LDDE b1,1*SIZE(B)
+
+ vcpys $f31,$f31,t03
+ LDDE b2,2*SIZE(B)
+ LDDE b3,3*SIZE(B)
+
+ vcpys $f31,$f31,t04
+ fillcs 4(CO) # prefetch C
+ fillcs 4(C1)
+
+ vcpys $f31,$f31,t05
+ fillcs 4(C2)
+ fillcs 4(C3)
+
+ vcpys $f31,$f31,t06
+ VLD a0, 0*SIZE(A)
+ VLD a4, 4*SIZE(A)
+
+ vcpys $f31,$f31,t07
+ VLD a8, 8*SIZE(A)
+ VLD a12,12*SIZE(A)
+
+ vcpys $f31,$f31,t08
+ fillcs 8*SIZE(CO)
+ fillcs 8*SIZE(C1)
+
+ vcpys $f31,$f31,t09
+ fillcs 8*SIZE(C2)
+ fillcs 8*SIZE(C3)
+
+ vcpys $f31,$f31,t10
+ fillcs 12*SIZE(CO)
+ fillcs 12*SIZE(C1)
+
+ vcpys $f31,$f31,t11
+ fillcs 12*SIZE(C2)
+ fillcs 12*SIZE(C3)
+
+ vcpys $f31,$f31,t12
+ vcpys $f31,$f31,t13
+ vcpys $f31,$f31,t14
+ vcpys $f31,$f31,t15
+
+
+#if (defined(LEFT) && !defined(TRANSA)) \
+ ||(!defined(LEFT) && defined(TRANSA))
+ subl KC1, KK, TEMP # temp is the length of the data part
+#elif defined(LEFT)
+ addl KK, 16, TEMP # mr=16
+#else
+ addl KK, 4, TEMP # right nr=4
+#endif
+ sra TEMP, 1, KC # KC=TEMP/2
+
+ nop
+ beq KC, $Rest_16x4x1
+
+#else
+
+ vcpys $f31,$f31,t00 # CLEAR Results Register
+ mov B1,B # Reset B
+ sra KC1,1,KC # Unroll Kr=2, KC=KC1/2
+
+ vcpys $f31,$f31,t01 # 64 results
+ fillcs 0(CO) # prefetch C
+ fillcs 0(C1)
+
+ vcpys $f31,$f31,t02
+ fillcs 0(C2)
+ fillcs 0(C3)
+
+ vcpys $f31,$f31,t03
+ LDDE b0,0*SIZE(B)
+ LDDE b1,1*SIZE(B)
+
+ vcpys $f31,$f31,t04
+ LDDE b2,2*SIZE(B)
+ LDDE b3,3*SIZE(B)
+
+ vcpys $f31,$f31,t05
+ fillcs 4(CO) # prefetch C
+ fillcs 4(C1)
+
+ vcpys $f31,$f31,t06
+ fillcs 4(C2)
+ fillcs 4(C3)
+
+ vcpys $f31,$f31,t07
+ VLD a0, 0*SIZE(A)
+ VLD a4, 4*SIZE(A)
+
+ vcpys $f31,$f31,t08
+ VLD a8, 8*SIZE(A)
+ VLD a12,12*SIZE(A)
+
+ vcpys $f31,$f31,t09
+ fillcs 8(CO) # prefetch C
+ fillcs 8(C1)
+
+ vcpys $f31,$f31,t10
+ fillcs 8(C2)
+ fillcs 8(C3)
+
+ vcpys $f31,$f31,t11
+ fillcs 12*SIZE(CO)
+ fillcs 12*SIZE(C1)
+
+ vcpys $f31,$f31,t12
+ fillcs 12*SIZE(C2)
+ fillcs 12*SIZE(C3)
+
+ vcpys $f31,$f31,t13
+ vcpys $f31,$f31,t14
+
+ vcpys $f31,$f31,t15
+ beq KC,$Rest_16x4x1 # KC1<2 goto $Rest_16x4x1
+
+#endif
+
+ .align 5
+
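+# Main 16x4 panel loop, K unrolled by 2: 16 VMAD per k step (32 per iteration)
+# interleaved with the loads of the next A/B operands and prefetches.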
+$Panel_16x4x2: # nr=4,mr=16,kr=2------------------------
+
+ VMAD a0,b0,t00,t00
+ addl A,16*SIZE,A # 16a*1k
+ LDDE nb0,4*SIZE(B) # get next 4b
+
+ VMAD a0,b1,t04,t04
+ LDDE nb1,5*SIZE(B)
+
+ VMAD a4,b0,t01,t01
+ VLD na12,12*SIZE(A)
+
+ VMAD a4,b1,t05,t05
+ VLD na8,8*SIZE(A)
+
+ VMAD a0,b2,t08,t08
+ LDDE nb2,6*SIZE(B)
+
+ VMAD a0,b3,t12,t12
+ LDDE nb3,7*SIZE(B)
+
+ VMAD a8,b0,t02,t02
+ VMAD a8,b1,t06,t06
+
+ VMAD a4,b2,t09,t09
+ addl B,8*SIZE,B # 4b*2k
+ VLD na0,0*SIZE(A) # careful: na0 and a0 share the same register
+
+ VMAD a4,b3,t13,t13
+ VLD na4,4*SIZE(A) # get next 16a
+
+ VMAD a12,b0,t03,t03
+ VMAD a12,b1,t07,t07
+
+ VMAD a8,b2,t10,t10
+ fillcs 0(PREB)
+
+ VMAD a8,b3,t14,t14
+ fillcs 0(PREA)
+
+ VMAD a12,b2,t11,t11
+ fillcs 8*SIZE(PREA)
+
+ VMAD a12,b3,t15,t15
+ subl KC,1,KC # loop k --
+
+
+ VMAD na12,nb0,t03,t03
+ addl A,16*SIZE,A # ### next k ###
+ LDDE b0,0(B) # get 3rd 4b
+
+ VMAD na12,nb1,t07,t07
+ LDDE b1,1*SIZE(B)
+
+ VMAD na8,nb0,t02,t02
+ VLD a12,12*SIZE(A)
+
+ VMAD na8,nb1,t06,t06
+ VLD a8,8*SIZE(A)
+
+ VMAD na0,nb0,t00,t00
+ subl PREA,16*SIZE,PREA # prea-=16
+ LDDE b2,2*SIZE(B)
+
+ VMAD na0,nb1,t04,t04
+ LDDE b3,3*SIZE(B)
+
+ VMAD na12,nb2,t11,t11
+ VMAD na12,nb3,t15,t15
+ VMAD na8,nb2,t10,t10
+ VMAD na8,nb3,t14,t14
+
+ VMAD na0,nb2,t08,t08
+ fillcs 0(PREA)
+
+ VMAD na0,nb3,t12,t12
+ fillcs 4*SIZE(PREB)
+
+ VMAD na4,nb0,t01,t01
+ VLD a0,0(A) # get 3rd 16a
+
+ VMAD na4,nb1,t05,t05
+ VLD a4,4*SIZE(A)
+
+ VMAD na4,nb2,t09,t09
+ fillcs 8*SIZE(PREA)
+ addl PREB,8*SIZE,PREB # preb+=8
+
+ VMAD na4,nb3,t13,t13
+ subl PREA,16*SIZE,PREA # prea-=16
+ bne KC,$Panel_16x4x2
+
+
+$Rest_16x4x1:
+ LDDE ALPHA, 192($sp) # get alpha
+#ifndef TRMMKERNEL
+ blbc KC1, $Write_16x4
+#else
+ blbc TEMP,$Write_16x4
+#endif
+
+ VMAD a0,b0,t00,t00
+ addl A,16*SIZE,A # 16a*1k
+
+ VMAD a0,b1,t04,t04
+ addl B,4*SIZE,B # 4b*1k
+
+ VMAD a0,b2,t08,t08
+ VMAD a0,b3,t12,t12
+
+
+ VMAD a4,b0,t01,t01
+ VMAD a4,b1,t05,t05
+ VMAD a4,b2,t09,t09
+ VMAD a4,b3,t13,t13
+
+ VMAD a8,b0,t02,t02
+ VMAD a8,b1,t06,t06
+ VMAD a8,b2,t10,t10
+ VMAD a8,b3,t14,t14
+
+ VMAD a12,b0,t03,t03
+ VMAD a12,b1,t07,t07
+ VMAD a12,b2,t11,t11
+ VMAD a12,b3,t15,t15
+
+
+ .align 5
+
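+# Write back the 16x4 block: the GEMM path loads C and applies C = alpha*T + C
+# with VMAD, the TRMM path only scales with VMUL; each of CO/C1/C2/C3 takes
+# the aligned or unaligned store path depending on its address.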
+$Write_16x4:
+
+#ifndef TRMMKERNEL
+ and CO, (VEC_LEN*SIZE-1), $6 ### gemm part ###
+ bne $6, $UnAlign_CO_Access_16x4
+
+$Align_CO_Access_16x4:
+ VLD c00,0(CO)
+ VLD c01,4*SIZE(CO)
+ VLD c02,8*SIZE(CO)
+ VLD c03,12*SIZE(CO)
+
+ VMAD t00,ALPHA,c00,t00
+ VMAD t01,ALPHA,c01,t01
+ VMAD t02,ALPHA,c02,t02
+ VMAD t03,ALPHA,c03,t03
+
+ VST t00,0(CO)
+ VST t01,4*SIZE(CO)
+ VST t02,8*SIZE(CO)
+ VST t03,12*SIZE(CO)
+ jmp $Access_C1_16x4
+
+$UnAlign_CO_Access_16x4:
+ VLD_UL c00, 0*VEC_LEN*SIZE(CO)
+ VLD_UH c04, 1*VEC_LEN*SIZE(CO)
+
+ VLD_UL c01, 1*VEC_LEN*SIZE(CO)
+ VLD_UH c05, 2*VEC_LEN*SIZE(CO)
+
+ vbisw c00,c04,c00
+ VLD_UL c02, 2*VEC_LEN*SIZE(CO)
+ VLD_UH c06, 3*VEC_LEN*SIZE(CO)
+
+ vbisw c01,c05,c01
+ VLD_UL c03, 3*VEC_LEN*SIZE(CO)
+ VLD_UH c07, 4*VEC_LEN*SIZE(CO)
+
+ vbisw c02,c06,c02
+ vbisw c03,c07,c03
+
+ VMAD t00,ALPHA,c00,t00
+ VMAD t01,ALPHA,c01,t01
+
+ VMAD t02,ALPHA,c02,t02
+ VST_UL t00, 0*VEC_LEN*SIZE(CO)
+ VST_UH t00, 1*VEC_LEN*SIZE(CO)
+
+ VMAD t03,ALPHA,c03,t03
+ VST_UL t01, 1*VEC_LEN*SIZE(CO)
+ VST_UH t01, 2*VEC_LEN*SIZE(CO)
+
+ VST_UL t02, 2*VEC_LEN*SIZE(CO)
+ VST_UH t02, 3*VEC_LEN*SIZE(CO)
+
+ VST_UL t03, 3*VEC_LEN*SIZE(CO)
+ VST_UH t03, 4*VEC_LEN*SIZE(CO)
+
+
+$Access_C1_16x4:
+ and C1, (VEC_LEN*SIZE-1),$6
+ bne $6,$UnAlign_C1_Access_16x4
+
+$Align_C1_Access_16x4:
+ VLD c04,0(C1)
+ VLD c05,4*SIZE(C1)
+ VLD c06,8*SIZE(C1)
+ VLD c07,12*SIZE(C1)
+
+ VMAD t04,ALPHA,c04,t04
+ VMAD t05,ALPHA,c05,t05
+ VMAD t06,ALPHA,c06,t06
+ VMAD t07,ALPHA,c07,t07
+
+ VST t04,0(C1)
+ VST t05,4*SIZE(C1)
+ VST t06,8*SIZE(C1)
+ VST t07,12*SIZE(C1)
+ jmp $Access_C2_16x4
+
+$UnAlign_C1_Access_16x4:
+ VLD_UL c04, 0*VEC_LEN*SIZE(C1)
+ VLD_UH t00, 1*VEC_LEN*SIZE(C1)
+
+ VLD_UL c05, 1*VEC_LEN*SIZE(C1)
+ VLD_UH t01, 2*VEC_LEN*SIZE(C1)
+
+ vbisw c04,t00,c04
+ VLD_UL c06, 2*VEC_LEN*SIZE(C1)
+ VLD_UH t02, 3*VEC_LEN*SIZE(C1)
+
+ vbisw c05,t01,c05
+ VLD_UL c07, 3*VEC_LEN*SIZE(C1)
+ VLD_UH t03, 4*VEC_LEN*SIZE(C1)
+
+ vbisw c06,t02,c06
+ vbisw c07,t03,c07
+
+ VMAD t04,ALPHA,c04,t04
+ VMAD t05,ALPHA,c05,t05
+
+ VMAD t06,ALPHA,c06,t06
+ VST_UL t04, 0*VEC_LEN*SIZE(C1)
+ VST_UH t04, 1*VEC_LEN*SIZE(C1)
+
+ VMAD t07,ALPHA,c07,t07
+ VST_UL t05, 1*VEC_LEN*SIZE(C1)
+ VST_UH t05, 2*VEC_LEN*SIZE(C1)
+
+ VST_UL t06, 2*VEC_LEN*SIZE(C1)
+ VST_UH t06, 3*VEC_LEN*SIZE(C1)
+
+ VST_UL t07, 3*VEC_LEN*SIZE(C1)
+ VST_UH t07, 4*VEC_LEN*SIZE(C1)
+
+
+$Access_C2_16x4:
+ and C2, (VEC_LEN*SIZE-1),$6
+ bne $6,$UnAlign_C2_Access_16x4
+
+$Align_C2_Access_16x4:
+ VLD c08,0(C2)
+ VLD c09,4*SIZE(C2)
+ VLD c10,8*SIZE(C2)
+ VLD c11,12*SIZE(C2)
+
+ VMAD t08,ALPHA,c08,t08
+ VMAD t09,ALPHA,c09,t09
+ VMAD t10,ALPHA,c10,t10
+ VMAD t11,ALPHA,c11,t11
+
+ VST t08,0(C2)
+ VST t09,4*SIZE(C2)
+ VST t10,8*SIZE(C2)
+ VST t11,12*SIZE(C2)
+ jmp $Access_C3_16x4
+
+$UnAlign_C2_Access_16x4:
+ VLD_UL c08, 0*VEC_LEN*SIZE(C2)
+ VLD_UH t00, 1*VEC_LEN*SIZE(C2)
+
+ VLD_UL c09, 1*VEC_LEN*SIZE(C2)
+ VLD_UH t01, 2*VEC_LEN*SIZE(C2)
+
+ vbisw c08,t00,c08
+ VLD_UL c10, 2*VEC_LEN*SIZE(C2)
+ VLD_UH t02, 3*VEC_LEN*SIZE(C2)
+
+ vbisw c09,t01,c09
+ VLD_UL c11, 3*VEC_LEN*SIZE(C2)
+ VLD_UH t03, 4*VEC_LEN*SIZE(C2)
+
+ vbisw c10,t02,c10
+ vbisw c11,t03,c11
+
+ VMAD t08,ALPHA,c08,t08
+ VMAD t09,ALPHA,c09,t09
+
+ VMAD t10,ALPHA,c10,t10
+ VST_UL t08, 0*VEC_LEN*SIZE(C2)
+ VST_UH t08, 1*VEC_LEN*SIZE(C2)
+
+ VMAD t11,ALPHA,c11,t11
+ VST_UL t09, 1*VEC_LEN*SIZE(C2)
+ VST_UH t09, 2*VEC_LEN*SIZE(C2)
+
+ VST_UL t10, 2*VEC_LEN*SIZE(C2)
+ VST_UH t10, 3*VEC_LEN*SIZE(C2)
+
+ VST_UL t11, 3*VEC_LEN*SIZE(C2)
+ VST_UH t11, 4*VEC_LEN*SIZE(C2)
+
+
+$Access_C3_16x4:
+ and C3, (VEC_LEN*SIZE-1),$6
+ bne $6,$UnAlign_C3_Access_16x4
+
+$Align_C3_Access_16x4:
+ VLD c12,0(C3)
+ VLD c13,4*SIZE(C3)
+ VLD c14,8*SIZE(C3)
+ VLD c15,12*SIZE(C3)
+
+ VMAD t12,ALPHA,c12,t12
+ VMAD t13,ALPHA,c13,t13
+ VMAD t14,ALPHA,c14,t14
+ VMAD t15,ALPHA,c15,t15
+
+ VST t12,0(C3)
+ VST t13,4*SIZE(C3)
+ VST t14,8*SIZE(C3)
+ VST t15,12*SIZE(C3)
+ jmp $End_NC_Unroll4
+
+$UnAlign_C3_Access_16x4:
+ VLD_UL c12, 0*VEC_LEN*SIZE(C3)
+ VLD_UH t04, 1*VEC_LEN*SIZE(C3)
+
+ VLD_UL c13, 1*VEC_LEN*SIZE(C3)
+ VLD_UH t05, 2*VEC_LEN*SIZE(C3)
+
+ vbisw c12,t04,c12
+ VLD_UL c14, 2*VEC_LEN*SIZE(C3)
+ VLD_UH t06, 3*VEC_LEN*SIZE(C3)
+
+ vbisw c13,t05,c13
+ VLD_UL c15, 3*VEC_LEN*SIZE(C3)
+ VLD_UH t07, 4*VEC_LEN*SIZE(C3)
+
+ vbisw c14,t06,c14
+ vbisw c15,t07,c15
+
+ VMAD t12,ALPHA,c12,t12
+ VMAD t13,ALPHA,c13,t13
+
+ VMAD t14,ALPHA,c14,t14
+ VST_UL t12, 0*VEC_LEN*SIZE(C3)
+ VST_UH t12, 1*VEC_LEN*SIZE(C3)
+
+ VMAD t15,ALPHA,c15,t15
+ VST_UL t13, 1*VEC_LEN*SIZE(C3)
+ VST_UH t13, 2*VEC_LEN*SIZE(C3)
+
+ VST_UL t14, 2*VEC_LEN*SIZE(C3)
+ VST_UH t14, 3*VEC_LEN*SIZE(C3)
+
+ VST_UL t15, 3*VEC_LEN*SIZE(C3)
+ VST_UH t15, 4*VEC_LEN*SIZE(C3)
+ jmp $End_NC_Unroll4
+
+#else
+ and CO, (VEC_LEN*SIZE-1),$6 ### trmm part ###
+ bne $6,$UnAlign_CO_Access_16x4
+
+$Align_CO_Access_16x4:
+ VMUL t00,ALPHA,t00
+ VMUL t01,ALPHA,t01
+ VMUL t02,ALPHA,t02
+ VMUL t03,ALPHA,t03
+
+ VST t00,0(CO)
+ VST t01,4*SIZE(CO)
+ VST t02,8*SIZE(CO)
+ VST t03,12*SIZE(CO)
+ jmp $Access_C1_16x4
+
+$UnAlign_CO_Access_16x4:
+ VMUL t00,ALPHA,t00
+ VMUL t01,ALPHA,t01
+
+ VMUL t02,ALPHA,t02
+ VST_UL t00, 0*VEC_LEN*SIZE(CO)
+ VST_UH t00, 1*VEC_LEN*SIZE(CO)
+
+ VMUL t03,ALPHA,t03
+ VST_UL t01, 1*VEC_LEN*SIZE(CO)
+ VST_UH t01, 2*VEC_LEN*SIZE(CO)
+
+ VST_UL t02, 2*VEC_LEN*SIZE(CO)
+ VST_UH t02, 3*VEC_LEN*SIZE(CO)
+
+ VST_UL t03, 3*VEC_LEN*SIZE(CO)
+ VST_UH t03, 4*VEC_LEN*SIZE(CO)
+
+
+$Access_C1_16x4:
+ and C1, (VEC_LEN*SIZE-1),$6
+ bne $6,$UnAlign_C1_Access_16x4
+
+$Align_C1_Access_16x4:
+ VMUL t04,ALPHA,t04
+ VMUL t05,ALPHA,t05
+ VMUL t06,ALPHA,t06
+ VMUL t07,ALPHA,t07
+
+ VST t04,0(C1)
+ VST t05,4*SIZE(C1)
+ VST t06,8*SIZE(C1)
+ VST t07,12*SIZE(C1)
+ jmp $Access_C2_16x4
+
+$UnAlign_C1_Access_16x4:
+ VMUL t04,ALPHA,t04
+ VMUL t05,ALPHA,t05
+
+ VMUL t06,ALPHA,t06
+ VST_UL t04, 0*VEC_LEN*SIZE(C1)
+ VST_UH t04, 1*VEC_LEN*SIZE(C1)
+
+ VMUL t07,ALPHA,t07
+ VST_UL t05, 1*VEC_LEN*SIZE(C1)
+ VST_UH t05, 2*VEC_LEN*SIZE(C1)
+
+ VST_UL t06, 2*VEC_LEN*SIZE(C1)
+ VST_UH t06, 3*VEC_LEN*SIZE(C1)
+
+ VST_UL t07, 3*VEC_LEN*SIZE(C1)
+ VST_UH t07, 4*VEC_LEN*SIZE(C1)
+
+
+$Access_C2_16x4:
+ and C2, (VEC_LEN*SIZE-1),$6
+ bne $6,$UnAlign_C2_Access_16x4
+
+$Align_C2_Access_16x4:
+ VMUL t08,ALPHA,t08
+ VMUL t09,ALPHA,t09
+ VMUL t10,ALPHA,t10
+ VMUL t11,ALPHA,t11
+
+ VST t08,0(C2)
+ VST t09,4*SIZE(C2)
+ VST t10,8*SIZE(C2)
+ VST t11,12*SIZE(C2)
+ jmp $Access_C3_16x4
+
+$UnAlign_C2_Access_16x4:
+ VMUL t08,ALPHA,t08
+ VMUL t09,ALPHA,t09
+
+ VMUL t10,ALPHA,t10
+ VST_UL t08, 0*VEC_LEN*SIZE(C2)
+ VST_UH t08, 1*VEC_LEN*SIZE(C2)
+
+ VMUL t11,ALPHA,t11
+ VST_UL t09, 1*VEC_LEN*SIZE(C2)
+ VST_UH t09, 2*VEC_LEN*SIZE(C2)
+
+ VST_UL t10, 2*VEC_LEN*SIZE(C2)
+ VST_UH t10, 3*VEC_LEN*SIZE(C2)
+
+ VST_UL t11, 3*VEC_LEN*SIZE(C2)
+ VST_UH t11, 4*VEC_LEN*SIZE(C2)
+
+
+$Access_C3_16x4:
+ and C3, (VEC_LEN*SIZE-1),$6
+ bne $6,$UnAlign_C3_Access_16x4
+
+$Align_C3_Access_16x4:
+ VMUL t12,ALPHA,t12
+ VMUL t13,ALPHA,t13
+ VMUL t14,ALPHA,t14
+ VMUL t15,ALPHA,t15
+
+ VST t12,0(C3)
+ VST t13,4*SIZE(C3)
+ VST t14,8*SIZE(C3)
+ VST t15,12*SIZE(C3)
+ jmp $TRMMKERNEL_16x4
+
+$UnAlign_C3_Access_16x4:
+ VMUL t12,ALPHA,t12
+ VMUL t13,ALPHA,t13
+
+ VMUL t14,ALPHA,t14
+ VST_UL t12, 0*VEC_LEN*SIZE(C3)
+ VST_UH t12, 1*VEC_LEN*SIZE(C3)
+
+ VMUL t15,ALPHA,t15
+ VST_UL t13, 1*VEC_LEN*SIZE(C3)
+ VST_UH t13, 2*VEC_LEN*SIZE(C3)
+
+ VST_UL t14, 2*VEC_LEN*SIZE(C3)
+ VST_UH t14, 3*VEC_LEN*SIZE(C3)
+
+ VST_UL t15, 3*VEC_LEN*SIZE(C3)
+ VST_UH t15, 4*VEC_LEN*SIZE(C3)
+
+
+$TRMMKERNEL_16x4:
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ subl KC1, KK, TEMP # TEMP = length of the no-data part
+#ifdef LEFT
+ subl TEMP, 16, TEMP # mr=16
+#else
+ subl TEMP, 4, TEMP # nr=4
+#endif
+
+ sll TEMP, 4 + BASE_SHIFT,KC # mr=16
+ sll TEMP, 2 + BASE_SHIFT,TEMP # nr=4
+
+ addl A, KC, A # mov A to the end of this panel
+ addl B, TEMP,B # mov B to the end of this panel
+#endif
+
+#ifdef LEFT
+ addl KK, 16 ,KK
+#endif
+ nop
+ jmp $End_NC_Unroll4
+#endif
+
+
+ .align 5
+
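+# M remainder for the 4-column panel: fall through 8-, 4-, 2- and 1-row
+# micro-kernels (.L15/.L16/.L17/.L18) for the leftover rows.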
+.L15: # n=4,m=8-----------------------------
+ and MC1,8,MC
+ sll KC1,3+BASE_SHIFT,SPANA # spana=kc1*mc
+ nop
+ beq MC,.L16
+
+ addl A1,SPANA,PREA
+ subl PREA,8*SIZE,PREA # PREA-=MC
+
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA))\
+ || (!defined(LEFT) && !defined(TRANSA))
+ mov B1, B # set B
+ nop
+#else
+ sll KK, 3 + BASE_SHIFT,KC # mr=8
+ sll KK, 2 + BASE_SHIFT,TEMP # nr=4
+
+ addl A,KC,A
+ addl B1,TEMP,B
+#endif
+
+ vcpys $f31,$f31,t00 # clear (32 results)
+ vcpys $f31,$f31,t01
+ vcpys $f31,$f31,t04
+ vcpys $f31,$f31,t05
+
+ LDDE b0,0(B)
+ LDDE b1,1*SIZE(B)
+ LDDE b2,2*SIZE(B)
+ LDDE b3,3*SIZE(B)
+
+ vcpys $f31,$f31,t08
+ vcpys $f31,$f31,t09
+ vcpys $f31,$f31,t12
+ vcpys $f31,$f31,t13
+
+ VLD a0,0(A) # get 8 A
+ VLD a4,4*SIZE(A)
+
+ fillcs 0(CO) # fetch C
+ fillcs 0(C1)
+ fillcs 0(C2)
+ fillcs 0(C3)
+
+ fillcs 4*SIZE(CO) #
+ fillcs 4*SIZE(C1)
+ fillcs 4*SIZE(C2)
+ fillcs 4*SIZE(C3)
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ subl KC1, KK, TEMP # temp is the length of the data part
+#elif defined(LEFT)
+ addl KK, 8, TEMP # mr=8
+#else
+ addl KK, 4, TEMP # nr=4
+#endif
+ sra TEMP,1, KC # kc/2
+ beq KC,$Rest_8x4x1
+
+#else
+
+ mov B1,B # Reset B
+ sra KC1,1,KC # unroll kc as 2, kc=kc1/2
+ vcpys $f31,$f31,t00 # clear (32 results)
+ vcpys $f31,$f31,t01
+ vcpys $f31,$f31,t04
+ vcpys $f31,$f31,t05
+
+ LDDE b0,0(B)
+ LDDE b1,1*SIZE(B)
+ LDDE b2,2*SIZE(B)
+ LDDE b3,3*SIZE(B)
+
+ vcpys $f31,$f31,t08
+ vcpys $f31,$f31,t09
+ vcpys $f31,$f31,t12
+ vcpys $f31,$f31,t13
+
+ VLD a0,0(A) # get 8 A
+ VLD a4,4*SIZE(A)
+
+ fillcs 0(CO) # fetch C
+ fillcs 0(C1)
+ fillcs 0(C2)
+ fillcs 0(C3)
+
+ fillcs 4*SIZE(CO) #
+ fillcs 4*SIZE(C1)
+ fillcs 4*SIZE(C2)
+ fillcs 4*SIZE(C3)
+
+ beq KC,$Rest_8x4x1
+#endif
+
+ .align 5
+
+$Panel_8x4x2:
+ VMAD a0,b0,t00,t00
+ VMAD a0,b1,t04,t04
+ VMAD a0,b2,t08,t08
+ VMAD a0,b3,t12,t12
+
+ LDDE nb0,4*SIZE(B) # get next 4b
+ LDDE nb1,5*SIZE(B)
+ LDDE nb2,6*SIZE(B)
+ LDDE nb3,7*SIZE(B)
+
+ addl B,8*SIZE,B # 4n*2k
+ VMAD a4,b0,t01,t01
+ VMAD a4,b1,t05,t05
+ VMAD a4,b2,t09,t09
+ VMAD a4,b3,t13,t13
+
+ VLD na8,8*SIZE(A) # get next 8a
+ VLD na12,12*SIZE(A)
+
+ fillcs 0(PREA)
+ fillcs 4*SIZE(PREA)
+ subl PREA,8*SIZE,PREA # prea -= 8
+
+ subl KC,1,KC
+ addl A,16*SIZE,A # ### next k ###8m*2k
+ VMAD na8,nb0,t00,t00
+ VMAD na8,nb1,t04,t04
+ VMAD na8,nb2,t08,t08
+ VMAD na8,nb3,t12,t12
+
+ LDDE b0,0(B) # get 3rd 4b
+ LDDE b1,1*SIZE(B)
+ LDDE b2,2*SIZE(B)
+ LDDE b3,3*SIZE(B)
+
+ VMAD na12,nb0,t01,t01
+ VMAD na12,nb1,t05,t05
+ VMAD na12,nb2,t09,t09
+ VMAD na12,nb3,t13,t13
+
+ VLD a0,0(A) # get 3rd 8a
+ VLD a4,4*SIZE(A)
+
+ fillcs 0(PREA)
+ fillcs 4*SIZE(PREA)
+ subl PREA,8*SIZE,PREA # prea -= mc
+ bne KC,$Panel_8x4x2 # loop k--
+
+$Rest_8x4x1:
+ LDDE ALPHA, 192($sp) # get alpha
+#ifndef TRMMKERNEL
+ blbc KC1, $Write_8x4
+#else
+ blbc TEMP, $Write_8x4
+#endif
+
+ addl A,8*SIZE,A # 8a*1k
+ addl B,4*SIZE,B # 4b*1K
+
+ VMAD a0,b0,t00,t00
+ VMAD a0,b1,t04,t04
+ VMAD a0,b2,t08,t08
+ VMAD a0,b3,t12,t12
+
+ fillcs 0(PREA)
+ fillcs 4*SIZE(PREA)
+ subl PREA,8*SIZE,PREA
+
+ VMAD a4,b0,t01,t01
+ VMAD a4,b1,t05,t05
+ VMAD a4,b2,t09,t09
+ VMAD a4,b3,t13,t13
+
+$Write_8x4:
+
+#ifndef TRMMKERNEL
+ and CO, (VEC_LEN*SIZE-1),$6
+ bne $6,$UnAlign_CO_Access_8x4
+
+$Align_CO_Access_8x4:
+ VLD c00,0(CO) # load the 1st column (8 elements) of the C block
+ VLD c01,4*SIZE(CO)
+
+ VMAD t00,ALPHA,c00,t00
+ VMAD t01,ALPHA,c01,t01
+
+ VST t00,0(CO)
+ VST t01,4*SIZE(CO)
+ jmp $Access_C1_8x4
+
+$UnAlign_CO_Access_8x4:
+ VLD_UL c00, 0*VEC_LEN*SIZE(CO)
+ VLD_UH c02, 1*VEC_LEN*SIZE(CO)
+
+ VLD_UL c01, 1*VEC_LEN*SIZE(CO)
+ VLD_UH c03, 2*VEC_LEN*SIZE(CO)
+
+ vbisw c00,c02,c00
+ vbisw c01,c03,c01
+
+ VMAD t00,ALPHA,c00,t00
+ VMAD t01,ALPHA,c01,t01
+
+ VST_UL t00, 0*VEC_LEN*SIZE(CO)
+ VST_UH t00, 1*VEC_LEN*SIZE(CO)
+
+ VST_UL t01, 1*VEC_LEN*SIZE(CO)
+ VST_UH t01, 2*VEC_LEN*SIZE(CO)
+
+
+$Access_C1_8x4:
+ and C1, (VEC_LEN*SIZE-1),$6
+ addl CO,8*SIZE,CO
+ nop
+ bne $6,$UnAlign_C1_Access_8x4
+
+$Align_C1_Access_8x4:
+ VLD c04,0(C1)
+ VLD c05,4*SIZE(C1)
+
+ VMAD t04,ALPHA,c04,t04
+ VMAD t05,ALPHA,c05,t05
+
+ VST t04,0(C1)
+ VST t05,4*SIZE(C1)
+ jmp $Access_C2_8x4
+
+$UnAlign_C1_Access_8x4:
+ VLD_UL c04, 0*VEC_LEN*SIZE(C1)
+ VLD_UH c06, 1*VEC_LEN*SIZE(C1)
+
+ VLD_UL c05, 1*VEC_LEN*SIZE(C1)
+ VLD_UH c07, 2*VEC_LEN*SIZE(C1)
+
+ vbisw c04,c06,c04
+ vbisw c05,c07,c05
+
+ VMAD t04,ALPHA,c04,t04
+ VMAD t05,ALPHA,c05,t05
+
+ VST_UL t04, 0*VEC_LEN*SIZE(C1)
+ VST_UH t04, 1*VEC_LEN*SIZE(C1)
+
+ VST_UL t05, 1*VEC_LEN*SIZE(C1)
+ VST_UH t05, 2*VEC_LEN*SIZE(C1)
+
+
+$Access_C2_8x4:
+ and C2, (VEC_LEN*SIZE-1),$6
+ addl C1,8*SIZE,C1
+ nop
+ bne $6,$UnAlign_C2_Access_8x4
+
+$Align_C2_Access_8x4:
+ VLD c08,0(C2)
+ VLD c09,4*SIZE(C2)
+
+ VMAD t08,ALPHA,c08,t08
+ VMAD t09,ALPHA,c09,t09
+
+ VST t08,0(C2)
+ VST t09,4*SIZE(C2)
+ jmp $Access_C3_8x4
+
+$UnAlign_C2_Access_8x4:
+ VLD_UL c08, 0*VEC_LEN*SIZE(C2)
+ VLD_UH c10, 1*VEC_LEN*SIZE(C2)
+
+ VLD_UL c09, 1*VEC_LEN*SIZE(C2)
+ VLD_UH c11, 2*VEC_LEN*SIZE(C2)
+
+ vbisw c08,c10,c08
+ vbisw c09,c11,c09
+
+ VMAD t08,ALPHA,c08,t08
+ VMAD t09,ALPHA,c09,t09
+
+ VST_UL t08, 0*VEC_LEN*SIZE(C2)
+ VST_UH t08, 1*VEC_LEN*SIZE(C2)
+
+ VST_UL t09, 1*VEC_LEN*SIZE(C2)
+ VST_UH t09, 2*VEC_LEN*SIZE(C2)
+
+
+$Access_C3_8x4:
+ and C3, (VEC_LEN*SIZE-1),$6
+ addl C2,8*SIZE,C2
+ nop
+ bne $6,$UnAlign_C3_Access_8x4
+
+$Align_C3_Access_8x4:
+ VLD c12,0(C3)
+ VLD c13,4*SIZE(C3)
+
+ VMAD t12,ALPHA,c12,t12
+ VMAD t13,ALPHA,c13,t13
+
+ VST t12,0(C3)
+ VST t13,4*SIZE(C3)
+ addl C3,8*SIZE,C3
+ jmp .L16
+
+
+$UnAlign_C3_Access_8x4:
+ VLD_UL c12, 0*VEC_LEN*SIZE(C3)
+ VLD_UH c14, 1*VEC_LEN*SIZE(C3)
+
+ VLD_UL c13, 1*VEC_LEN*SIZE(C3)
+ VLD_UH c15, 2*VEC_LEN*SIZE(C3)
+
+ vbisw c12,c14,c12
+ vbisw c13,c15,c13
+
+ VMAD t12,ALPHA,c12,t12
+ VMAD t13,ALPHA,c13,t13
+
+ VST_UL t12, 0*VEC_LEN*SIZE(C3)
+ VST_UH t12, 1*VEC_LEN*SIZE(C3)
+
+ VST_UL t13, 1*VEC_LEN*SIZE(C3)
+ VST_UH t13, 2*VEC_LEN*SIZE(C3)
+ addl C3,8*SIZE,C3
+
+#else
+
+ and CO, (VEC_LEN*SIZE-1),$6
+ bne $6,$UnAlign_CO_Access_8x4
+
+$Align_CO_Access_8x4:
+ VMUL t00,ALPHA,t00
+ VMUL t01,ALPHA,t01
+
+ VST t00,0(CO)
+ VST t01,4*SIZE(CO)
+ jmp $Access_C1_8x4
+
+$UnAlign_CO_Access_8x4:
+ VMUL t00,ALPHA,t00
+ VMUL t01,ALPHA,t01
+
+ VST_UL t00, 0*VEC_LEN*SIZE(CO)
+ VST_UH t00, 1*VEC_LEN*SIZE(CO)
+
+ VST_UL t01, 1*VEC_LEN*SIZE(CO)
+ VST_UH t01, 2*VEC_LEN*SIZE(CO)
+
+
+$Access_C1_8x4:
+ and C1, (VEC_LEN*SIZE-1),$6
+ addl CO,8*SIZE,CO # 8c
+ nop
+ bne $6,$UnAlign_C1_Access_8x4
+
+$Align_C1_Access_8x4:
+ VMUL t04,ALPHA,t04
+ VMUL t05,ALPHA,t05
+
+ VST t04,0(C1)
+ VST t05,4*SIZE(C1)
+ jmp $Access_C2_8x4
+
+$UnAlign_C1_Access_8x4:
+ VMUL t04,ALPHA,t04
+ VMUL t05,ALPHA,t05
+
+ VST_UL t04, 0*VEC_LEN*SIZE(C1)
+ VST_UH t04, 1*VEC_LEN*SIZE(C1)
+
+ VST_UL t05, 1*VEC_LEN*SIZE(C1)
+ VST_UH t05, 2*VEC_LEN*SIZE(C1)
+
+
+$Access_C2_8x4:
+ and C2, (VEC_LEN*SIZE-1),$6
+ addl C1,8*SIZE,C1 # 8c
+ nop
+ bne $6,$UnAlign_C2_Access_8x4
+
+$Align_C2_Access_8x4:
+ VMUL t08,ALPHA,t08
+ VMUL t09,ALPHA,t09
+
+ VST t08,0(C2)
+ VST t09,4*SIZE(C2)
+ jmp $Access_C3_8x4
+
+$UnAlign_C2_Access_8x4:
+ VMUL t08,ALPHA,t08
+ VMUL t09,ALPHA,t09
+
+ VST_UL t08, 0*VEC_LEN*SIZE(C2)
+ VST_UH t08, 1*VEC_LEN*SIZE(C2)
+
+ VST_UL t09, 1*VEC_LEN*SIZE(C2)
+ VST_UH t09, 2*VEC_LEN*SIZE(C2)
+
+
+$Access_C3_8x4:
+ and C3, (VEC_LEN*SIZE-1),$6
+ addl C2,8*SIZE,C2 # 8c
+ nop
+ bne $6,$UnAlign_C3_Access_8x4
+
+$Align_C3_Access_8x4:
+ VMUL t12,ALPHA,t12
+ VMUL t13,ALPHA,t13
+
+ VST t12,0(C3)
+ VST t13,4*SIZE(C3)
+ addl C3,8*SIZE,C3
+ jmp $TRMMKERNEL_8x4
+
+$UnAlign_C3_Access_8x4:
+ VMUL t12,ALPHA,t12
+ VMUL t13,ALPHA,t13
+
+ VST_UL t12, 0*VEC_LEN*SIZE(C3)
+ VST_UH t12, 1*VEC_LEN*SIZE(C3)
+
+ VST_UL t13, 1*VEC_LEN*SIZE(C3)
+ VST_UH t13, 2*VEC_LEN*SIZE(C3)
+ addl C3,8*SIZE,C3
+
+$TRMMKERNEL_8x4:
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ subl KC1, KK, TEMP
+#ifdef LEFT
+ subl TEMP, 8,TEMP # mr=8
+#else
+ subl TEMP, 4,TEMP # nr=4
+#endif
+
+ sll TEMP, 3 + BASE_SHIFT,KC
+ sll TEMP, 2 + BASE_SHIFT,TEMP
+
+ addl A, KC, A # move A, B to the end of this panel
+ addl B, TEMP, B
+#endif
+
+#ifdef LEFT
+ addl KK, 8, KK
+#endif
+#endif
+
+
+
+ .align 5
+
+.L16:
+ and MC1,4,MC # nr=4,mr=4----------------------------
+ sll KC1,2+BASE_SHIFT,SPANA # spana=kc1*mc
+ nop
+ beq MC,.L17
+
+ addl A1,SPANA,PREA
+ subl PREA,4*SIZE,PREA # PREA-=MC
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov B1,B # Set B
+ nop
+#else
+ sll KK, 2 + BASE_SHIFT,KC # mr=nr=4
+ nop
+
+ addl A, KC, A
+ addl B1,KC, B
+#endif
+
+ vcpys $f31,$f31,t00 # clear 4 accumulators (16 results)
+ vcpys $f31,$f31,t04
+ vcpys $f31,$f31,t08
+ vcpys $f31,$f31,t12
+
+ LDDE b0,0(B) # get 4b
+ LDDE b1,1*SIZE(B)
+ LDDE b2,2*SIZE(B)
+ LDDE b3,3*SIZE(B)
+
+ VLD a0,0(A) # get 4a
+
+ fillcs 0(CO) # prefetch C
+ fillcs 0(C1)
+ fillcs 0(C2)
+ fillcs 0(C3)
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ subl KC1, KK, TEMP
+#else
+ addl KK, 4, TEMP
+#endif
+ sra TEMP,1,KC
+ nop
+ beq KC,$Rest_4x4x1
+
+#else
+ mov B1,B # Reset B
+ sra KC1,1,KC # Unroll KC 2, KC=KC1/2
+ vcpys $f31,$f31,t00 # clear 4 accumulators (16 results)
+ vcpys $f31,$f31,t04
+ vcpys $f31,$f31,t08
+ vcpys $f31,$f31,t12
+
+ LDDE b0,0(B) # get 4b
+ LDDE b1,1*SIZE(B)
+ LDDE b2,2*SIZE(B)
+ LDDE b3,3*SIZE(B)
+
+ VLD a0,0(A) # get 4a
+
+ fillcs 0(CO) # prefetch C
+ fillcs 0(C1)
+ fillcs 0(C2)
+ fillcs 0(C3)
+
+ beq KC,$Rest_4x4x1
+
+#endif
+
+
+$Panel_4x4x2:
+ VMAD a0,b0,t00,t00
+ VMAD a0,b1,t04,t04
+ VMAD a0,b2,t08,t08
+ VMAD a0,b3,t12,t12
+
+ VLD a4,4*SIZE(A)
+ LDDE nb0,4*SIZE(B) # get next 4b and 4a
+ LDDE nb1,5*SIZE(B)
+ LDDE nb2,6*SIZE(B)
+ LDDE nb3,7*SIZE(B)
+ addl B,8*SIZE,B # 4b*2k
+
+ fillcs 0(PREA)
+ subl PREA,4*SIZE,PREA
+
+ subl KC,1,KC
+ VMAD a4,nb0,t00,t00
+ VMAD a4,nb1,t04,t04
+ VMAD a4,nb2,t08,t08
+ VMAD a4,nb3,t12,t12
+
+ addl A,8*SIZE,A # 4a*2k
+ LDDE b0,0(B) # get 3rd 4b and 4a
+ LDDE b1,1*SIZE(B)
+ LDDE b2,2*SIZE(B)
+ LDDE b3,3*SIZE(B)
+ VLD a0,0(A)
+
+ fillcs 0(PREA)
+ subl PREA,4*SIZE,PREA
+ bne KC,$Panel_4x4x2
+
+
+$Rest_4x4x1:
+ LDDE ALPHA, 192($sp) # Get ALPHA
+#ifndef TRMMKERNEL
+ blbc KC1, $Write_4x4
+#else
+ blbc TEMP, $Write_4x4
+#endif
+
+ addl A,4*SIZE,A # 4a*1k
+ addl B,4*SIZE,B # 4b*1K
+
+ fillcs 0(PREA)
+ subl PREA,4*SIZE,PREA
+
+ VMAD a0,b0,t00,t00
+ VMAD a0,b1,t04,t04
+ VMAD a0,b2,t08,t08
+ VMAD a0,b3,t12,t12
+
+
+$Write_4x4:
+
+#ifndef TRMMKERNEL
+ and CO, (VEC_LEN*SIZE-1),$6
+ bne $6,$UnAlign_CO_Access_4x4
+
+$Align_CO_Access_4x4:
+ VLD c00,0(CO) # load the 1st column (4 elements) of the C block
+ VMAD t00,ALPHA,c00,t00
+ VST t00,0(CO)
+ jmp $Access_C1_4x4
+
+$UnAlign_CO_Access_4x4:
+ VLD_UL c00, 0*VEC_LEN*SIZE(CO)
+ VLD_UH c02, 1*VEC_LEN*SIZE(CO)
+
+ vbisw c00,c02,c00
+
+ VMAD t00,ALPHA,c00,t00
+
+ VST_UL t00, 0*VEC_LEN*SIZE(CO)
+ VST_UH t00, 1*VEC_LEN*SIZE(CO)
+
+
+$Access_C1_4x4:
+ and C1, (VEC_LEN*SIZE-1),$6
+ addl CO,4*SIZE,CO # 4c
+ nop
+ bne $6,$UnAlign_C1_Access_4x4
+
+$Align_C1_Access_4x4:
+ VLD c04,0(C1)
+ VMAD t04,ALPHA,c04,t04
+ VST t04,0(C1)
+ jmp $Access_C2_4x4
+
+$UnAlign_C1_Access_4x4:
+ VLD_UL c04, 0*VEC_LEN*SIZE(C1)
+ VLD_UH c06, 1*VEC_LEN*SIZE(C1)
+
+ vbisw c04,c06,c04
+
+ VMAD t04,ALPHA,c04,t04
+
+ VST_UL t04, 0*VEC_LEN*SIZE(C1)
+ VST_UH t04, 1*VEC_LEN*SIZE(C1)
+
+
+$Access_C2_4x4:
+ and C2, (VEC_LEN*SIZE-1),$6
+ addl C1,4*SIZE,C1 # 4c
+ nop
+ bne $6,$UnAlign_C2_Access_4x4
+
+$Align_C2_Access_4x4:
+ VLD c08,0(C2)
+ VMAD t08,ALPHA,c08,t08
+ VST t08,0(C2)
+ jmp $Access_C3_4x4
+
+$UnAlign_C2_Access_4x4:
+ VLD_UL c08, 0*VEC_LEN*SIZE(C2)
+ VLD_UH c10, 1*VEC_LEN*SIZE(C2)
+
+ vbisw c08,c10,c08
+
+ VMAD t08,ALPHA,c08,t08
+
+ VST_UL t08, 0*VEC_LEN*SIZE(C2)
+ VST_UH t08, 1*VEC_LEN*SIZE(C2)
+
+
+$Access_C3_4x4:
+ and C3, (VEC_LEN*SIZE-1),$6
+ addl C2,4*SIZE,C2 # 4c
+ nop
+ bne $6,$UnAlign_C3_Access_4x4
+
+$Align_C3_Access_4x4:
+ VLD c12,0(C3)
+ VMAD t12,ALPHA,c12,t12
+ VST t12,0(C3)
+ addl C3,4*SIZE,C3
+ jmp .L17
+
+$UnAlign_C3_Access_4x4:
+ VLD_UL c12, 0*VEC_LEN*SIZE(C3)
+ VLD_UH c14, 1*VEC_LEN*SIZE(C3)
+
+ vbisw c12,c14,c12
+
+ VMAD t12,ALPHA,c12,t12
+
+ VST_UL t12, 0*VEC_LEN*SIZE(C3)
+ VST_UH t12, 1*VEC_LEN*SIZE(C3)
+ addl C3,4*SIZE,C3
+
+
+#else
+
+ and CO, (VEC_LEN*SIZE-1),$6
+ bne $6,$UnAlign_CO_Access_4x4
+
+$Align_CO_Access_4x4:
+ VMUL t00,ALPHA,t00
+ VST t00,0(CO)
+ jmp $Access_C1_4x4
+
+$UnAlign_CO_Access_4x4:
+ VMUL t00,ALPHA,t00 # careful: c00~c03 use the same register
+ VST_UL t00, 0*VEC_LEN*SIZE(CO)
+ VST_UH t00, 1*VEC_LEN*SIZE(CO)
+
+
+$Access_C1_4x4:
+ and C1, (VEC_LEN*SIZE-1),$6
+ addl CO,4*SIZE,CO # 4c
+ nop
+ bne $6,$UnAlign_C1_Access_4x4
+
+$Align_C1_Access_4x4:
+ VMUL t04,ALPHA,t04
+ VST t04,0(C1)
+ jmp $Access_C2_4x4
+
+$UnAlign_C1_Access_4x4:
+ VMUL t04,ALPHA,t04
+ VST_UL t04, 0*VEC_LEN*SIZE(C1)
+ VST_UH t04, 1*VEC_LEN*SIZE(C1)
+
+
+$Access_C2_4x4:
+ and C2, (VEC_LEN*SIZE-1),$6
+ addl C1,4*SIZE,C1 # 4c
+ nop
+ bne $6,$UnAlign_C2_Access_4x4
+
+$Align_C2_Access_4x4:
+ VMUL t08,ALPHA,t08
+ VST t08,0(C2)
+ jmp $Access_C3_4x4
+
+$UnAlign_C2_Access_4x4:
+ VMUL t08,ALPHA,t08
+ VST_UL t08, 0*VEC_LEN*SIZE(C2)
+ VST_UH t08, 1*VEC_LEN*SIZE(C2)
+
+
+$Access_C3_4x4:
+ and C3, (VEC_LEN*SIZE-1),$6
+ addl C2,4*SIZE,C2 # 4c
+ nop
+ bne $6,$UnAlign_C3_Access_4x4
+
+$Align_C3_Access_4x4:
+ VMUL t12,ALPHA,t12
+ VST t12,0(C3)
+ addl C3,4*SIZE,C3
+ jmp $TRMMKERNEL_4x4
+
+$UnAlign_C3_Access_4x4:
+ VMUL t12,ALPHA,t12
+ VST_UL t12, 0*VEC_LEN*SIZE(C3)
+ VST_UH t12, 1*VEC_LEN*SIZE(C3)
+ addl C3,4*SIZE,C3
+
+$TRMMKERNEL_4x4:
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ subl KC1, KK, TEMP
+ subl TEMP, 4, TEMP # mr=nr=4
+
+ sll TEMP, 2 + BASE_SHIFT,KC
+ nop
+
+ addl A, KC, A # move A B to the end of this panel
+ addl B, KC, B
+#endif
+
+#ifdef LEFT
+ addl KK, 4, KK
+#endif
+#endif
+
+
+
+
+ .align 5
+.L17: # nr=4,mr=2--------------------
+ and MC1,2,MC
+ beq MC,.L18
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA))\
+ || (!defined(LEFT) && !defined(TRANSA))
+ mov B1, B
+#else
+ sll KK, 1 + BASE_SHIFT, KC # mr=2
+ sll KK, 2 + BASE_SHIFT, TEMP # nr=4
+
+ addl A, KC, A
+ addl B1,TEMP, B
+#endif
+
+ fclr t00 # clear 8 result registers
+ fclr t01
+ fclr t04
+ fclr t05
+ fclr t08
+ fclr t09
+ fclr t12
+ fclr t13
+
+ LD b0,0(B) # get 4b
+ LD b1,1*SIZE(B)
+ LD a0,0(A) # get 2a
+ LD b2,2*SIZE(B)
+ LD b3,3*SIZE(B)
+ LD a4,1*SIZE(A)
+
+ fillcs 0(CO) # prefetch C
+ fillcs 0(C1)
+ fillcs 0(C2)
+ fillcs 0(C3)
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ subl KC1, KK, TEMP
+#elif defined(LEFT)
+ addl KK, 2, TEMP # mr=2
+#else
+ addl KK, 4, TEMP # nr=4
+#endif
+ sra TEMP, 1, KC
+ beq KC,$Rest_2x4x1
+
+#else
+ mov B1,B # reset B
+ sra KC1,1,KC # Unroll KC 2, KC=KC1/2
+ fclr t00 # clear 8 result registers
+ fclr t01
+ fclr t04
+ fclr t05
+ fclr t08
+ fclr t09
+ fclr t12
+ fclr t13
+
+ LD b0,0(B) # get 4b
+ LD b1,1*SIZE(B)
+ LD a0,0(A) # get 2a
+ LD b2,2*SIZE(B)
+ LD b3,3*SIZE(B)
+ LD a4,1*SIZE(A)
+
+ fillcs 0(CO) # prefetch C
+ fillcs 0(C1)
+ fillcs 0(C2)
+ fillcs 0(C3)
+
+ beq KC,$Rest_2x4x1
+#endif
+
+
+$Panel_2x4x2:
+ MAD a0,b0,t00,t00
+ MAD a0,b1,t04,t04
+ MAD a0,b2,t08,t08
+ MAD a0,b3,t12,t12
+
+ LD nb0,4*SIZE(B) # get next 4b and 2a
+ LD nb1,5*SIZE(B)
+ LD a8,2*SIZE(A)
+ LD nb2,6*SIZE(B)
+ LD nb3,7*SIZE(B)
+ LD a12,3*SIZE(A)
+ addl B,8*SIZE,B # 4b*2k
+
+ MAD a4,b0,t01,t01
+ MAD a4,b1,t05,t05
+ MAD a4,b2,t09,t09
+ MAD a4,b3,t13,t13
+
+ subl KC,1,KC
+ MAD a8,nb0,t00,t00
+ MAD a8,nb1,t04,t04
+ MAD a8,nb2,t08,t08
+ MAD a8,nb3,t12,t12
+
+ addl A,4*SIZE,A # 2a*2k
+ LD b0,0(B) # get 3rd 4b and 2a
+ LD b1,1*SIZE(B)
+ LD a0,0(A)
+ LD b2,2*SIZE(B)
+ LD b3,3*SIZE(B)
+ LD a4,1*SIZE(A)
+
+ MAD a12,nb0,t01,t01
+ MAD a12,nb1,t05,t05
+ MAD a12,nb2,t09,t09
+ MAD a12,nb3,t13,t13
+
+ bne KC,$Panel_2x4x2
+
+
+$Rest_2x4x1:
+ LD ALPHA, 192($sp) # get alpha
+#ifndef TRMMKERNEL
+ blbc KC1, $Write_2x4
+#else
+ blbc TEMP, $Write_2x4
+#endif
+
+ addl A,2*SIZE,A # 2a*1k
+ addl B,4*SIZE,B # 4b*1K
+
+ MAD a0,b0,t00,t00
+ MAD a0,b1,t04,t04
+ MAD a0,b2,t08,t08
+ MAD a0,b3,t12,t12
+
+ MAD a4,b0,t01,t01
+ MAD a4,b1,t05,t05
+ MAD a4,b2,t09,t09
+ MAD a4,b3,t13,t13
+
+$Write_2x4:
+#ifndef TRMMKERNEL
+ LD c00,0(CO)
+ LD c01,1*SIZE(CO)
+ LD c04,0(C1)
+ LD c05,1*SIZE(C1)
+
+ MAD t00,ALPHA,c00,t00
+ MAD t01,ALPHA,c01,t01
+
+ LD c08,0(C2)
+ LD c09,1*SIZE(C2)
+
+ MAD t04,ALPHA,c04,t04
+ MAD t05,ALPHA,c05,t05
+
+ LD c12,0(C3)
+ LD c13,1*SIZE(C3)
+
+ MAD t08,ALPHA,c08,t08
+ MAD t09,ALPHA,c09,t09
+
+ addl CO,2*SIZE,CO # 2c
+ addl C1,2*SIZE,C1
+ addl C2,2*SIZE,C2
+ addl C3,2*SIZE,C3
+
+ ST t00,-2*SIZE(CO) # 2c
+ ST t01,-1*SIZE(CO)
+
+ MAD t12,ALPHA,c12,t12
+ MAD t13,ALPHA,c13,t13
+
+ ST t04,-2*SIZE(C1)
+ ST t05,-1*SIZE(C1)
+
+ ST t08,-2*SIZE(C2)
+ ST t09,-1*SIZE(C2)
+
+ ST t12,-2*SIZE(C3)
+ ST t13,-1*SIZE(C3)
+
+#else
+ MUL t00,ALPHA,t00
+ MUL t01,ALPHA,t01
+
+ MUL t04,ALPHA,t04
+ MUL t05,ALPHA,t05
+
+ MUL t08,ALPHA,t08
+ MUL t09,ALPHA,t09
+
+ addl CO,2*SIZE,CO # 2c
+ addl C1,2*SIZE,C1
+ addl C2,2*SIZE,C2
+ addl C3,2*SIZE,C3
+
+ ST t00,-2*SIZE(CO) # 2c
+ ST t01,-1*SIZE(CO)
+
+ MUL t12,ALPHA,t12
+ MUL t13,ALPHA,t13
+
+ ST t04,-2*SIZE(C1)
+ ST t05,-1*SIZE(C1)
+
+ ST t08,-2*SIZE(C2)
+ ST t09,-1*SIZE(C2)
+
+ ST t12,-2*SIZE(C3)
+ ST t13,-1*SIZE(C3)
+
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ subl KC1, KK, TEMP
+#ifdef LEFT
+ subl TEMP, 2, TEMP
+#else
+ subl TEMP, 4, TEMP
+#endif
+
+ sll TEMP, 1 + BASE_SHIFT,KC
+ sll TEMP, 2 + BASE_SHIFT,TEMP
+
+ addl A, KC, A
+ addl B, TEMP, B
+#endif
+
+#ifdef LEFT
+ addl KK,2,KK
+#endif
+#endif
+
+
+
+.align 5
+.L18: # nr=4,mr=1---------------------------
+ and MC1,1,MC
+ beq MC,$End_NC_Unroll4
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov B1, B
+ nop
+#else
+ sll KK, BASE_SHIFT, KC # mr=1
+ sll KK, 2 + BASE_SHIFT,TEMP # nr=4
+
+ addl A, KC, A
+ addl B1,TEMP, B
+#endif
+
+ fclr t00 # clear 4 result registers
+ fclr t04
+ fclr t08
+ fclr t12
+
+ LD b0,0(B) # get 4b
+ LD b1,1*SIZE(B)
+ LD b2,2*SIZE(B)
+ LD b3,3*SIZE(B)
+
+ LD a0,0(A) # get 1 a
+
+ fillcs 0(CO) # prefetch C
+ fillcs 0(C1)
+ fillcs 0(C2)
+ fillcs 0(C3)
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ subl KC1, KK, TEMP
+#elif defined(LEFT)
+ addl KK, 1, TEMP # mr=1
+#else
+ addl KK, 4,TEMP # nr=4
+#endif
+ sra TEMP,1,KC
+ beq KC,$Rest_1x4x1
+
+#else
+ mov B1,B # Reset B
+ fclr t00 # clear 4 result registers
+ fclr t04
+ fclr t08
+ fclr t12
+ sra KC1,1,KC # Unroll KC 2, KC=KC1/2
+
+ LD b0,0(B) # get 4b
+ LD b1,1*SIZE(B)
+ LD b2,2*SIZE(B)
+ LD b3,3*SIZE(B)
+
+ LD a0,0(A) # get 1 a
+
+ fillcs 0(CO) # prefetch C
+ fillcs 0(C1)
+ fillcs 0(C2)
+ fillcs 0(C3)
+
+ beq KC,$Rest_1x4x1
+
+#endif
+
+
+$Panel_1x4x2:
+ MAD a0,b0,t00,t00
+ MAD a0,b1,t04,t04
+ MAD a0,b2,t08,t08
+ MAD a0,b3,t12,t12
+
+ LD a8,1*SIZE(A)
+ LD nb0,4*SIZE(B)
+ LD nb1,5*SIZE(B)
+ LD nb2,6*SIZE(B)
+ LD nb3,7*SIZE(B)
+
+ addl B,8*SIZE,B # 4b*2k
+
+ subl KC,1,KC
+ MAD a8,nb0,t00,t00
+ MAD a8,nb1,t04,t04
+ MAD a8,nb2,t08,t08
+ MAD a8,nb3,t12,t12
+
+ addl A,2*SIZE,A # 1a*2k
+ LD a0,0(A) # get 3rd 4b and 1a
+ LD b0,0(B)
+ LD b1,1*SIZE(B)
+ LD b2,2*SIZE(B)
+ LD b3,3*SIZE(B)
+ bne KC,$Panel_1x4x2
+
+
+$Rest_1x4x1:
+ LD ALPHA,192($sp) # get alpha
+#ifndef TRMMKERNEL
+ blbc KC1, $Write_1x4
+#else
+ blbc TEMP, $Write_1x4
+#endif
+
+ addl A,1*SIZE,A # 1a*1k
+ addl B,4*SIZE,B # 4b*1k
+
+ MAD a0,b0,t00,t00
+ MAD a0,b1,t04,t04
+ MAD a0,b2,t08,t08
+ MAD a0,b3,t12,t12
+
+
+$Write_1x4:
+#ifndef TRMMKERNEL
+ LD c00,0(CO)
+ LD c04,0(C1)
+ MAD t00,ALPHA,c00,t00
+ MAD t04,ALPHA,c04,t04
+ LD c08,0(C2)
+ LD c12,0(C3)
+ MAD t08,ALPHA,c08,t08
+ MAD t12,ALPHA,c12,t12
+ ST t00,0(CO)
+ ST t04,0(C1)
+ ST t08,0(C2)
+ ST t12,0(C3)
+
+#else
+ MUL t00,ALPHA,t00
+ MUL t04,ALPHA,t04
+ MUL t08,ALPHA,t08
+ MUL t12,ALPHA,t12
+
+ ST t00,0(CO)
+ ST t04,0(C1)
+ ST t08,0(C2)
+ ST t12,0(C3)
+
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ subl KC1, KK, TEMP
+#ifdef LEFT
+ subl TEMP, 1, TEMP
+#else
+ subl TEMP, 4, TEMP
+#endif
+
+ sll TEMP, BASE_SHIFT, KC
+ sll TEMP, 2 + BASE_SHIFT, TEMP
+
+ addl A, KC, A
+ addl B, TEMP,B
+#endif
+
+#ifdef LEFT
+ addl KK, 1,KK
+#endif
+#endif
+
+
+ .align 5
+
+$End_NC_Unroll4:
+ subl NC,1,NC # Loop N --
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addl KK, 4, KK
+ nop
+#endif
+ mov A1,A # Reset A
+ mov B, B1 # advance B1 to the next panel of B
+ bne NC,.L0
+
+
+
+
+ .align 5
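+# N remainder: two leftover columns (NC1 & 2), processed with 16x2 blocks
+# first and narrower M cases afterwards.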
+$Begin_NC_Unroll2:
+
+ and NC1, 2, NC
+ beq NC, $Begin_NC_Unroll1
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mov OFFSET, KK # reset KK
+#endif
+
+ mov C,CO
+ addl C,LDM,C1
+
+ sra MC1,4,MC # MC=MC1/16
+ sll KC1,4+BASE_SHIFT,SPANA # SPANA=KC1*MC
+
+ addl A1,SPANA,PREA
+ subl PREA,16*SIZE,PREA
+
+ addl C1,LDM,C # C=C1+LDM, Mov C to Next Panel
+ beq MC,.L25 # MC=0:MC1<16
+
+
+ .align 5
+.L2: # nr=2,mr=16-------------------
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA))\
+ || (!defined(LEFT) && !defined(TRANSA))
+ mov B1,B
+#else
+ sll KK, 4 + BASE_SHIFT,KC # mr=16
+ sll KK, 1 + BASE_SHIFT,TEMP # nr=2
+
+ addl A,KC,A
+ addl B1,TEMP,B
+#endif
+
+ vcpys $f31,$f31,t00 # clear result registers
+ vcpys $f31,$f31,t01
+ vcpys $f31,$f31,t02
+ vcpys $f31,$f31,t03
+
+ LDDE b0,0(B)
+ LDDE b1,1*SIZE(B)
+
+ VLD a0,0(A) # Get 16 A and 2 B
+ VLD a4,4*SIZE(A)
+ VLD a8,8*SIZE(A)
+ VLD a12,12*SIZE(A)
+
+ vcpys $f31,$f31,t04
+ vcpys $f31,$f31,t06
+ vcpys $f31,$f31,t05
+ vcpys $f31,$f31,t07
+
+ fillcs 0(CO) # fetch C
+ fillcs 0(C1)
+ fillcs 8*SIZE(CO)
+ fillcs 8*SIZE(C1)
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ subl KC1, KK, TEMP
+#elif defined(LEFT)
+ addl KK, 16, TEMP # mr=16
+#else
+ addl KK, 2, TEMP # nr=2
+#endif
+ sra TEMP, 1, KC
+ nop
+ beq KC,$Rest_16x2x1
+
+#else
+
+ mov B1,B # Set B
+ sra KC1,1,KC # Unroll KC 2, KC=KC1/2
+ vcpys $f31,$f31,t00 # clear result registers
+ vcpys $f31,$f31,t01
+ vcpys $f31,$f31,t02
+ vcpys $f31,$f31,t03
+
+ LDDE b0,0(B)
+ LDDE b1,1*SIZE(B)
+
+ VLD a0,0(A) # Get 16 A and 2 B
+ VLD a4,4*SIZE(A)
+ VLD a8,8*SIZE(A)
+ VLD a12,12*SIZE(A)
+
+ vcpys $f31,$f31,t04
+ vcpys $f31,$f31,t06
+ vcpys $f31,$f31,t05
+ vcpys $f31,$f31,t07
+
+ fillcs 0(CO) # fetch C
+ fillcs 0(C1)
+ fillcs 8*SIZE(CO)
+ fillcs 8*SIZE(C1)
+
+ beq KC,$Rest_16x2x1
+
+#endif
+
+
+$Panel_16x2x2:
+ VMAD a0,b0,t00,t00
+ VMAD a0,b1,t04,t04
+
+ addl A,16*SIZE,A # 16m*1k
+ LDDE nb0,2*SIZE(B)
+ LDDE nb1,3*SIZE(B)
+
+ VMAD a4,b0,t01,t01
+ VMAD a4,b1,t05,t05
+
+ addl B,4*SIZE,B # 2n*2k
+ VLD na0,0(A)
+ VLD na4,4*SIZE(A)
+ VLD na8,8*SIZE(A)
+ VLD na12,12*SIZE(A)
+
+ VMAD a8,b0,t02,t02
+ VMAD a8,b1,t06,t06
+
+ VMAD a12,b0,t03,t03
+ VMAD a12,b1,t07,t07
+
+ fillcs 0(PREA)
+ fillcs 8*SIZE(PREA)
+ subl PREA,16*SIZE,PREA
+
+ subl KC,1,KC
+ VMAD na0,nb0,t00,t00
+ VMAD na0,nb1,t04,t04
+
+ addl A,16*SIZE,A # 16m*1k
+ LDDE b0,0(B)
+ LDDE b1,1*SIZE(B)
+
+ VMAD na4,nb0,t01,t01
+ VMAD na4,nb1,t05,t05
+
+ VLD a0,0(A) # get 3rd 16a
+ VLD a4,4*SIZE(A)
+ VLD a8,8*SIZE(A)
+ VLD a12,12*SIZE(A)
+
+ VMAD na8,nb0,t02,t02
+ VMAD na8,nb1,t06,t06
+
+ VMAD na12,nb0,t03,t03
+ VMAD na12,nb1,t07,t07
+
+ fillcs 0(PREA)
+ fillcs 8*SIZE(PREA)
+ subl PREA,16*SIZE,PREA
+ bne KC,$Panel_16x2x2
+
+
+$Rest_16x2x1:
+ LDDE ALPHA, 192($sp) # get alpha
+#ifndef TRMMKERNEL
+ blbc KC1, $Write_16x2
+#else
+ blbc TEMP, $Write_16x2
+#endif
+
+ addl A,16*SIZE,A # 16m*1k
+ addl B,2*SIZE,B # 2n*1k
+
+ VMAD a0,b0,t00,t00
+ VMAD a0,b1,t04,t04
+
+ fillcs 0(PREA)
+ fillcs 8*SIZE(PREA)
+ subl PREA,16*SIZE,PREA
+
+ VMAD a4,b0,t01,t01
+ VMAD a4,b1,t05,t05
+ VMAD a8,b0,t02,t02
+ VMAD a8,b1,t06,t06
+ VMAD a12,b0,t03,t03
+ VMAD a12,b1,t07,t07
+
+
+$Write_16x2:
+
+#ifndef TRMMKERNEL
+ and CO, (VEC_LEN*SIZE-1),$6
+ bne $6,$UnAlign_CO_Access_16x2
+
+$Align_CO_Access_16x2:
+ VLD c00,0(CO) # get 1st column of 16c
+ VLD c01,4*SIZE(CO)
+ VLD c02,8*SIZE(CO)
+ VLD c03,12*SIZE(CO)
+
+ VMAD t00,ALPHA,c00,t00
+ VMAD t01,ALPHA,c01,t01
+ VMAD t02,ALPHA,c02,t02
+ VMAD t03,ALPHA,c03,t03
+
+ VST t00,0(CO)
+ VST t01,4*SIZE(CO)
+ VST t02,8*SIZE(CO)
+ VST t03,12*SIZE(CO)
+ jmp $Access_C1_16x2
+
+$UnAlign_CO_Access_16x2:
+ VLD_UL c00, 0*VEC_LEN*SIZE(CO)
+ VLD_UH c04, 1*VEC_LEN*SIZE(CO)
+
+ VLD_UL c01, 1*VEC_LEN*SIZE(CO)
+ VLD_UH c05, 2*VEC_LEN*SIZE(CO)
+
+ VLD_UL c02, 2*VEC_LEN*SIZE(CO)
+ VLD_UH c06, 3*VEC_LEN*SIZE(CO)
+
+ VLD_UL c03, 3*VEC_LEN*SIZE(CO)
+ VLD_UH c07, 4*VEC_LEN*SIZE(CO)
+
+ vbisw c00,c04,c00
+ vbisw c01,c05,c01
+ vbisw c02,c06,c02
+ vbisw c03,c07,c03
+
+ VMAD t00,ALPHA,c00,t00
+ VMAD t01,ALPHA,c01,t01
+ VMAD t02,ALPHA,c02,t02
+ VMAD t03,ALPHA,c03,t03
+
+ VST_UL t00, 0*VEC_LEN*SIZE(CO)
+ VST_UH t00, 1*VEC_LEN*SIZE(CO)
+
+ VST_UL t01, 1*VEC_LEN*SIZE(CO)
+ VST_UH t01, 2*VEC_LEN*SIZE(CO)
+
+ VST_UL t02, 2*VEC_LEN*SIZE(CO)
+ VST_UH t02, 3*VEC_LEN*SIZE(CO)
+
+ VST_UL t03, 3*VEC_LEN*SIZE(CO)
+ VST_UH t03, 4*VEC_LEN*SIZE(CO)
+
+
+$Access_C1_16x2:
+ and C1, (VEC_LEN*SIZE-1),$6
+ bne $6,$UnAlign_C1_Access_16x2
+
+$Align_C1_Access_16x2:
+ VLD c04,0(C1)
+ VLD c05,4*SIZE(C1)
+ VLD c06,8*SIZE(C1)
+ VLD c07,12*SIZE(C1)
+
+ VMAD t04,ALPHA,c04,t04
+ VMAD t05,ALPHA,c05,t05
+ VMAD t06,ALPHA,c06,t06
+ VMAD t07,ALPHA,c07,t07
+
+ VST t04,0(C1)
+ VST t05,4*SIZE(C1)
+ VST t06,8*SIZE(C1)
+ VST t07,12*SIZE(C1)
+ jmp $End_NC_Unroll2
+
+$UnAlign_C1_Access_16x2:
+ VLD_UL c04, 0*VEC_LEN*SIZE(C1)
+ VLD_UH t00, 1*VEC_LEN*SIZE(C1)
+
+ VLD_UL c05, 1*VEC_LEN*SIZE(C1)
+ VLD_UH t01, 2*VEC_LEN*SIZE(C1)
+
+ VLD_UL c06, 2*VEC_LEN*SIZE(C1)
+ VLD_UH t02, 3*VEC_LEN*SIZE(C1)
+
+ VLD_UL c07, 3*VEC_LEN*SIZE(C1)
+ VLD_UH t03, 4*VEC_LEN*SIZE(C1)
+
+ vbisw c04,t00,c04
+ vbisw c05,t01,c05
+ vbisw c06,t02,c06
+ vbisw c07,t03,c07
+
+ VMAD t04,ALPHA,c04,t04
+ VMAD t05,ALPHA,c05,t05
+ VMAD t06,ALPHA,c06,t06
+ VMAD t07,ALPHA,c07,t07
+
+ VST_UL t04, 0*VEC_LEN*SIZE(C1)
+ VST_UH t04, 1*VEC_LEN*SIZE(C1)
+
+ VST_UL t05, 1*VEC_LEN*SIZE(C1)
+ VST_UH t05, 2*VEC_LEN*SIZE(C1)
+
+ VST_UL t06, 2*VEC_LEN*SIZE(C1)
+ VST_UH t06, 3*VEC_LEN*SIZE(C1)
+
+ VST_UL t07, 3*VEC_LEN*SIZE(C1)
+ VST_UH t07, 4*VEC_LEN*SIZE(C1)
+ jmp $End_NC_Unroll2 # loop m finished
+
+
+#else
+
+ and CO, (VEC_LEN*SIZE-1),$6
+ bne $6,$UnAlign_CO_Access_16x2
+
+$Align_CO_Access_16x2:
+ VMUL t00,ALPHA,t00
+ VMUL t01,ALPHA,t01
+ VMUL t02,ALPHA,t02
+ VMUL t03,ALPHA,t03
+
+ VST t00,0(CO)
+ VST t01,4*SIZE(CO)
+ VST t02,8*SIZE(CO)
+ VST t03,12*SIZE(CO)
+ jmp $Access_C1_16x2
+
+$UnAlign_CO_Access_16x2:
+ VMUL t00,ALPHA,t00
+ VMUL t01,ALPHA,t01
+ VMUL t02,ALPHA,t02
+ VMUL t03,ALPHA,t03
+
+ VST_UL t00, 0*VEC_LEN*SIZE(CO)
+ VST_UH t00, 1*VEC_LEN*SIZE(CO)
+
+ VST_UL t01, 1*VEC_LEN*SIZE(CO)
+ VST_UH t01, 2*VEC_LEN*SIZE(CO)
+
+ VST_UL t02, 2*VEC_LEN*SIZE(CO)
+ VST_UH t02, 3*VEC_LEN*SIZE(CO)
+
+ VST_UL t03, 3*VEC_LEN*SIZE(CO)
+ VST_UH t03, 4*VEC_LEN*SIZE(CO)
+
+
+$Access_C1_16x2:
+ and C1, (VEC_LEN*SIZE-1),$6
+ bne $6,$UnAlign_C1_Access_16x2
+
+$Align_C1_Access_16x2:
+ VMUL t04,ALPHA,t04
+ VMUL t05,ALPHA,t05
+ VMUL t06,ALPHA,t06
+ VMUL t07,ALPHA,t07
+
+ VST t04,0(C1)
+ VST t05,4*SIZE(C1)
+ VST t06,8*SIZE(C1)
+ VST t07,12*SIZE(C1)
+ jmp $TRMMKERNEL_16x2
+
+$UnAlign_C1_Access_16x2:
+ VMUL t04,ALPHA,t04
+ VMUL t05,ALPHA,t05
+ VMUL t06,ALPHA,t06
+ VMUL t07,ALPHA,t07
+
+ VST_UL t04, 0*VEC_LEN*SIZE(C1)
+ VST_UH t04, 1*VEC_LEN*SIZE(C1)
+
+ VST_UL t05, 1*VEC_LEN*SIZE(C1)
+ VST_UH t05, 2*VEC_LEN*SIZE(C1)
+
+ VST_UL t06, 2*VEC_LEN*SIZE(C1)
+ VST_UH t06, 3*VEC_LEN*SIZE(C1)
+
+ VST_UL t07, 3*VEC_LEN*SIZE(C1)
+ VST_UH t07, 4*VEC_LEN*SIZE(C1)
+
+$TRMMKERNEL_16x2:
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ subl KC1, KK, TEMP
+#ifdef LEFT
+ subl TEMP, 16, TEMP
+#else
+ subl TEMP, 2, TEMP
+#endif
+
+ sll TEMP, 4 + BASE_SHIFT,KC
+ sll TEMP, 1 + BASE_SHIFT,TEMP
+
+ addl A, KC, A
+ addl B, TEMP,B
+#endif
+
+#ifdef LEFT
+ addl KK, 16, KK
+ nop
+#endif
+
+ jmp $End_NC_Unroll2 # loop m finished
+#endif
+
+
+
+ .align 5
+
+.L25:
+ and MC1,8,MC
+ sll KC1,3+BASE_SHIFT,SPANA # spana=kc1*mc
+ nop
+ beq MC,.L26
+
+ addl A1,SPANA,PREA
+ subl PREA,8*SIZE,PREA # PREA-=MC
+
+
+ .align 5
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA))\
+ || (!defined(LEFT) && !defined(TRANSA))
+ mov B1, B
+#else
+ sll KK, 3 + BASE_SHIFT,KC # mr=8
+ sll KK, 1 + BASE_SHIFT,TEMP # nr=2
+
+ addl A,KC, A
+ addl B1,TEMP,B
+#endif
+
+ vcpys $f31,$f31,t00 # clear 4 vector registers (16 results)
+ vcpys $f31,$f31,t01
+
+ LDDE b0,0(B) # Get 2b
+ LDDE b1,1*SIZE(B)
+
+ vcpys $f31,$f31,t04
+ vcpys $f31,$f31,t05
+
+ VLD a0,0(A) # Get 8a
+ VLD a4,4*SIZE(A)
+
+ fillcs 0(CO) # fetch C
+ fillcs 0(C1)
+ fillcs 4*SIZE(CO)
+ fillcs 4*SIZE(C1)
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ subl KC1, KK, TEMP
+#elif defined(LEFT)
+ addl KK, 8, TEMP # mr=8
+#else
+ addl KK, 2, TEMP # nr=2
+#endif
+ sra TEMP, 1,KC
+ nop
+ beq KC,$Rest_8x2x1
+
+#else
+
+ mov B1, B
+ sra KC1,1,KC
+ vcpys $f31,$f31,t00 # clear 4 vector registers (16 results)
+ vcpys $f31,$f31,t01
+
+ LDDE b0,0(B) # Get 2b
+ LDDE b1,1*SIZE(B)
+
+ vcpys $f31,$f31,t04
+ vcpys $f31,$f31,t05
+
+ VLD a0,0(A) # Get 8a
+ VLD a4,4*SIZE(A)
+
+ fillcs 0(CO) # fetch C
+ fillcs 0(C1)
+ fillcs 4*SIZE(CO)
+ fillcs 4*SIZE(C1)
+
+ beq KC,$Rest_8x2x1
+#endif
+
+
+$Panel_8x2x2:
+ VMAD a0,b0,t00,t00
+ VMAD a0,b1,t04,t04
+
+ LDDE nb0,2*SIZE(B) # get next 2b
+ LDDE nb1,3*SIZE(B)
+
+ VMAD a4,b0,t01,t01
+ VMAD a4,b1,t05,t05
+
+ addl B,4*SIZE,B # 2n*2k
+ VLD na8,8*SIZE(A) # get next 8a
+ VLD na12,12*SIZE(A)
+
+ fillcs 0(PREA)
+ fillcs 4*SIZE(PREA)
+ subl PREA,8*SIZE,PREA
+
+ subl KC,1,KC
+ VMAD na8,nb0,t00,t00
+ VMAD na8,nb1,t04,t04
+
+ addl A,16*SIZE,A # 8m*2k
+ LDDE b0,0(B)
+ LDDE b1,1*SIZE(B) # get 3rd 2b
+
+ VMAD na12,nb0,t01,t01
+ VMAD na12,nb1,t05,t05
+
+ VLD a0,0(A) # get 3rd 8a
+ VLD a4,4*SIZE(A)
+
+ fillcs 0(PREA)
+ fillcs 4*SIZE(PREA)
+ subl PREA,8*SIZE,PREA
+ bne KC,$Panel_8x2x2
+
+
+$Rest_8x2x1:
+ LDDE ALPHA,192($sp) # get alpha
+#ifndef TRMMKERNEL
+ blbc KC1,$Write_8x2
+#else
+ blbc TEMP,$Write_8x2
+#endif
+
+ addl A,8*SIZE,A # 8m*1k
+ addl B,2*SIZE,B # 2n*1K
+
+ fillcs 0(PREA)
+ fillcs 4*SIZE(PREA)
+ subl PREA,8*SIZE,PREA
+
+ VMAD a0,b0,t00,t00
+ VMAD a0,b1,t04,t04
+ VMAD a4,b0,t01,t01
+ VMAD a4,b1,t05,t05
+
+
+$Write_8x2:
+
+#ifndef TRMMKERNEL
+ and CO, (VEC_LEN*SIZE-1),$6
+ bne $6,$UnAlign_CO_Access_8x2
+
+$Align_CO_Access_8x2:
+ VLD c00,0(CO) # get 1st column of 8c
+ VLD c01,4*SIZE(CO)
+
+ VMAD t00,ALPHA,c00,t00
+ VMAD t01,ALPHA,c01,t01
+
+ VST t00,0(CO)
+ VST t01,4*SIZE(CO)
+ jmp $Access_C1_8x2
+
+$UnAlign_CO_Access_8x2:
+ VLD_UL c00, 0*VEC_LEN*SIZE(CO)
+ VLD_UH c02, 1*VEC_LEN*SIZE(CO)
+
+ VLD_UL c01, 1*VEC_LEN*SIZE(CO)
+ VLD_UH c03, 2*VEC_LEN*SIZE(CO)
+
+ vbisw c00,c02,c00
+ vbisw c01,c03,c01
+
+ VMAD t00,ALPHA,c00,t00
+ VMAD t01,ALPHA,c01,t01
+
+ VST_UL t00, 0*VEC_LEN*SIZE(CO)
+ VST_UH t00, 1*VEC_LEN*SIZE(CO)
+
+ VST_UL t01, 1*VEC_LEN*SIZE(CO)
+ VST_UH t01, 2*VEC_LEN*SIZE(CO)
+
+
+$Access_C1_8x2:
+ and C1, (VEC_LEN*SIZE-1),$6
+ addl CO,8*SIZE,CO # 8c
+ nop
+ bne $6,$UnAlign_C1_Access_8x2
+
+$Align_C1_Access_8x2:
+ VLD c04,0(C1)
+ VLD c05,4*SIZE(C1)
+
+ VMAD t04,ALPHA,c04,t04
+ VMAD t05,ALPHA,c05,t05
+
+ VST t04,0(C1)
+ VST t05,4*SIZE(C1)
+ addl C1,8*SIZE,C1
+ jmp .L26
+
+$UnAlign_C1_Access_8x2:
+ VLD_UL c04, 0*VEC_LEN*SIZE(C1)
+ VLD_UH c06, 1*VEC_LEN*SIZE(C1)
+
+ VLD_UL c05, 1*VEC_LEN*SIZE(C1)
+ VLD_UH c07, 2*VEC_LEN*SIZE(C1)
+
+ vbisw c04,c06,c04
+ vbisw c05,c07,c05
+
+ VMAD t04,ALPHA,c04,t04
+ VMAD t05,ALPHA,c05,t05
+
+ VST_UL t04, 0*VEC_LEN*SIZE(C1)
+ VST_UH t04, 1*VEC_LEN*SIZE(C1)
+
+ VST_UL t05, 1*VEC_LEN*SIZE(C1)
+ VST_UH t05, 2*VEC_LEN*SIZE(C1)
+ addl C1,8*SIZE,C1
+
+#else
+
+ and CO, (VEC_LEN*SIZE-1),$6
+ bne $6,$UnAlign_CO_Access_8x2
+
+$Align_CO_Access_8x2:
+ VMUL t00,ALPHA,t00
+ VMUL t01,ALPHA,t01
+
+ VST t00,0(CO)
+ VST t01,4*SIZE(CO)
+ jmp $Access_C1_8x2
+
+$UnAlign_CO_Access_8x2:
+ VMUL t00,ALPHA,t00
+ VMUL t01,ALPHA,t01
+
+ VST_UL t00, 0*VEC_LEN*SIZE(CO)
+ VST_UH t00, 1*VEC_LEN*SIZE(CO)
+
+ VST_UL t01, 1*VEC_LEN*SIZE(CO)
+ VST_UH t01, 2*VEC_LEN*SIZE(CO)
+
+
+$Access_C1_8x2:
+ and C1, (VEC_LEN*SIZE-1),$6
+ addl CO,8*SIZE,CO # 8c
+ nop
+ bne $6,$UnAlign_C1_Access_8x2
+
+$Align_C1_Access_8x2:
+ VMUL t04,ALPHA,t04
+ VMUL t05,ALPHA,t05
+
+ VST t04,0(C1)
+ VST t05,4*SIZE(C1)
+ addl C1,8*SIZE,C1
+ jmp $TRMMKERNEL_8x2
+
+$UnAlign_C1_Access_8x2:
+ VMUL t04,ALPHA,t04
+ VMUL t05,ALPHA,t05
+
+ VST_UL t04, 0*VEC_LEN*SIZE(C1)
+ VST_UH t04, 1*VEC_LEN*SIZE(C1)
+
+ VST_UL t05, 1*VEC_LEN*SIZE(C1)
+ VST_UH t05, 2*VEC_LEN*SIZE(C1)
+ addl C1,8*SIZE,C1
+
+$TRMMKERNEL_8x2:
+
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ subl KC1, KK,TEMP
+#ifdef LEFT
+ subl TEMP, 8,TEMP # mr=8
+#else
+ subl TEMP, 2,TEMP # nr=2
+#endif
+
+ sll TEMP, 3 + BASE_SHIFT,KC
+ sll TEMP, 1 + BASE_SHIFT,TEMP
+
+ addl A,KC,A
+ addl B,TEMP,B
+#endif
+
+#ifdef LEFT
+ addl KK,8,KK
+ nop
+#endif
+#endif
+
+
+
+ .align 5
+
+.L26: # nr=2,mr=4------------------
+ and MC1,4,MC # MC1&4
+ beq MC,.L27
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov B1, B
+ nop
+#else
+ sll KK, 2 + BASE_SHIFT,KC # mr=4
+ sll KK, 1 + BASE_SHIFT,TEMP # nr=2
+
+ addl A,KC,A
+ addl B1,TEMP,B
+#endif
+
+ vcpys $f31,$f31,t00 # clear 2 vector registers
+ vcpys $f31,$f31,t04
+
+ LDDE b0,0(B) # get 2b
+ LDDE b1,1*SIZE(B)
+
+ VLD a0,0(A) # Get 4 a
+
+ fillcs 0(CO) # fetch C
+ fillcs 0(C1)
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ subl KC1, KK, TEMP
+#elif defined(LEFT)
+ addl KK, 4, TEMP
+#else
+ addl KK, 2, TEMP
+#endif
+ sra TEMP,1,KC
+ beq KC,$Rest_4x2x1
+
+#else
+
+ mov B1,B
+ sra KC1,1,KC
+ vcpys $f31,$f31,t00 # clear 2 vector registers
+ vcpys $f31,$f31,t04
+
+ LDDE b0,0(B) # get 2b
+ LDDE b1,1*SIZE(B)
+
+ VLD a0,0(A) # Get 4 a
+
+ fillcs 0(CO) # fetch C
+ fillcs 0(C1)
+
+ beq KC,$Rest_4x2x1
+#endif
+
+$Panel_4x2x2:
+ VMAD a0,b0,t00,t00
+ VMAD a0,b1,t04,t04
+
+ LDDE nb0,2*SIZE(B) # get next 2b
+ LDDE nb1,3*SIZE(B)
+
+ addl B,4*SIZE,B # 2n*2K
+ VLD a4,4*SIZE(A) # get next 4a
+
+ subl KC,1,KC
+ VMAD a4,nb0,t00,t00
+ VMAD a4,nb1,t04,t04
+
+ addl A,8*SIZE,A # 4m*2k
+ LDDE b0,0(B) # get 3rd 2b
+ LDDE b1,1*SIZE(B)
+
+ VLD a0,0(A) # get 3rd 4a
+ bne KC,$Panel_4x2x2
+
+
+$Rest_4x2x1:
+ LDDE ALPHA,192($sp) # get alpha
+#ifndef TRMMKERNEL
+ blbc KC1,$Write_4x2
+#else
+ blbc TEMP,$Write_4x2
+#endif
+
+ addl A,4*SIZE,A # 4m*1k
+ addl B,2*SIZE,B # 2n*1K
+
+ VMAD a0,b0,t00,t00
+ VMAD a0,b1,t04,t04
+
+
+$Write_4x2:
+
+#ifndef TRMMKERNEL
+ and CO, (VEC_LEN*SIZE-1),$6
+ bne $6,$UnAlign_CO_Access_4x2
+
+$Align_CO_Access_4x2:
+ VLD c00,0(CO) # get 1st column of 4c
+ VMAD t00,ALPHA,c00,t00
+ VST t00,0(CO)
+ jmp $Access_C1_4x2
+
+$UnAlign_CO_Access_4x2:
+ VLD_UL c00, 0*VEC_LEN*SIZE(CO)
+ VLD_UH c01, 1*VEC_LEN*SIZE(CO)
+
+ vbisw c00,c01,c00
+
+ VMAD t00,ALPHA,c00,t00
+
+ VST_UL t00, 0*VEC_LEN*SIZE(CO)
+ VST_UH t00, 1*VEC_LEN*SIZE(CO)
+
+
+$Access_C1_4x2:
+ and C1, (VEC_LEN*SIZE-1),$6
+ addl CO,4*SIZE,CO # 4c
+ nop
+ bne $6,$UnAlign_C1_Access_4x2
+
+$Align_C1_Access_4x2:
+ VLD c04,0(C1)
+ VMAD t04,ALPHA,c04,t04
+ VST t04,0(C1)
+ addl C1,4*SIZE,C1
+ jmp .L27
+
+$UnAlign_C1_Access_4x2:
+ VLD_UL c04, 0*VEC_LEN*SIZE(C1)
+ VLD_UH c05, 1*VEC_LEN*SIZE(C1)
+
+ vbisw c04,c05,c04
+
+ VMAD t04,ALPHA,c04,t04
+
+ VST_UL t04, 0*VEC_LEN*SIZE(C1)
+ VST_UH t04, 1*VEC_LEN*SIZE(C1)
+ addl C1,4*SIZE,C1
+
+#else
+
+ and CO, (VEC_LEN*SIZE-1),$6
+ bne $6,$UnAlign_CO_Access_4x2
+
+$Align_CO_Access_4x2:
+ VMUL t00,ALPHA,t00
+ VST t00,0(CO)
+ jmp $Access_C1_4x2
+
+$UnAlign_CO_Access_4x2:
+ VMUL t00,ALPHA,t00
+ VST_UL t00, 0*VEC_LEN*SIZE(CO)
+ VST_UH t00, 1*VEC_LEN*SIZE(CO)
+
+
+$Access_C1_4x2:
+ and C1, (VEC_LEN*SIZE-1),$6
+ addl CO,4*SIZE,CO # 4c
+ nop
+ bne $6,$UnAlign_C1_Access_4x2
+
+$Align_C1_Access_4x2:
+ VMUL t04,ALPHA,t04
+ VST t04,0(C1)
+ addl C1,4*SIZE,C1
+ jmp $TRMMKERNEL_4x2
+
+$UnAlign_C1_Access_4x2:
+ VMUL t04,ALPHA,t04
+ VST_UL t04, 0*VEC_LEN*SIZE(C1)
+ VST_UH t04, 1*VEC_LEN*SIZE(C1)
+ addl C1,4*SIZE,C1
+
+$TRMMKERNEL_4x2:
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ subl KC1, KK, TEMP
+#ifdef LEFT
+ subl TEMP, 4, TEMP
+#else
+ subl TEMP, 2, TEMP
+#endif
+
+ sll TEMP, 2 + BASE_SHIFT,KC
+ sll TEMP, 1 + BASE_SHIFT,TEMP
+
+ addl A, KC, A
+ addl B, TEMP, B
+#endif
+
+#ifdef LEFT
+ addl KK, 4, KK
+ nop
+#endif
+#endif
+
+
+
+ .align 5
+
+.L27: # nr=2,mr=2--------------
+ and MC1,2,MC
+ beq MC,.L28
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov B1, B
+#else
+ sll KK, 1 + BASE_SHIFT,KC # mr=nr=2
+ nop
+ addl A,KC,A
+ addl B1,KC,B
+#endif
+
+ fclr t00 # clear 4 registers
+ fclr t01
+ fclr t04
+ fclr t05
+
+ LD b0,0(B) # get 2b
+ LD b1,1*SIZE(B)
+
+ LD a0,0(A) # get 2a
+ LD a4,1*SIZE(A)
+
+ fillcs 0(CO) # fetch C
+ fillcs 0(C1)
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ subl KC1, KK, TEMP
+#else
+ addl KK, 2, TEMP # mr=nr=2
+#endif
+ sra TEMP,1, KC
+ nop
+ nop
+ beq KC,$Rest_2x2x1
+
+#else
+
+ mov B1,B # Reset B
+ sra KC1,1,KC # Unroll KC 2, KC=KC1/2
+ fclr t00 # clear 4 registers
+ fclr t01
+ fclr t04
+ fclr t05
+
+ LD b0,0(B) # get 2b
+ LD b1,1*SIZE(B)
+
+ LD a0,0(A) # get 2a
+ LD a4,1*SIZE(A)
+
+ fillcs 0(CO) # fetch C
+ fillcs 0(C1)
+ beq KC,$Rest_2x2x1
+
+#endif
+
+
+$Panel_2x2x2:
+ MAD a0,b0,t00,t00
+ MAD a0,b1,t04,t04
+
+ LD nb0,2*SIZE(B) # get next 2b
+ LD nb1,3*SIZE(B)
+
+ MAD a4,b0,t01,t01
+ MAD a4,b1,t05,t05
+
+ addl B,4*SIZE,B # 2(n)*2(k)
+ LD a8,2*SIZE(A) # get next 2a
+ LD a12,3*SIZE(A)
+
+ subl KC,1,KC
+ MAD a8,nb0,t00,t00
+ MAD a8,nb1,t04,t04
+
+ addl A,4*SIZE,A # 2m*2k
+ LD b0,0(B)
+ LD b1,1*SIZE(B)
+
+ MAD a12,nb0,t01,t01
+ MAD a12,nb1,t05,t05
+
+ LD a0,0(A)
+ LD a4,1*SIZE(A)
+ bne KC,$Panel_2x2x2
+
+
+$Rest_2x2x1:
+ LD ALPHA,192($sp) # Get ALPHA
+#ifndef TRMMKERNEL
+ blbc KC1,$Write_2x2
+#else
+ blbc TEMP,$Write_2x2
+#endif
+
+ addl A,2*SIZE,A # 2m*1k
+ addl B,2*SIZE,B # 2n*1K
+
+ MAD a0,b0,t00,t00
+ MAD a0,b1,t04,t04
+ MAD a4,b0,t01,t01
+ MAD a4,b1,t05,t05
+
+
+$Write_2x2:
+
+#ifndef TRMMKERNEL
+ LD c00,0(CO)
+ LD c04,0(C1)
+ LD c01,1*SIZE(CO)
+ LD c05,1*SIZE(C1)
+
+ MAD t00,ALPHA,c00,t00
+ MAD t04,ALPHA,c04,t04
+ MAD t01,ALPHA,c01,t01
+ MAD t05,ALPHA,c05,t05
+
+ ST t00,0(CO)
+ ST t04,0(C1)
+ ST t01,1*SIZE(CO)
+ ST t05,1*SIZE(C1)
+
+ addl CO,2*SIZE,CO # 2c
+ addl C1,2*SIZE,C1
+
+#else
+
+ MUL t00,ALPHA,t00
+ MUL t04,ALPHA,t04
+ MUL t01,ALPHA,t01
+ MUL t05,ALPHA,t05
+
+ ST t00,0(CO)
+ ST t04,0(C1)
+ ST t01,1*SIZE(CO)
+ ST t05,1*SIZE(C1)
+
+ addl CO,2*SIZE,CO # 2c
+ addl C1,2*SIZE,C1
+
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ subl KC1, KK, TEMP
+ subl TEMP, 2, TEMP
+
+ sll TEMP, 1 + BASE_SHIFT, KC
+ nop
+
+ addl A,KC, A
+ addl B,KC, B
+#endif
+
+#ifdef LEFT
+ addl KK, 2, KK
+#endif
+#endif
+
+
+
+ .align 5
+.L28:
+ and MC1,1,MC # nr=2,mr=1-------------------
+ beq MC,$End_NC_Unroll2
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov B1, B
+#else
+ sll KK, BASE_SHIFT,KC # mr=1
+ sll KK, 1 + BASE_SHIFT,TEMP # nr=2
+
+ addl A,KC,A
+ addl B1,TEMP,B
+#endif
+
+ fclr t00 # clear 2 registers
+ fclr t04
+
+ LD b0,0(B) # 2b
+ LD b1,1*SIZE(B)
+
+ LD a0,0(A) # 1a
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ subl KC1, KK, TEMP
+#elif defined(LEFT)
+ addl KK, 1, TEMP
+#else
+ addl KK, 2, TEMP
+#endif
+ sra TEMP,1,KC
+ nop
+ beq KC,$Rest_1x2x1
+
+#else
+ mov B1,B # Reset B
+ sra KC1,1,KC # Unroll KC 2, KC=KC1/2
+ fclr t00 # clear 2 registers
+ fclr t04
+
+ LD b0,0(B) # 2b
+ LD b1,1*SIZE(B)
+
+ LD a0,0(A) # 1a
+ beq KC,$Rest_1x2x1
+#endif
+
+
+ .align 5
+
+$Panel_1x2x2:
+ MAD a0,b0,t00,t00
+ MAD a0,b1,t04,t04
+
+ LD nb0,2*SIZE(B) # get next 2b
+ LD nb1,3*SIZE(B)
+
+ addl B,4*SIZE,B # 2(n)*2(k)
+ LD a8,1*SIZE(A) # get next 1a
+
+ subl KC,1,KC
+ MAD a8,nb0,t00,t00
+ MAD a8,nb1,t04,t04
+
+ addl A,2*SIZE,A # 1m*2k
+ LD b0,0(B) # get 3rd 2b
+ LD b1,1*SIZE(B)
+
+ LD a0,0(A) # get 3rd 1a
+ bne KC,$Panel_1x2x2
+
+
+$Rest_1x2x1:
+ LD ALPHA,192($sp) # Get ALPHA
+#ifndef TRMMKERNEL
+ blbc KC1,$Write_1x2
+#else
+ blbc TEMP,$Write_1x2
+#endif
+
+ addl A,1*SIZE,A # 1m*1k
+ addl B,2*SIZE,B # 2n*1K
+
+ MAD a0,b0,t00,t00
+ MAD a0,b1,t04,t04
+
+
+$Write_1x2: # Write back 2 results
+#ifndef TRMMKERNEL
+ LD c00,0(CO)
+ LD c04,0(C1)
+
+ MAD t00,ALPHA,c00,t00
+ MAD t04,ALPHA,c04,t04
+
+ ST t00,0(CO)
+ ST t04,0(C1)
+
+#else
+
+ MUL t00,ALPHA,t00
+ MUL t04,ALPHA,t04
+
+ ST t00,0(CO)
+ ST t04,0(C1)
+
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ subl KC1, KK, TEMP
+#ifdef LEFT
+ subl TEMP, 1,TEMP
+#else
+ subl TEMP, 2,TEMP
+#endif
+
+ sll TEMP, BASE_SHIFT,KC
+ sll TEMP, 1 + BASE_SHIFT,TEMP
+
+ addl A,KC,A
+ addl B,TEMP,B
+#endif
+
+#ifdef LEFT
+ addl KK,1,KK
+#endif
+#endif
+
+
+ .align 5
+
+$End_NC_Unroll2:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addl KK, 2,KK
+#endif
+ mov B, B1
+
+
+ .align 5
+$Begin_NC_Unroll1: # Nr=1
+ and NC1,1,NC # NC=NC1&1
+ beq NC,$Kernel_End
+
+ mov A1,A # Reset A
+ mov C,CO # Reset C
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mov OFFSET,KK # reset offset
+#endif
+
+ sll KC1,4+BASE_SHIFT,SPANA # SPANA=KC1*MC
+ subl PREA,16*SIZE,PREA
+
+ sra MC1,4,MC # MC=MC1/16
+ beq MC,.L35 # MC=0:MC1<16
+
+
+.L3: # nr=1,mr=16
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov B1,B
+#else
+ sll KK, 4 + BASE_SHIFT, KC # mr=16
+ sll KK, BASE_SHIFT,TEMP # nr=1
+
+ addl A,KC,A
+ addl B1,TEMP,B
+#endif
+
+ vcpys $f31,$f31,t00 # clear 4 vector registers (16 results)
+ vcpys $f31,$f31,t01
+ vcpys $f31,$f31,t02
+ vcpys $f31,$f31,t03
+
+ LDDE b0,0(B) # get 1b and 16a
+
+ VLD a0,0(A)
+ VLD a4,4*SIZE(A)
+ VLD a8,8*SIZE(A)
+ VLD a12,12*SIZE(A)
+
+ fillcs 0(CO) # fetch C
+ fillcs 4*SIZE(CO)
+ fillcs 8*SIZE(CO)
+ fillcs 12*SIZE(CO)
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ subl KC1, KK, TEMP
+#elif defined(LEFT)
+ addl KK, 16, TEMP
+#else
+ addl KK, 1, TEMP
+#endif
+ sra TEMP, 1, KC
+ beq KC,$Rest_16x1x1
+
+#else
+
+ mov B1,B # Set B
+ sra KC1,1,KC # Unroll KC 2, KC=KC1/2
+ vcpys $f31,$f31,t00 # clear 4 vector registers (16 results)
+ vcpys $f31,$f31,t01
+ vcpys $f31,$f31,t02
+ vcpys $f31,$f31,t03
+
+ LDDE b0,0(B) # get 1b and 16a
+
+ VLD a0,0(A)
+ VLD a4,4*SIZE(A)
+ VLD a8,8*SIZE(A)
+ VLD a12,12*SIZE(A)
+
+ fillcs 0(CO) # fetch C
+ fillcs 4*SIZE(CO)
+ fillcs 8*SIZE(CO)
+ fillcs 12*SIZE(CO)
+
+ beq KC,$Rest_16x1x1
+
+#endif
+
+$Panel_16x1x2:
+ addl A,16*SIZE,A # 16(m)*1(k)
+ LDDE b1,1*SIZE(B) # get next 1b
+
+ VMAD a0,b0,t00,t00
+ VMAD a4,b0,t01,t01
+
+ addl B,2*SIZE,B # 1(n)*2(k)
+ VLD na0,0(A) # get next 16a
+ VLD na4,4*SIZE(A)
+ VLD na8,8*SIZE(A)
+ VLD na12,12*SIZE(A)
+
+ VMAD a8,b0,t02,t02
+ VMAD a12,b0,t03,t03
+
+ subl KC,1,KC
+ addl A,16*SIZE,A # 16m*1k
+ LDDE b0,0(B)
+
+ VMAD na0,b1,t00,t00
+ VMAD na4,b1,t01,t01
+
+ VLD a0,0(A)
+ VLD a4,4*SIZE(A)
+ VLD a8,8*SIZE(A)
+ VLD a12,12*SIZE(A)
+
+ VMAD na8,b1,t02,t02
+ VMAD na12,b1,t03,t03
+ bne KC,$Panel_16x1x2
+
+
+$Rest_16x1x1:
+ LDDE ALPHA,192($sp)
+#ifndef TRMMKERNEL
+ blbc KC1,$Write_16x1 # If(KC1[0]==0) goto $Write_16x1
+#else
+ blbc TEMP,$Write_16x1 # If(KC1[0]==0) goto $Write_16x1
+#endif
+
+ addl A,16*SIZE,A # 16a*1k
+ addl B,1*SIZE,B # 1b*1k
+
+ VMAD a0,b0,t00,t00
+ VMAD a4,b0,t01,t01
+ VMAD a8,b0,t02,t02
+ VMAD a12,b0,t03,t03
+
+
+$Write_16x1:
+
+#ifndef TRMMKERNEL
+ and CO, (VEC_LEN*SIZE-1),$6
+ bne $6,$UnAlign_CO_Access_16x1
+
+$Align_CO_Access_16x1:
+ VLD c00,0(CO) # get 1st column of 16c
+ VLD c01,4*SIZE(CO)
+ VLD c02,8*SIZE(CO)
+ VLD c03,12*SIZE(CO)
+
+ VMAD t00,ALPHA,c00,t00
+ VMAD t01,ALPHA,c01,t01
+ VMAD t02,ALPHA,c02,t02
+ VMAD t03,ALPHA,c03,t03
+
+ VST t00,0(CO)
+ VST t01,4*SIZE(CO)
+ VST t02,8*SIZE(CO)
+ VST t03,12*SIZE(CO)
+ jmp $Kernel_End
+
+$UnAlign_CO_Access_16x1:
+ VLD_UL c00, 0*VEC_LEN*SIZE(CO)
+ VLD_UH c04, 1*VEC_LEN*SIZE(CO)
+
+ VLD_UL c01, 1*VEC_LEN*SIZE(CO)
+ VLD_UH c05, 2*VEC_LEN*SIZE(CO)
+
+ VLD_UL c02, 2*VEC_LEN*SIZE(CO)
+ VLD_UH c06, 3*VEC_LEN*SIZE(CO)
+
+ VLD_UL c03, 3*VEC_LEN*SIZE(CO)
+ VLD_UH c07, 4*VEC_LEN*SIZE(CO)
+
+ vbisw c00,c04,c00
+ vbisw c01,c05,c01
+ vbisw c02,c06,c02
+ vbisw c03,c07,c03
+
+ VMAD t00,ALPHA,c00,t00
+ VMAD t01,ALPHA,c01,t01
+ VMAD t02,ALPHA,c02,t02
+ VMAD t03,ALPHA,c03,t03
+
+ VST_UL t00, 0*VEC_LEN*SIZE(CO)
+ VST_UH t00, 1*VEC_LEN*SIZE(CO)
+
+ VST_UL t01, 1*VEC_LEN*SIZE(CO)
+ VST_UH t01, 2*VEC_LEN*SIZE(CO)
+
+ VST_UL t02, 2*VEC_LEN*SIZE(CO)
+ VST_UH t02, 3*VEC_LEN*SIZE(CO)
+
+ VST_UL t03, 3*VEC_LEN*SIZE(CO)
+ VST_UH t03, 4*VEC_LEN*SIZE(CO)
+ jmp $Kernel_End
+
+#else
+
+ and CO, (VEC_LEN*SIZE-1),$6
+ bne $6,$UnAlign_CO_Access_16x1
+
+$Align_CO_Access_16x1:
+ VMUL t00,ALPHA,t00
+ VMUL t01,ALPHA,t01
+ VMUL t02,ALPHA,t02
+ VMUL t03,ALPHA,t03
+
+ VST t00,0(CO)
+ VST t01,4*SIZE(CO)
+ VST t02,8*SIZE(CO)
+ VST t03,12*SIZE(CO)
+ jmp $TRMMKERNEL_16x1
+
+$UnAlign_CO_Access_16x1:
+ VMUL t00,ALPHA,t00
+ VMUL t01,ALPHA,t01
+ VMUL t02,ALPHA,t02
+ VMUL t03,ALPHA,t03
+
+ VST_UL t00, 0*VEC_LEN*SIZE(CO)
+ VST_UH t00, 1*VEC_LEN*SIZE(CO)
+
+ VST_UL t01, 1*VEC_LEN*SIZE(CO)
+ VST_UH t01, 2*VEC_LEN*SIZE(CO)
+
+ VST_UL t02, 2*VEC_LEN*SIZE(CO)
+ VST_UH t02, 3*VEC_LEN*SIZE(CO)
+
+ VST_UL t03, 3*VEC_LEN*SIZE(CO)
+ VST_UH t03, 4*VEC_LEN*SIZE(CO)
+
+$TRMMKERNEL_16x1:
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ subl KC1, KK, TEMP
+#ifdef LEFT
+ subl TEMP, 16, TEMP
+#else
+ subl TEMP, 1,TEMP
+#endif
+
+ sll TEMP, 4 + BASE_SHIFT,KC
+ sll TEMP, BASE_SHIFT, TEMP
+
+ addl A,KC,A
+ addl B,TEMP,B
+#endif
+
+#ifdef LEFT
+ addl KK, 16, KK
+ nop
+#endif
+
+ jmp $Kernel_End
+#endif
+
+
+
+ .align 5
+.L35: # nr=1,mr=8------------------
+ and MC1,8,MC
+ sll KC1,3+BASE_SHIFT,SPANA # spana=kc1*mc
+ nop
+ beq MC,.L36 # MC1<8
+
+ addl A1,SPANA,PREA
+ subl PREA,8*SIZE,PREA # PREA-=MC
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov B1, B
+#else
+ sll KK, 3 + BASE_SHIFT,KC # mr=8
+ sll KK, BASE_SHIFT,TEMP # nr=1
+
+ addl A,KC, A
+ addl B1,TEMP,B
+#endif
+
+ vcpys $f31,$f31,t00 # clear 2 vector registers (8 results)
+ vcpys $f31,$f31,t01
+
+ LDDE b0,0(B) # get 1b
+
+ VLD a0,0(A) # get 8a
+ VLD a4,4*SIZE(A)
+
+ fillcs 0(CO) # fetch C
+ fillcs 4*SIZE(CO)
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ subl KC1, KK,TEMP
+#elif defined(LEFT)
+ addl KK, 8,TEMP
+#else
+ addl KK, 1,TEMP
+#endif
+ sra TEMP,1,KC
+ nop
+ beq KC,$Rest_8x1x1
+
+#else
+
+ mov B1, B
+ sra KC1,1,KC
+ vcpys $f31,$f31,t00 # clear 2 vector registers (8 results)
+ vcpys $f31,$f31,t01
+
+ LDDE b0,0(B) # get 1b
+
+ VLD a0,0(A) # get 8a
+ VLD a4,4*SIZE(A)
+
+ fillcs 0(CO) # fetch C
+ fillcs 4*SIZE(CO)
+ beq KC,$Rest_8x1x1
+
+#endif
+
+
+$Panel_8x1x2:
+ VMAD a0,b0,t00,t00
+ VMAD a4,b0,t01,t01
+
+ LDDE nb0,1*SIZE(B) # get next 1b
+
+ addl B,2*SIZE,B # 1(n)*2k
+ VLD na8,8*SIZE(A) # get next 8a
+ VLD na12,12*SIZE(A)
+
+ fillcs 0(PREA)
+ subl PREA,8*SIZE,PREA
+
+ subl KC,1,KC
+ VMAD na8,nb0,t00,t00
+ VMAD na12,nb0,t01,t01
+
+ addl A,16*SIZE,A # 8m*2k
+ LDDE b0,0(B) # get 3rd 1b
+
+ VLD a0,0(A) # get 3rd 8a
+ VLD a4,4*SIZE(A)
+
+ fillcs 0(PREA)
+ subl PREA,8*SIZE,PREA
+ bne KC,$Panel_8x1x2
+
+
+$Rest_8x1x1:
+ LDDE ALPHA,192($sp) # Get ALPHA
+#ifndef TRMMKERNEL
+ blbc KC1,$Write_8x1
+#else
+ blbc TEMP,$Write_8x1
+#endif
+
+ addl A,8*SIZE,A # 8m*1k
+ addl B,1*SIZE,B # 1n*1k
+
+ VMAD a0,b0,t00,t00
+ VMAD a4,b0,t01,t01
+
+
+$Write_8x1:
+
+#ifndef TRMMKERNEL
+ and CO, (VEC_LEN*SIZE-1),$6
+ bne $6,$UnAlign_CO_Access_8x1
+
+$Align_CO_Access_8x1:
+ VLD c00,0(CO) # get 1st column of 8c
+ VLD c01,4*SIZE(CO)
+
+ VMAD t00,ALPHA,c00,t00
+ VMAD t01,ALPHA,c01,t01
+
+ VST t00,0(CO)
+ VST t01,4*SIZE(CO)
+ addl CO,8*SIZE,CO # 8c
+ jmp .L36
+
+$UnAlign_CO_Access_8x1:
+ VLD_UL c00, 0*VEC_LEN*SIZE(CO)
+ VLD_UH c02, 1*VEC_LEN*SIZE(CO)
+
+ VLD_UL c01, 1*VEC_LEN*SIZE(CO)
+ VLD_UH c03, 2*VEC_LEN*SIZE(CO)
+
+ vbisw c00,c02,c00
+ vbisw c01,c03,c01
+
+ VMAD t00,ALPHA,c00,t00
+ VMAD t01,ALPHA,c01,t01
+
+ VST_UL t00, 0*VEC_LEN*SIZE(CO)
+ VST_UH t00, 1*VEC_LEN*SIZE(CO)
+
+ VST_UL t01, 1*VEC_LEN*SIZE(CO)
+ VST_UH t01, 2*VEC_LEN*SIZE(CO)
+ addl CO,8*SIZE,CO # 8c
+
+#else
+
+ and CO, (VEC_LEN*SIZE-1),$6
+ bne $6,$UnAlign_CO_Access_8x1
+
+$Align_CO_Access_8x1:
+ VMUL t00,ALPHA,t00
+ VMUL t01,ALPHA,t01
+
+ VST t00,0(CO)
+ VST t01,4*SIZE(CO)
+ jmp $TRMMKERNEL_8x1
+
+$UnAlign_CO_Access_8x1:
+ VMUL t00,ALPHA,t00
+ VMUL t01,ALPHA,t01
+
+ VST_UL t00, 0*VEC_LEN*SIZE(CO)
+ VST_UH t00, 1*VEC_LEN*SIZE(CO)
+
+ VST_UL t01, 1*VEC_LEN*SIZE(CO)
+ VST_UH t01, 2*VEC_LEN*SIZE(CO)
+
+$TRMMKERNEL_8x1:
+ addl CO,8*SIZE,CO # 8c
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ subl KC1, KK, TEMP
+#ifdef LEFT
+ subl TEMP, 8, TEMP
+#else
+ subl TEMP, 1, TEMP
+#endif
+
+ sll TEMP, 3 + BASE_SHIFT, KC
+ sll TEMP, BASE_SHIFT,TEMP
+
+ addl A,KC, A
+ addl B,TEMP,B
+#endif
+
+#ifdef LEFT
+ addl KK,8, KK
+#endif
+#endif
+
+
+
+ .align 5
+.L36: # nr=1,mr=4---------------
+ and MC1,4,MC # MC1&4
+ beq MC,.L37
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA))\
+ || (!defined(LEFT) && !defined(TRANSA))
+ mov B1, B
+#else
+ sll KK, 2 + BASE_SHIFT, KC # mr=4
+ sll KK, BASE_SHIFT, TEMP # nr=1
+
+ addl A,KC,A
+ addl B1,TEMP,B
+#endif
+
+ vcpys $f31,$f31,t00 # clear 1 vector register (4 results)
+
+ LDDE b0,0(B)
+ VLD a0,0(A)
+
+ fillcs 0(CO) # fetch C
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ subl KC1, KK, TEMP
+#elif defined(LEFT)
+ addl KK, 4, TEMP # mr=4
+#else
+ addl KK, 1, TEMP # nr=1
+#endif
+ sra TEMP,1, KC
+ beq KC,$Rest_4x1x1
+
+#else
+
+ mov B1,B # Reset B
+ sra KC1,1,KC # Unroll KC 2, KC=KC1/2
+ vcpys $f31,$f31,t00 # clear 1 vector register (4 results)
+
+ LDDE b0,0(B)
+ VLD a0,0(A)
+
+ fillcs 0(CO) # fetch C
+ beq KC,$Rest_4x1x1
+#endif
+
+
+$Panel_4x1x2:
+ VMAD a0,b0,t00,t00
+
+ LDDE nb0,1*SIZE(B)
+ VLD a4,4*SIZE(A)
+ addl B,2*SIZE,B # 1(n)*2(k)*8Byte
+
+ subl KC,1,KC
+ VMAD a4,nb0,t00,t00
+
+ addl A,8*SIZE,A # 4m*2k
+ LDDE b0,0(B)
+ VLD a0,0(A)
+
+ bne KC,$Panel_4x1x2
+
+
+$Rest_4x1x1:
+ LDDE ALPHA,192($sp) # Get ALPHA
+#ifndef TRMMKERNEL
+ blbc KC1,$Write_4x1
+#else
+ blbc TEMP,$Write_4x1
+#endif
+
+ addl A,4*SIZE,A # 4m*1k
+ addl B,1*SIZE,B # 1n*1K
+
+ VMAD a0,b0,t00,t00
+
+
+$Write_4x1: # Write back 4 results
+
+#ifndef TRMMKERNEL
+ and CO, (VEC_LEN*SIZE-1),$6
+ bne $6,$UnAlign_CO_Access_4x1
+
+$Align_CO_Access_4x1:
+ VLD c00,0(CO) # get 1st column of 4c
+ VMAD t00,ALPHA,c00,t00
+ VST t00,0(CO)
+ addl CO,4*SIZE,CO # 4c
+ jmp .L37
+
+$UnAlign_CO_Access_4x1:
+ VLD_UL c00, 0*VEC_LEN*SIZE(CO)
+ VLD_UH c01, 1*VEC_LEN*SIZE(CO)
+
+ vbisw c00,c01,c00
+
+ VMAD t00,ALPHA,c00,t00
+
+ VST_UL t00, 0*VEC_LEN*SIZE(CO)
+ VST_UH t00, 1*VEC_LEN*SIZE(CO)
+ addl CO,4*SIZE,CO # 4c
+
+
+#else
+ and CO, (VEC_LEN*SIZE-1),$6
+ bne $6,$UnAlign_CO_Access_4x1
+
+$Align_CO_Access_4x1:
+ VMUL t00,ALPHA,t00 # careful: c00~c03 use the same register
+ VST t00,0(CO)
+ jmp $TRMMKERNEL_4x1
+
+$UnAlign_CO_Access_4x1:
+ VMUL t00,ALPHA,t00 # careful: c00~c03 use the same register
+
+ VST_UL t00, 0*VEC_LEN*SIZE(CO)
+ VST_UH t00, 1*VEC_LEN*SIZE(CO)
+
+$TRMMKERNEL_4x1:
+ addl CO,4*SIZE,CO # 4c
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ subl KC1, KK, TEMP
+#ifdef LEFT
+ subl TEMP, 4, TEMP # mr=4
+#else
+ subl TEMP, 1, TEMP
+#endif
+
+ sll TEMP, 2 + BASE_SHIFT, KC
+ sll TEMP, BASE_SHIFT, TEMP
+
+ addl A, KC, A
+ addl B, TEMP,B
+#endif
+
+#ifdef LEFT
+ addl KK, 4, KK
+#endif
+#endif
+
+
+
+
+ .align 5
+.L37: # nr=1,mr=2-------------------------
+ and MC1,2,MC
+ beq MC,.L38
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov B1, B
+#else
+ sll KK, 1 + BASE_SHIFT,KC # mr=2
+ sll KK, BASE_SHIFT, TEMP # nr=1
+
+ addl A,KC, A
+ addl B1,TEMP,B
+#endif
+
+ fclr t00 # clear 2 registers
+ fclr t01
+
+ LD b0,0(B)
+
+ LD a0,0(A)
+ LD a4,1*SIZE(A)
+
+ fillcs 0(CO) # fetch C
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ subl KC1, KK, TEMP
+#elif defined(LEFT)
+ addl KK, 2,TEMP
+#else
+ addl KK, 1,TEMP
+#endif
+ sra TEMP,1,KC
+ beq KC,.L373
+
+#else
+
+ mov B1,B # Reset B
+ sra KC1,1,KC # Unroll KC 2, KC=KC1/2
+ fclr t00 # clear 2 registers
+ fclr t01
+
+ LD b0,0(B)
+
+ LD a0,0(A)
+ LD a4,1*SIZE(A)
+
+ fillcs 0(CO) # fetch C
+ beq KC,.L373
+
+#endif
+
+.L371:
+ MAD a0,b0,t00,t00
+ MAD a4,b0,t01,t01
+
+ LD nb0,1*SIZE(B)
+
+ addl B,2*SIZE,B # 1(n)*2(k)
+ LD a8,2*SIZE(A)
+ LD a12,3*SIZE(A)
+
+ subl KC,1,KC
+ MAD a8,nb0,t00,t00
+ MAD a12,nb0,t01,t01
+
+ addl A,4*SIZE,A # 2m*2k
+ LD b0,0(B)
+
+ LD a0,0(A)
+ LD a4,1*SIZE(A)
+ bne KC,.L371
+
+.L373:
+ LD ALPHA,192($sp) # Get ALPHA
+#ifndef TRMMKERNEL
+ blbc KC1,.L374
+#else
+ blbc TEMP,.L374
+#endif
+
+ addl A,2*SIZE,A # 2m*1k*8Byte
+ addl B,1*SIZE,B # 1n*1K*8Byte
+
+ MAD a0,b0,t00,t00
+ MAD a4,b0,t01,t01
+
+.L374: # Write back 2 results
+
+#ifndef TRMMKERNEL
+ LD c00,0(CO)
+ LD c01,1*SIZE(CO)
+
+ MAD t00,ALPHA,c00,t00
+ MAD t01,ALPHA,c01,t01
+
+ ST t00,0(CO)
+ ST t01,1*SIZE(CO)
+ addl CO,2*SIZE,CO # 2c
+
+#else
+
+ MUL t00,ALPHA,t00
+ MUL t01,ALPHA,t01
+
+ ST t00,0(CO)
+ ST t01,1*SIZE(CO)
+
+ addl CO,2*SIZE,CO # 2c
+
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ subl KC1, KK, TEMP
+#ifdef LEFT
+ subl TEMP, 2, TEMP
+#else
+ subl TEMP, 1, TEMP
+#endif
+
+ sll TEMP, 1 + BASE_SHIFT,KC
+ sll TEMP, BASE_SHIFT,TEMP
+
+ addl A,KC,A
+ addl B,TEMP,B
+#endif
+
+#ifdef LEFT
+ addl KK, 2, KK
+#endif
+#endif
+
+
+
+ .align 5
+.L38:
+ and MC1,1,MC
+ beq MC,$Kernel_End
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov B1, B
+#else
+ sll KK, BASE_SHIFT,KC # mr=nr=1
+ nop
+
+ addl A,KC,A
+ addl B1,KC,B
+#endif
+
+ fclr t00 # clear result register
+
+ LD b0,0(B)
+ LD a0,0(A) # get 1a and 1b
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ subl KC1, KK, TEMP
+#else
+ addl KK, 1, TEMP # mr=nr=1
+#endif
+ sra TEMP,1,KC
+ nop
+ beq KC,.L383
+
+#else
+
+ mov B1,B # Reset B
+ sra KC1,1,KC # Unroll KC 2, KC=KC1/2
+ fclr t00 # clear result register
+
+ LD b0,0(B)
+ LD a0,0(A) # get 1a and 1b
+
+ beq KC,.L383
+#endif
+
+.L381:
+ MAD a0,b0,t00,t00
+ LD nb0,1*SIZE(B)
+
+ addl B,2*SIZE,B # 1n*2k
+ LD a8,1*SIZE(A)
+
+
+ subl KC,1,KC
+ MAD a8,nb0,t00,t00
+
+ addl A,2*SIZE,A # 1m*2k
+ LD b0,0(B)
+
+ LD a0,0(A)
+ bne KC,.L381
+
+
+.L383:
+ LD ALPHA,192($sp) # get alpha
+#ifndef TRMMKERNEL
+ blbc KC1,.L384
+#else
+ blbc TEMP,.L384
+#endif
+
+ addl A,1*SIZE,A # 1m*1k
+ addl B,1*SIZE,B # 1n*1K
+
+ MAD a0,b0,t00,t00
+
+
+.L384: # Write back 1 results
+
+#ifndef TRMMKERNEL
+ LD c00,0(CO)
+ MAD t00,ALPHA,c00,t00
+ ST t00,0(CO)
+
+#else
+ MUL t00,ALPHA,t00
+ ST t00,0(CO)
+#endif
+
+
+
+$Kernel_End:
+ ldl $9,328($sp) # restore integer saved registers
+ ldl $10,320($sp)
+ ldl $11,312($sp)
+ ldl $12,304($sp)
+ ldl $13,296($sp)
+ ldl $14,288($sp)
+# restore floating-point saved registers
+ LD $f2,280($sp)
+ LD $f3,272($sp)
+ LD $f4,264($sp)
+ LD $f5,256($sp)
+ LD $f6,248($sp)
+ LD $f7,240($sp)
+ LD $f8,232($sp)
+ LD $f9,224($sp)
+
+ ldi $sp,STACKSIZE($sp) # restore stack pointer
+ ret $31,($26),1 # return
+
+ EPILOGUE
+
+
diff --git a/kernel/sw_64/gemv_n.S b/kernel/sw_64/gemv_n.S
new file mode 100644
index 0000000..90284db
--- /dev/null
+++ b/kernel/sw_64/gemv_n.S
@@ -0,0 +1,1647 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define STACKSIZE 72
+#define PREFETCHSIZE 32
+
+#define M $16
+#define N $17
+#define A $20
+#define LDA $21
+
+#define X $18
+#define INCX $19
+#define Y $22
+#define INCY $23
+
+#define BUFFER $24
+
+#define I $25
+#define J $27
+
+#define Y1 $4
+
+#define A1 $5
+#define A2 $6
+#define A3 $7
+#define A4 $8
+
+#define alpha $f19
+
+#define alpha1 $f0
+#define alpha2 $f1
+#define alpha3 $f10
+#define alpha4 $f11
+
+#define y0 $f12
+#define y1 $f13
+#define y2 $f14
+#define y3 $f15
+
+#define y4 $f16
+#define y5 $f17
+#define y6 $f18
+#define y7 $f21
+
+#define a0 $f22
+#define a1 $f23
+#define a2 $f24
+#define a3 $f25
+#define a4 $f26
+#define a5 $f27
+#define a6 $f28
+#define a7 $f29
+
+#define a8 $f2
+#define a9 $f3
+#define a10 $f4
+#define a11 $f5
+#define a12 $f6
+#define a13 $f7
+#define a14 $f8
+#define a15 $f9
+
+#define tmp $f20
+ PROLOGUE
+
+ ldi $sp, -STACKSIZE($sp)
+ ldl X, 0 + STACKSIZE($sp)
+ ldl INCX, 8 + STACKSIZE($sp)
+ ldl Y, 16 + STACKSIZE($sp)
+ ldl INCY, 24 + STACKSIZE($sp)
+ ldl BUFFER, 32 + STACKSIZE($sp)
+
+ fstd $f2, 0($sp)
+ fstd $f3, 8($sp)
+ fstd $f4, 16($sp)
+ fstd $f5, 24($sp)
+ fstd $f6, 32($sp)
+ fstd $f7, 40($sp)
+ fstd $f8, 48($sp)
+ fstd $f9, 56($sp)
+ fstd tmp, 64($sp)
+ PROFCODE
+
+ cmple M, 0, $0
+ SXADDQ INCX, 0, INCX
+ cmple N, 0, $1
+ SXADDQ INCY, 0, INCY
+
+ or $0, $1, $0
+ bne $0, $L999
+
+ SXADDQ LDA, 0, LDA
+
+ cmpeq INCY, SIZE, $0
+ bne $0, $L10
+
+ mov BUFFER, Y1
+
+ mov Y, BUFFER
+ mov Y1, Y
+
+ sra M, 3, I
+ ble I, $L05
+ .align 4
+
+$L02:
+ ST $f31, 0 * SIZE(Y1)
+ ST $f31, 1 * SIZE(Y1)
+ ST $f31, 2 * SIZE(Y1)
+ ST $f31, 3 * SIZE(Y1)
+ ST $f31, 4 * SIZE(Y1)
+ ST $f31, 5 * SIZE(Y1)
+ ST $f31, 6 * SIZE(Y1)
+ ST $f31, 7 * SIZE(Y1)
+
+ ldi Y1, 8 * SIZE(Y1)
+ ldi I, -1(I)
+ bgt I, $L02
+ .align 4
+
+$L05:
+ and M, 7, I
+ ble I, $L10
+ .align 4
+
+$L06:
+ ST $f31, 0 * SIZE(Y1)
+ addl Y1, SIZE, Y1
+
+ ldi I, -1(I)
+ bgt I, $L06
+ .align 4
+
+$L10:
+ sra N, 2, J
+ ble J, $L20
+ .align 4
+
+$L11:
+ LD alpha1, 0 * SIZE(X)
+ addl X, INCX, X
+ LD alpha2, 0 * SIZE(X)
+ addl X, INCX, X
+ LD alpha3, 0 * SIZE(X)
+ addl X, INCX, X
+ LD alpha4, 0 * SIZE(X)
+ addl X, INCX, X
+
+ MUL alpha, alpha1, tmp
+ fmov tmp, alpha1
+ MUL alpha, alpha2, tmp
+ fmov tmp, alpha2
+ MUL alpha, alpha3, tmp
+ fmov tmp, alpha3
+ MUL alpha, alpha4, tmp
+ fmov tmp, alpha4
+
+ mov A, A1
+ addl A, LDA, A2
+ addl A2, LDA, A3
+ addl A3, LDA, A4
+ s4addl LDA, A, A
+
+ mov Y, Y1
+ ldw $31, 4 * SIZE(X)
+
+ sra M, 3, I
+ ble I, $L15
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+ LD a2, 2 * SIZE(A1)
+ LD a3, 3 * SIZE(A1)
+
+ LD a4, 0 * SIZE(A2)
+ LD a5, 1 * SIZE(A2)
+ LD a6, 2 * SIZE(A2)
+ LD a7, 3 * SIZE(A2)
+
+ LD y0, 0 * SIZE(Y1)
+ LD y1, 1 * SIZE(Y1)
+ LD y2, 2 * SIZE(Y1)
+ LD y3, 3 * SIZE(Y1)
+
+ LD a8, 0 * SIZE(A3)
+ LD a9, 1 * SIZE(A3)
+ LD a10, 2 * SIZE(A3)
+ LD a11, 3 * SIZE(A3)
+
+ LD y4, 4 * SIZE(Y1)
+ LD y5, 5 * SIZE(Y1)
+ LD y6, 6 * SIZE(Y1)
+ LD y7, 7 * SIZE(Y1)
+
+ MUL alpha1, a0, tmp
+ fmov tmp, a0
+ LD a12, 0 * SIZE(A4)
+ MUL alpha1, a1, tmp
+ fmov tmp, a1
+ LD a13, 1 * SIZE(A4)
+ MUL alpha1, a2, tmp
+ fmov tmp, a2
+ LD a14, 2 * SIZE(A4)
+ MUL alpha1, a3, tmp
+ fmov tmp, a3
+ LD a15, 3 * SIZE(A4)
+
+ ADD y0, a0, tmp
+ fmov tmp, y0
+ LD a0, 4 * SIZE(A1)
+ MUL alpha2, a4, tmp
+ fmov tmp, a4
+ unop
+
+ ADD y1, a1, tmp
+ fmov tmp, y1
+ LD a1, 5 * SIZE(A1)
+ MUL alpha2, a5, tmp
+ fmov tmp, a5
+ unop
+
+ ADD y2, a2, tmp
+ fmov tmp, y2
+ LD a2, 6 * SIZE(A1)
+ MUL alpha2, a6, tmp
+ fmov tmp, a6
+ unop
+
+ ADD y3, a3, tmp
+ fmov tmp, y3
+ LD a3, 7 * SIZE(A1)
+ MUL alpha2, a7, tmp
+ fmov tmp, a7
+ unop
+
+ ADD y0, a4, tmp
+ fmov tmp, y0
+ LD a4, 4 * SIZE(A2)
+ MUL alpha3, a8, tmp
+ fmov tmp, a8
+ unop
+
+ ADD y1, a5, tmp
+ fmov tmp, y1
+ LD a5, 5 * SIZE(A2)
+ MUL alpha3, a9, tmp
+ fmov tmp, a9
+ ldi I, -1(I)
+
+ ADD y2, a6, tmp
+ fmov tmp, y2
+ LD a6, 6 * SIZE(A2)
+ MUL alpha3, a10, tmp
+ fmov tmp, a10
+ unop
+
+ ADD y3, a7, tmp
+ fmov tmp, y3
+ LD a7, 7 * SIZE(A2)
+ MUL alpha3, a11, tmp
+ fmov tmp, a11
+ unop
+
+ ADD y0, a8, tmp
+ fmov tmp, y0
+ LD a8, 4 * SIZE(A3)
+ MUL alpha4, a12, tmp
+ fmov tmp, a12
+ ble I, $L13
+ .align 4
+
+$L12:
+ ADD y1, a9, tmp
+ fmov tmp, y1
+ LD a9, 5 * SIZE(A3)
+ MUL alpha4, a13, tmp
+ fmov tmp, a13
+ ldw $31, (PREFETCHSIZE + 0) * SIZE(A1)
+
+ ADD y2, a10, tmp
+ fmov tmp, y2
+ LD a10, 6 * SIZE(A3)
+ MUL alpha4, a14, tmp
+ fmov tmp, a14
+ unop
+
+ ADD y3, a11, tmp
+ fmov tmp, y3
+ LD a11, 7 * SIZE(A3)
+ MUL alpha4, a15, tmp
+ fmov tmp, a15
+ ldi I, -1(I)
+
+ ADD y0, a12, tmp
+ fmov tmp, y0
+ LD a12, 4 * SIZE(A4)
+ MUL alpha1, a0, tmp
+ fmov tmp, a0
+ flds $f31, (PREFETCHSIZE + 0) * SIZE(Y1)
+
+ ADD y1, a13, tmp
+ fmov tmp, y1
+ LD a13, 5 * SIZE(A4)
+ MUL alpha1, a1, tmp
+ fmov tmp, a1
+ unop
+
+ ADD y2, a14, tmp
+ fmov tmp, y2
+ LD a14, 6 * SIZE(A4)
+ MUL alpha1, a2, tmp
+ fmov tmp, a2
+ unop
+
+ ADD y3, a15, tmp
+ fmov tmp, y3
+ LD a15, 7 * SIZE(A4)
+ MUL alpha1, a3, tmp
+ fmov tmp, a3
+ ldw $31, (PREFETCHSIZE + 0) * SIZE(A2)
+
+ ADD y4, a0, tmp
+ fmov tmp, y4
+ ST y0, 0 * SIZE(Y1)
+ MUL alpha2, a4, tmp
+ fmov tmp, a4
+ LD a0, 8 * SIZE(A1)
+
+ ADD y5, a1, tmp
+ fmov tmp, y5
+ ST y1, 1 * SIZE(Y1)
+ MUL alpha2, a5, tmp
+ fmov tmp, a5
+ LD a1, 9 * SIZE(A1)
+
+ ADD y6, a2, tmp
+ fmov tmp, y6
+ ST y2, 2 * SIZE(Y1)
+ MUL alpha2, a6, tmp
+ fmov tmp, a6
+ LD a2, 10 * SIZE(A1)
+
+ ADD y7, a3, tmp
+ fmov tmp, y7
+ ST y3, 3 * SIZE(Y1)
+ MUL alpha2, a7, tmp
+ fmov tmp, a7
+ LD a3, 11 * SIZE(A1)
+
+ ADD y4, a4, tmp
+ fmov tmp, y4
+ LD a4, 8 * SIZE(A2)
+ MUL alpha3, a8, tmp
+ fmov tmp, a8
+ LD y0, 8 * SIZE(Y1)
+
+ ADD y5, a5, tmp
+ fmov tmp, y5
+ LD a5, 9 * SIZE(A2)
+ MUL alpha3, a9, tmp
+ fmov tmp, a9
+ LD y1, 9 * SIZE(Y1)
+
+ ADD y6, a6, tmp
+ fmov tmp, y6
+ LD a6, 10 * SIZE(A2)
+ MUL alpha3, a10, tmp
+ fmov tmp, a10
+ LD y2, 10 * SIZE(Y1)
+
+ ADD y7, a7, tmp
+ fmov tmp, y7
+ LD a7, 11 * SIZE(A2)
+ MUL alpha3, a11, tmp
+ fmov tmp, a11
+ LD y3, 11 * SIZE(Y1)
+
+ ADD y4, a8, tmp
+ fmov tmp, y4
+ LD a8, 8 * SIZE(A3)
+ MUL alpha4, a12, tmp
+ fmov tmp, a12
+ ldw $31, (PREFETCHSIZE + 0) * SIZE(A3)
+
+ ADD y5, a9, tmp
+ fmov tmp, y5
+ LD a9, 9 * SIZE(A3)
+ MUL alpha4, a13, tmp
+ fmov tmp, a13
+ ldi A1, 8 * SIZE(A1)
+
+ ADD y6, a10, tmp
+ fmov tmp, y6
+ LD a10, 10 * SIZE(A3)
+ MUL alpha4, a14, tmp
+ fmov tmp, a14
+ ldi A2, 8 * SIZE(A2)
+
+ ADD y7, a11, tmp
+ fmov tmp, y7
+ LD a11, 11 * SIZE(A3)
+ MUL alpha4, a15, tmp
+ fmov tmp, a15
+ ldi Y1, 8 * SIZE(Y1)
+
+ ADD y4, a12, tmp
+ fmov tmp, y4
+ LD a12, 8 * SIZE(A4)
+ MUL alpha1, a0, tmp
+ fmov tmp, a0
+ unop
+
+ ADD y5, a13, tmp
+ fmov tmp, y5
+ LD a13, 9 * SIZE(A4)
+ MUL alpha1, a1, tmp
+ fmov tmp, a1
+ ldi A3, 8 * SIZE(A3)
+
+ ADD y6, a14, tmp
+ fmov tmp, y6
+ LD a14, 10 * SIZE(A4)
+ MUL alpha1, a2, tmp
+ fmov tmp, a2
+ ldw $31, (PREFETCHSIZE + 0) * SIZE(A4)
+
+ ADD y7, a15, tmp
+ fmov tmp, y7
+ LD a15, 11 * SIZE(A4)
+ MUL alpha1, a3, tmp
+ fmov tmp, a3
+ ldi A4, 8 * SIZE(A4)
+
+ ADD y0, a0, tmp
+ fmov tmp, y0
+ LD a0, 4 * SIZE(A1)
+ MUL alpha2, a4, tmp
+ fmov tmp, a4
+ ST y4, -4 * SIZE(Y1)
+
+ ADD y1, a1, tmp
+ fmov tmp, y1
+ LD a1, 5 * SIZE(A1)
+ MUL alpha2, a5, tmp
+ fmov tmp, a5
+ ST y5, -3 * SIZE(Y1)
+
+ ADD y2, a2, tmp
+ fmov tmp, y2
+ LD a2, 6 * SIZE(A1)
+ MUL alpha2, a6, tmp
+ fmov tmp, a6
+ ST y6, -2 * SIZE(Y1)
+
+ ADD y3, a3, tmp
+ fmov tmp, y3
+ LD a3, 7 * SIZE(A1)
+ MUL alpha2, a7, tmp
+ fmov tmp, a7
+ ST y7, -1 * SIZE(Y1)
+
+ ADD y0, a4, tmp
+ fmov tmp, y0
+ LD a4, 4 * SIZE(A2)
+ MUL alpha3, a8, tmp
+ fmov tmp, a8
+ LD y4, 4 * SIZE(Y1)
+
+ ADD y1, a5, tmp
+ fmov tmp, y1
+ LD a5, 5 * SIZE(A2)
+ MUL alpha3, a9, tmp
+ fmov tmp, a9
+ LD y5, 5 * SIZE(Y1)
+
+ ADD y2, a6, tmp
+ fmov tmp, y2
+ LD a6, 6 * SIZE(A2)
+ MUL alpha3, a10, tmp
+ fmov tmp, a10
+ LD y6, 6 * SIZE(Y1)
+
+ ADD y3, a7, tmp
+ fmov tmp, y3
+ LD a7, 7 * SIZE(A2)
+ MUL alpha3, a11, tmp
+ fmov tmp, a11
+ LD y7, 7 * SIZE(Y1)
+
+ ADD y0, a8, tmp
+ fmov tmp, y0
+ LD a8, 4 * SIZE(A3)
+ MUL alpha4, a12, tmp
+ fmov tmp, a12
+ bgt I, $L12
+ .align 4
+
+$L13:
+ ADD y1, a9, tmp
+ fmov tmp, y1
+ LD a9, 5 * SIZE(A3)
+ MUL alpha4, a13, tmp
+ fmov tmp, a13
+ unop
+
+ ADD y2, a10, tmp
+ fmov tmp, y2
+ LD a10, 6 * SIZE(A3)
+ MUL alpha4, a14, tmp
+ fmov tmp, a14
+ unop
+
+ ADD y3, a11, tmp
+ fmov tmp, y3
+ LD a11, 7 * SIZE(A3)
+ MUL alpha4, a15, tmp
+ fmov tmp, a15
+ unop
+
+ ADD y0, a12, tmp
+ fmov tmp, y0
+ LD a12, 4 * SIZE(A4)
+ MUL alpha1, a0, tmp
+ fmov tmp, a0
+ unop
+
+ ADD y1, a13, tmp
+ fmov tmp, y1
+ LD a13, 5 * SIZE(A4)
+ MUL alpha1, a1, tmp
+ fmov tmp, a1
+ unop
+
+ ADD y2, a14, tmp
+ fmov tmp, y2
+ LD a14, 6 * SIZE(A4)
+ MUL alpha1, a2, tmp
+ fmov tmp, a2
+ unop
+
+ ADD y3, a15, tmp
+ fmov tmp, y3
+ LD a15, 7 * SIZE(A4)
+ MUL alpha1, a3, tmp
+ fmov tmp, a3
+ unop
+
+ ST y0, 0 * SIZE(Y1)
+ ADD y4, a0, tmp
+ fmov tmp, y4
+ unop
+ MUL alpha2, a4, tmp
+ fmov tmp, a4
+
+ ST y1, 1 * SIZE(Y1)
+ ADD y5, a1, tmp
+ fmov tmp, y5
+ unop
+ MUL alpha2, a5, tmp
+ fmov tmp, a5
+
+ ST y2, 2 * SIZE(Y1)
+ ADD y6, a2, tmp
+ fmov tmp, y6
+ unop
+ MUL alpha2, a6, tmp
+ fmov tmp, a6
+
+ ST y3, 3 * SIZE(Y1)
+ ADD y7, a3, tmp
+ fmov tmp, y7
+ ldi Y1, 8 * SIZE(Y1)
+ MUL alpha2, a7, tmp
+ fmov tmp, a7
+
+ ADD y4, a4, tmp
+ fmov tmp, y4
+ MUL alpha3, a8, tmp
+ fmov tmp, a8
+ ADD y5, a5, tmp
+ fmov tmp, y5
+ MUL alpha3, a9, tmp
+ fmov tmp, a9
+ ADD y6, a6, tmp
+ fmov tmp, y6
+ MUL alpha3, a10, tmp
+ fmov tmp, a10
+ ADD y7, a7, tmp
+ fmov tmp, y7
+ MUL alpha3, a11, tmp
+ fmov tmp, a11
+
+ ADD y4, a8, tmp
+ fmov tmp, y4
+ MUL alpha4, a12, tmp
+ fmov tmp, a12
+ ADD y5, a9, tmp
+ fmov tmp, y5
+ MUL alpha4, a13, tmp
+ fmov tmp, a13
+ ADD y6, a10, tmp
+ fmov tmp, y6
+ MUL alpha4, a14, tmp
+ fmov tmp, a14
+ ADD y7, a11, tmp
+ fmov tmp, y7
+ MUL alpha4, a15, tmp
+ fmov tmp, a15
+
+ ADD y4, a12, tmp
+ fmov tmp, y4
+ ADD y5, a13, tmp
+ fmov tmp, y5
+ ADD y6, a14, tmp
+ fmov tmp, y6
+ ADD y7, a15, tmp
+ fmov tmp, y7
+
+ ST y4, -4 * SIZE(Y1)
+ ldi A1, 8 * SIZE(A1)
+ ST y5, -3 * SIZE(Y1)
+ ldi A2, 8 * SIZE(A2)
+ ST y6, -2 * SIZE(Y1)
+ ldi A3, 8 * SIZE(A3)
+ ST y7, -1 * SIZE(Y1)
+ ldi A4, 8 * SIZE(A4)
+ .align 4
+
+$L15:
+ and M, 4, I
+ ble I, $L16
+
+ LD y0, 0 * SIZE(Y1)
+ LD y1, 1 * SIZE(Y1)
+ LD y2, 2 * SIZE(Y1)
+ LD y3, 3 * SIZE(Y1)
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+ LD a2, 2 * SIZE(A1)
+ LD a3, 3 * SIZE(A1)
+
+ LD a4, 0 * SIZE(A2)
+ LD a5, 1 * SIZE(A2)
+ LD a6, 2 * SIZE(A2)
+ LD a7, 3 * SIZE(A2)
+
+ LD a8, 0 * SIZE(A3)
+ LD a9, 1 * SIZE(A3)
+ LD a10, 2 * SIZE(A3)
+ LD a11, 3 * SIZE(A3)
+
+ MUL alpha1, a0, tmp
+ fmov tmp, a0
+ LD a12, 0 * SIZE(A4)
+ MUL alpha1, a1, tmp
+ fmov tmp, a1
+ LD a13, 1 * SIZE(A4)
+ MUL alpha1, a2, tmp
+ fmov tmp, a2
+ LD a14, 2 * SIZE(A4)
+ MUL alpha1, a3, tmp
+ fmov tmp, a3
+ LD a15, 3 * SIZE(A4)
+
+ ADD y0, a0, tmp
+ fmov tmp, y0
+ MUL alpha2, a4, tmp
+ fmov tmp, a4
+ ADD y1, a1, tmp
+ fmov tmp, y1
+ MUL alpha2, a5, tmp
+ fmov tmp, a5
+ ADD y2, a2, tmp
+ fmov tmp, y2
+ MUL alpha2, a6, tmp
+ fmov tmp, a6
+ ADD y3, a3, tmp
+ fmov tmp, y3
+ MUL alpha2, a7, tmp
+ fmov tmp, a7
+
+ ADD y0, a4, tmp
+ fmov tmp, y0
+ MUL alpha3, a8, tmp
+ fmov tmp, a8
+ ADD y1, a5, tmp
+ fmov tmp, y1
+ MUL alpha3, a9, tmp
+ fmov tmp, a9
+ ADD y2, a6, tmp
+ fmov tmp, y2
+ MUL alpha3, a10, tmp
+ fmov tmp, a10
+ ADD y3, a7, tmp
+ fmov tmp, y3
+ MUL alpha3, a11, tmp
+ fmov tmp, a11
+
+ ADD y0, a8, tmp
+ fmov tmp, y0
+ MUL alpha4, a12, tmp
+ fmov tmp, a12
+ ADD y1, a9, tmp
+ fmov tmp, y1
+ MUL alpha4, a13, tmp
+ fmov tmp, a13
+ ADD y2, a10, tmp
+ fmov tmp, y2
+ MUL alpha4, a14, tmp
+ fmov tmp, a14
+ ADD y3, a11, tmp
+ fmov tmp, y3
+ MUL alpha4, a15, tmp
+ fmov tmp, a15
+
+ ADD y0, a12, tmp
+ fmov tmp, y0
+ ldi Y1, 4 * SIZE(Y1)
+ ADD y1, a13, tmp
+ fmov tmp, y1
+ unop
+
+ ADD y2, a14, tmp
+ fmov tmp, y2
+ unop
+ ADD y3, a15, tmp
+ fmov tmp, y3
+ unop
+
+ ST y0, -4 * SIZE(Y1)
+ ldi A1, 4 * SIZE(A1)
+ ST y1, -3 * SIZE(Y1)
+ ldi A2, 4 * SIZE(A2)
+ ST y2, -2 * SIZE(Y1)
+ ldi A3, 4 * SIZE(A3)
+ ST y3, -1 * SIZE(Y1)
+ ldi A4, 4 * SIZE(A4)
+ .align 4
+
+$L16:
+ and M, 2, I
+ ble I, $L17
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+ LD a2, 0 * SIZE(A2)
+ LD a3, 1 * SIZE(A2)
+
+ LD y0, 0 * SIZE(Y1)
+ LD y1, 1 * SIZE(Y1)
+
+ LD a4, 0 * SIZE(A3)
+ MUL alpha1, a0, tmp
+ fmov tmp, a0
+ LD a5, 1 * SIZE(A3)
+ MUL alpha1, a1, tmp
+ fmov tmp, a1
+ LD a6, 0 * SIZE(A4)
+ MUL alpha2, a2, tmp
+ fmov tmp, a2
+ LD a7, 1 * SIZE(A4)
+ MUL alpha2, a3, tmp
+ fmov tmp, a3
+
+ ADD y0, a0, tmp
+ fmov tmp, y0
+ MUL alpha3, a4, tmp
+ fmov tmp, a4
+ ADD y1, a1, tmp
+ fmov tmp, y1
+ MUL alpha3, a5, tmp
+ fmov tmp, a5
+ ADD y0, a2, tmp
+ fmov tmp, y0
+ MUL alpha4, a6, tmp
+ fmov tmp, a6
+ ADD y1, a3, tmp
+ fmov tmp, y1
+ MUL alpha4, a7, tmp
+ fmov tmp, a7
+
+ ADD y0, a4, tmp
+ fmov tmp, y0
+ ldi A1, 2 * SIZE(A1)
+ ADD y1, a5, tmp
+ fmov tmp, y1
+ ldi A2, 2 * SIZE(A2)
+ ADD y0, a6, tmp
+ fmov tmp, y0
+ ldi A3, 2 * SIZE(A3)
+ ADD y1, a7, tmp
+ fmov tmp, y1
+ ldi A4, 2 * SIZE(A4)
+
+ ST y0, 0 * SIZE(Y1)
+ unop
+ ST y1, 1 * SIZE(Y1)
+ ldi Y1, 2 * SIZE(Y1)
+ .align 4
+
+$L17:
+ blbc M, $L18
+
+ LD y0, 0 * SIZE(Y1)
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 0 * SIZE(A2)
+ LD a2, 0 * SIZE(A3)
+ LD a3, 0 * SIZE(A4)
+
+ MUL alpha1, a0, tmp
+ fmov tmp, a0
+ MUL alpha2, a1, tmp
+ fmov tmp, a1
+ MUL alpha3, a2, tmp
+ fmov tmp, a2
+ MUL alpha4, a3, tmp
+ fmov tmp, a3
+
+ ADD y0, a0, tmp
+ fmov tmp, y0
+ ADD y0, a1, tmp
+ fmov tmp, y0
+ ADD y0, a2, tmp
+ fmov tmp, y0
+ ADD y0, a3, tmp
+ fmov tmp, y0
+
+ ST y0, 0 * SIZE(Y1)
+ .align 4
+
+$L18:
+ ldi J, -1(J)
+ bgt J, $L11
+ .align 4
+
+$L20:
+ and N, 2, J
+ ble J, $L30
+
+ LD alpha1, 0 * SIZE(X)
+ addl X, INCX, X
+ LD alpha2, 0 * SIZE(X)
+ addl X, INCX, X
+
+ mov A, A1
+ MUL alpha, alpha1, tmp
+ fmov tmp, alpha1
+ addl A, LDA, A2
+ MUL alpha, alpha2, tmp
+ fmov tmp, alpha2
+
+ addl A2, LDA, A
+ mov Y, Y1
+
+ sra M, 3, I
+ ble I, $L25
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+ LD a2, 2 * SIZE(A1)
+ LD a3, 3 * SIZE(A1)
+
+ LD a4, 0 * SIZE(A2)
+ LD a5, 1 * SIZE(A2)
+ LD a6, 2 * SIZE(A2)
+ LD a7, 3 * SIZE(A2)
+
+ LD y0, 0 * SIZE(Y1)
+ LD y1, 1 * SIZE(Y1)
+ LD y2, 2 * SIZE(Y1)
+ LD y3, 3 * SIZE(Y1)
+
+ MUL alpha1, a0, tmp
+ fmov tmp, a0
+ LD y4, 4 * SIZE(Y1)
+ MUL alpha1, a1, tmp
+ fmov tmp, a1
+ LD y5, 5 * SIZE(Y1)
+ MUL alpha1, a2, tmp
+ fmov tmp, a2
+ LD y6, 6 * SIZE(Y1)
+ MUL alpha1, a3, tmp
+ fmov tmp, a3
+ LD y7, 7 * SIZE(Y1)
+
+ ADD y0, a0, tmp
+ fmov tmp, y0
+ LD a0, 4 * SIZE(A1)
+ MUL alpha2, a4, tmp
+ fmov tmp, a4
+
+ ADD y1, a1, tmp
+ fmov tmp, y1
+ LD a1, 5 * SIZE(A1)
+ MUL alpha2, a5, tmp
+ fmov tmp, a5
+
+ ADD y2, a2, tmp
+ fmov tmp, y2
+ LD a2, 6 * SIZE(A1)
+ MUL alpha2, a6, tmp
+ fmov tmp, a6
+
+ ADD y3, a3, tmp
+ fmov tmp, y3
+ LD a3, 7 * SIZE(A1)
+ MUL alpha2, a7, tmp
+ fmov tmp, a7
+
+ ADD y0, a4, tmp
+ fmov tmp, y0
+ LD a4, 4 * SIZE(A2)
+ MUL alpha1, a0, tmp
+ fmov tmp, a0
+
+ ADD y1, a5, tmp
+ fmov tmp, y1
+ LD a5, 5 * SIZE(A2)
+ MUL alpha1, a1, tmp
+ fmov tmp, a1
+
+ ADD y2, a6, tmp
+ fmov tmp, y2
+ LD a6, 6 * SIZE(A2)
+ MUL alpha1, a2, tmp
+ fmov tmp, a2
+
+ ADD y3, a7, tmp
+ fmov tmp, y3
+ LD a7, 7 * SIZE(A2)
+ MUL alpha1, a3, tmp
+ fmov tmp, a3
+
+ ldi I, -1(I)
+ ble I, $L23
+ .align 4
+
+$L22:
+ ldw $31, (PREFETCHSIZE + 0) * SIZE(A1)
+ ldi I, -1(I)
+ ldw $31, (PREFETCHSIZE + 0) * SIZE(A2)
+ ldi A2, 8 * SIZE(A2)
+
+ ADD y4, a0, tmp
+ fmov tmp, y4
+ ST y0, 0 * SIZE(Y1)
+ MUL alpha2, a4, tmp
+ fmov tmp, a4
+ LD a0, 8 * SIZE(A1)
+
+ ADD y5, a1, tmp
+ fmov tmp, y5
+ ST y1, 1 * SIZE(Y1)
+ MUL alpha2, a5, tmp
+ fmov tmp, a5
+ LD a1, 9 * SIZE(A1)
+
+ ADD y6, a2, tmp
+ fmov tmp, y6
+ ST y2, 2 * SIZE(Y1)
+ MUL alpha2, a6, tmp
+ fmov tmp, a6
+ LD a2, 10 * SIZE(A1)
+
+ ADD y7, a3, tmp
+ fmov tmp, y7
+ ST y3, 3 * SIZE(Y1)
+ MUL alpha2, a7, tmp
+ fmov tmp, a7
+ LD a3, 11 * SIZE(A1)
+
+ ADD y4, a4, tmp
+ fmov tmp, y4
+ LD a4, 0 * SIZE(A2)
+ MUL alpha1, a0, tmp
+ fmov tmp, a0
+ LD y0, 8 * SIZE(Y1)
+
+ ADD y5, a5, tmp
+ fmov tmp, y5
+ LD a5, 1 * SIZE(A2)
+ MUL alpha1, a1, tmp
+ fmov tmp, a1
+ LD y1, 9 * SIZE(Y1)
+
+ ADD y6, a6, tmp
+ fmov tmp, y6
+ LD a6, 2 * SIZE(A2)
+ MUL alpha1, a2, tmp
+ fmov tmp, a2
+ LD y2, 10 * SIZE(Y1)
+
+ ADD y7, a7, tmp
+ fmov tmp, y7
+ LD a7, 3 * SIZE(A2)
+ MUL alpha1, a3, tmp
+ fmov tmp, a3
+ LD y3, 11 * SIZE(Y1)
+
+ ADD y0, a0, tmp
+ fmov tmp, y0
+ ST y4, 4 * SIZE(Y1)
+ MUL alpha2, a4, tmp
+ fmov tmp, a4
+ LD a0, 12 * SIZE(A1)
+
+ ADD y1, a1, tmp
+ fmov tmp, y1
+ ST y5, 5 * SIZE(Y1)
+ MUL alpha2, a5, tmp
+ fmov tmp, a5
+ LD a1, 13 * SIZE(A1)
+
+ ADD y2, a2, tmp
+ fmov tmp, y2
+ ST y6, 6 * SIZE(Y1)
+ MUL alpha2, a6, tmp
+ fmov tmp, a6
+ LD a2, 14 * SIZE(A1)
+
+ ADD y3, a3, tmp
+ fmov tmp, y3
+ ST y7, 7 * SIZE(Y1)
+ MUL alpha2, a7, tmp
+ fmov tmp, a7
+ LD a3, 15 * SIZE(A1)
+
+ ADD y0, a4, tmp
+ fmov tmp, y0
+ LD a4, 4 * SIZE(A2)
+ MUL alpha1, a0, tmp
+ fmov tmp, a0
+ LD y4, 12 * SIZE(Y1)
+
+ ADD y1, a5, tmp
+ fmov tmp, y1
+ LD a5, 5 * SIZE(A2)
+ MUL alpha1, a1, tmp
+ fmov tmp, a1
+ LD y5, 13 * SIZE(Y1)
+
+ ADD y2, a6, tmp
+ fmov tmp, y2
+ LD a6, 6 * SIZE(A2)
+ MUL alpha1, a2, tmp
+ fmov tmp, a2
+ LD y6, 14 * SIZE(Y1)
+
+ ADD y3, a7, tmp
+ fmov tmp, y3
+ LD a7, 7 * SIZE(A2)
+ MUL alpha1, a3, tmp
+ fmov tmp, a3
+ LD y7, 15 * SIZE(Y1)
+
+ flds $f31, (PREFETCHSIZE + 0) * SIZE(Y1)
+ ldi A1, 8 * SIZE(A1)
+ ldi Y1, 8 * SIZE(Y1)
+ bgt I, $L22
+ .align 4
+
+$L23:
+ ADD y4, a0, tmp
+ fmov tmp, y4
+ ST y0, 0 * SIZE(Y1)
+ MUL alpha2, a4, tmp
+ fmov tmp, a4
+ unop
+
+ ADD y5, a1, tmp
+ fmov tmp, y5
+ ST y1, 1 * SIZE(Y1)
+ MUL alpha2, a5, tmp
+ fmov tmp, a5
+ unop
+
+ ADD y6, a2, tmp
+ fmov tmp, y6
+ ST y2, 2 * SIZE(Y1)
+ MUL alpha2, a6, tmp
+ fmov tmp, a6
+ unop
+
+ ADD y7, a3, tmp
+ fmov tmp, y7
+ ST y3, 3 * SIZE(Y1)
+ MUL alpha2, a7, tmp
+ fmov tmp, a7
+ unop
+
+ ADD y4, a4, tmp
+ fmov tmp, y4
+ ADD y5, a5, tmp
+ fmov tmp, y5
+ ADD y6, a6, tmp
+ fmov tmp, y6
+ ADD y7, a7, tmp
+ fmov tmp, y7
+
+ ST y4, 4 * SIZE(Y1)
+ ldi A1, 8 * SIZE(A1)
+ ST y5, 5 * SIZE(Y1)
+ ldi A2, 8 * SIZE(A2)
+
+ ST y6, 6 * SIZE(Y1)
+ unop
+ ST y7, 7 * SIZE(Y1)
+ ldi Y1, 8 * SIZE(Y1)
+ .align 4
+
+$L25:
+ and M, 4, I
+ ble I, $L26
+
+ LD y0, 0 * SIZE(Y1)
+ LD y1, 1 * SIZE(Y1)
+ LD y2, 2 * SIZE(Y1)
+ LD y3, 3 * SIZE(Y1)
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+ LD a2, 2 * SIZE(A1)
+ LD a3, 3 * SIZE(A1)
+
+ MUL alpha1, a0, tmp
+ fmov tmp, a0
+ LD a4, 0 * SIZE(A2)
+ MUL alpha1, a1, tmp
+ fmov tmp, a1
+ LD a5, 1 * SIZE(A2)
+ MUL alpha1, a2, tmp
+ fmov tmp, a2
+ LD a6, 2 * SIZE(A2)
+ MUL alpha1, a3, tmp
+ fmov tmp, a3
+ LD a7, 3 * SIZE(A2)
+
+ ADD y0, a0, tmp
+ fmov tmp, y0
+ MUL alpha2, a4, tmp
+ fmov tmp, a4
+ ADD y1, a1, tmp
+ fmov tmp, y1
+ MUL alpha2, a5, tmp
+ fmov tmp, a5
+ ADD y2, a2, tmp
+ fmov tmp, y2
+ MUL alpha2, a6, tmp
+ fmov tmp, a6
+ ADD y3, a3, tmp
+ fmov tmp, y3
+ MUL alpha2, a7, tmp
+ fmov tmp, a7
+
+ ADD y0, a4, tmp
+ fmov tmp, y0
+ ldi Y1, 4 * SIZE(Y1)
+ ADD y1, a5, tmp
+ fmov tmp, y1
+ unop
+ ADD y2, a6, tmp
+ fmov tmp, y2
+ unop
+ ADD y3, a7, tmp
+ fmov tmp, y3
+ unop
+
+ ST y0, -4 * SIZE(Y1)
+ ldi A1, 4 * SIZE(A1)
+ ST y1, -3 * SIZE(Y1)
+ ldi A2, 4 * SIZE(A2)
+ ST y2, -2 * SIZE(Y1)
+ ldi A3, 4 * SIZE(A3)
+ ST y3, -1 * SIZE(Y1)
+ ldi A4, 4 * SIZE(A4)
+ .align 4
+
+$L26:
+ and M, 2, I
+ ble I, $L27
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+ LD a2, 0 * SIZE(A2)
+ LD a3, 1 * SIZE(A2)
+
+ LD y0, 0 * SIZE(Y1)
+ LD y1, 1 * SIZE(Y1)
+
+ MUL alpha1, a0, tmp
+ fmov tmp, a0
+ MUL alpha1, a1, tmp
+ fmov tmp, a1
+ MUL alpha2, a2, tmp
+ fmov tmp, a2
+ MUL alpha2, a3, tmp
+ fmov tmp, a3
+
+ ADD y0, a0, tmp
+ fmov tmp, y0
+ ldi A1, 2 * SIZE(A1)
+ ADD y1, a1, tmp
+ fmov tmp, y1
+ ldi A2, 2 * SIZE(A2)
+ ADD y0, a2, tmp
+ fmov tmp, y0
+ unop
+ ADD y1, a3, tmp
+ fmov tmp, y1
+ unop
+
+ ST y0, 0 * SIZE(Y1)
+ unop
+ ST y1, 1 * SIZE(Y1)
+ ldi Y1, 2 * SIZE(Y1)
+ .align 4
+
+$L27:
+ blbc M, $L30
+
+ LD y0, 0 * SIZE(Y1)
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 0 * SIZE(A2)
+
+ MUL alpha1, a0, tmp
+ fmov tmp, a0
+ MUL alpha2, a1, tmp
+ fmov tmp, a1
+
+ ADD y0, a0, tmp
+ fmov tmp, y0
+ ADD y0, a1, tmp
+ fmov tmp, y0
+
+ ST y0, 0 * SIZE(Y1)
+ .align 4
+
+$L30:
+ blbc N, $L990
+
+ LD alpha1, 0 * SIZE(X)
+ mov A, A1
+ MUL alpha, alpha1, tmp
+ fmov tmp, alpha1
+ mov Y, Y1
+
+ sra M, 3, I
+ ble I, $L35
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+ LD a2, 2 * SIZE(A1)
+ LD a3, 3 * SIZE(A1)
+ LD a4, 4 * SIZE(A1)
+ LD a5, 5 * SIZE(A1)
+ LD a6, 6 * SIZE(A1)
+ LD a7, 7 * SIZE(A1)
+
+ LD y0, 0 * SIZE(Y1)
+ LD y1, 1 * SIZE(Y1)
+ LD y2, 2 * SIZE(Y1)
+ LD y3, 3 * SIZE(Y1)
+ LD y4, 4 * SIZE(Y1)
+ LD y5, 5 * SIZE(Y1)
+ LD y6, 6 * SIZE(Y1)
+ LD y7, 7 * SIZE(Y1)
+
+ MUL alpha1, a0, tmp
+ fmov tmp, a0
+ MUL alpha1, a1, tmp
+ fmov tmp, a1
+ MUL alpha1, a2, tmp
+ fmov tmp, a2
+ MUL alpha1, a3, tmp
+ fmov tmp, a3
+
+ ldi I, -1(I)
+ ble I, $L33
+ .align 4
+
+$L32:
+ ADD y0, a0, tmp
+ fmov tmp, y0
+ LD y4, 4 * SIZE(Y1)
+ MUL alpha1, a4, tmp
+ fmov tmp, a4
+ LD a0, 8 * SIZE(A1)
+
+ ADD y1, a1, tmp
+ fmov tmp, y1
+ LD y5, 5 * SIZE(Y1)
+ MUL alpha1, a5, tmp
+ fmov tmp, a5
+ LD a1, 9 * SIZE(A1)
+
+ ADD y2, a2, tmp
+ fmov tmp, y2
+ LD y6, 6 * SIZE(Y1)
+ MUL alpha1, a6, tmp
+ fmov tmp, a6
+ LD a2, 10 * SIZE(A1)
+
+ ADD y3, a3, tmp
+ fmov tmp, y3
+ LD y7, 7 * SIZE(Y1)
+ MUL alpha1, a7, tmp
+ fmov tmp, a7
+ LD a3, 11 * SIZE(A1)
+
+ ST y0, 0 * SIZE(Y1)
+ ST y1, 1 * SIZE(Y1)
+ ST y2, 2 * SIZE(Y1)
+ ST y3, 3 * SIZE(Y1)
+
+ ADD y4, a4, tmp
+ fmov tmp, y4
+ LD y0, 8 * SIZE(Y1)
+ MUL alpha1, a0, tmp
+ fmov tmp, a0
+ LD a4, 12 * SIZE(A1)
+
+ ADD y5, a5, tmp
+ fmov tmp, y5
+ LD y1, 9 * SIZE(Y1)
+ MUL alpha1, a1, tmp
+ fmov tmp, a1
+ LD a5, 13 * SIZE(A1)
+
+ ADD y6, a6, tmp
+ fmov tmp, y6
+ LD y2, 10 * SIZE(Y1)
+ MUL alpha1, a2, tmp
+ fmov tmp, a2
+ LD a6, 14 * SIZE(A1)
+
+ ADD y7, a7, tmp
+ fmov tmp, y7
+ LD y3, 11 * SIZE(Y1)
+ MUL alpha1, a3, tmp
+ fmov tmp, a3
+ LD a7, 15 * SIZE(A1)
+
+ ST y4, 4 * SIZE(Y1)
+ ldi I, -1(I)
+ ST y5, 5 * SIZE(Y1)
+ ldi A1, 8 * SIZE(A1)
+
+ ST y6, 6 * SIZE(Y1)
+ ldw $31, (PREFETCHSIZE + 0) * SIZE(A1)
+ ST y7, 7 * SIZE(Y1)
+ flds $f31, (PREFETCHSIZE + 0) * SIZE(Y1)
+
+ ldi Y1, 8 * SIZE(Y1)
+ bgt I, $L32
+ .align 4
+
+$L33:
+ ADD y0, a0, tmp
+ fmov tmp, y0
+ LD y4, 4 * SIZE(Y1)
+ MUL alpha1, a4, tmp
+ fmov tmp, a4
+ unop
+
+ ADD y1, a1, tmp
+ fmov tmp, y1
+ LD y5, 5 * SIZE(Y1)
+ MUL alpha1, a5, tmp
+ fmov tmp, a5
+ unop
+
+ ADD y2, a2, tmp
+ fmov tmp, y2
+ LD y6, 6 * SIZE(Y1)
+ MUL alpha1, a6, tmp
+ fmov tmp, a6
+ unop
+
+ ADD y3, a3, tmp
+ fmov tmp, y3
+ LD y7, 7 * SIZE(Y1)
+ MUL alpha1, a7, tmp
+ fmov tmp, a7
+ unop
+
+ ADD y4, a4, tmp
+ fmov tmp, y4
+ ST y0, 0 * SIZE(Y1)
+ ADD y5, a5, tmp
+ fmov tmp, y5
+ ST y1, 1 * SIZE(Y1)
+ ADD y6, a6, tmp
+ fmov tmp, y6
+ ST y2, 2 * SIZE(Y1)
+ ADD y7, a7, tmp
+ fmov tmp, y7
+ ST y3, 3 * SIZE(Y1)
+
+ ST y4, 4 * SIZE(Y1)
+ unop
+ ST y5, 5 * SIZE(Y1)
+ unop
+
+ ST y6, 6 * SIZE(Y1)
+ ldi A1, 8 * SIZE(A1)
+ ST y7, 7 * SIZE(Y1)
+ ldi Y1, 8 * SIZE(Y1)
+ .align 4
+
+$L35:
+ and M, 4, I
+ ble I, $L36
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+ LD a2, 2 * SIZE(A1)
+ LD a3, 3 * SIZE(A1)
+
+ MUL alpha1, a0, tmp
+ fmov tmp, a0
+ LD y0, 0 * SIZE(Y1)
+ MUL alpha1, a1, tmp
+ fmov tmp, a1
+ LD y1, 1 * SIZE(Y1)
+ MUL alpha1, a2, tmp
+ fmov tmp, a2
+ LD y2, 2 * SIZE(Y1)
+ MUL alpha1, a3, tmp
+ fmov tmp, a3
+ LD y3, 3 * SIZE(Y1)
+
+ ADD y0, a0, tmp
+ fmov tmp, y0
+ ADD y1, a1, tmp
+ fmov tmp, y1
+ ADD y2, a2, tmp
+ fmov tmp, y2
+ ADD y3, a3, tmp
+ fmov tmp, y3
+
+ ST y0, 0 * SIZE(Y1)
+ ldi A1, 4 * SIZE(A1)
+ ST y1, 1 * SIZE(Y1)
+ ldi A2, 4 * SIZE(A2)
+ ST y2, 2 * SIZE(Y1)
+ unop
+ ST y3, 3 * SIZE(Y1)
+ ldi Y1, 4 * SIZE(Y1)
+ .align 4
+
+$L36:
+ and M, 2, I
+ ble I, $L37
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+
+ LD y0, 0 * SIZE(Y1)
+ MUL alpha1, a0, tmp
+ fmov tmp, a0
+ LD y1, 1 * SIZE(Y1)
+ MUL alpha1, a1, tmp
+ fmov tmp, a1
+
+ ADD y0, a0, tmp
+ fmov tmp, y0
+ ADD y1, a1, tmp
+ fmov tmp, y1
+
+ ST y0, 0 * SIZE(Y1)
+ ldi A1, 2 * SIZE(A1)
+ ST y1, 1 * SIZE(Y1)
+ ldi Y1, 2 * SIZE(Y1)
+ .align 4
+
+$L37:
+ blbc M, $L990
+
+ LD y0, 0 * SIZE(Y1)
+ LD a0, 0 * SIZE(A1)
+
+ MUL alpha1, a0, tmp
+ fmov tmp, a0
+
+ ADD y0, a0, tmp
+ fmov tmp, y0
+ ST y0, 0 * SIZE(Y1)
+ .align 4
+
+$L990:
+ cmpeq INCY, SIZE, $0
+ bne $0, $L999
+
+ mov BUFFER, Y1
+
+ sra M, 3, I
+ ble I, $L995
+ .align 4
+
+$L992:
+ LD a0, 0 * SIZE(BUFFER)
+ addl BUFFER, INCY, BUFFER
+ LD a1, 0 * SIZE(BUFFER)
+ addl BUFFER, INCY, BUFFER
+ LD a2, 0 * SIZE(BUFFER)
+ addl BUFFER, INCY, BUFFER
+ LD a3, 0 * SIZE(BUFFER)
+ addl BUFFER, INCY, BUFFER
+
+ LD y0, 0 * SIZE(Y)
+ LD y1, 1 * SIZE(Y)
+ LD y2, 2 * SIZE(Y)
+ LD y3, 3 * SIZE(Y)
+
+ LD a4, 0 * SIZE(BUFFER)
+ addl BUFFER, INCY, BUFFER
+ LD a5, 0 * SIZE(BUFFER)
+ addl BUFFER, INCY, BUFFER
+ LD a6, 0 * SIZE(BUFFER)
+ addl BUFFER, INCY, BUFFER
+ LD a7, 0 * SIZE(BUFFER)
+ addl BUFFER, INCY, BUFFER
+
+ LD y4, 4 * SIZE(Y)
+ LD y5, 5 * SIZE(Y)
+ LD y6, 6 * SIZE(Y)
+ LD y7, 7 * SIZE(Y)
+
+ ADD a0, y0, tmp
+ fmov tmp, a0
+ ADD a1, y1, tmp
+ fmov tmp, a1
+ ADD a2, y2, tmp
+ fmov tmp, a2
+ ADD a3, y3, tmp
+ fmov tmp, a3
+ ADD a4, y4, tmp
+ fmov tmp, a4
+ ADD a5, y5, tmp
+ fmov tmp, a5
+ ADD a6, y6, tmp
+ fmov tmp, a6
+ ADD a7, y7, tmp
+ fmov tmp, a7
+
+ ST a0, 0 * SIZE(Y1)
+ addl Y1, INCY, Y1
+ ST a1, 0 * SIZE(Y1)
+ addl Y1, INCY, Y1
+ ST a2, 0 * SIZE(Y1)
+ addl Y1, INCY, Y1
+ ST a3, 0 * SIZE(Y1)
+ addl Y1, INCY, Y1
+
+ ST a4, 0 * SIZE(Y1)
+ addl Y1, INCY, Y1
+ ST a5, 0 * SIZE(Y1)
+ addl Y1, INCY, Y1
+ ST a6, 0 * SIZE(Y1)
+ addl Y1, INCY, Y1
+ ST a7, 0 * SIZE(Y1)
+ addl Y1, INCY, Y1
+
+ ldi I, -1(I)
+ ldi Y, 8 * SIZE(Y)
+ bgt I, $L992
+ .align 4
+
+$L995:
+ and M, 7, I
+ ble I, $L999
+ .align 4
+
+$L996:
+ LD a0, 0 * SIZE(BUFFER)
+ addl BUFFER, INCY, BUFFER
+
+ LD y0, 0 * SIZE(Y)
+ ldi Y, 1 * SIZE(Y)
+
+ ADD a0, y0, tmp
+ fmov tmp, a0
+
+ ST a0, 0 * SIZE(Y1)
+ addl Y1, INCY, Y1
+
+ ldi I, -1(I)
+ bgt I, $L996
+ .align 4
+
+$L999:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+ fldd $f20, 64($sp)
+
+ ldi $sp, STACKSIZE($sp)
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/gemv_n.S.bak b/kernel/sw_64/gemv_n.S.bak
new file mode 100644
index 0000000..f90abdf
--- /dev/null
+++ b/kernel/sw_64/gemv_n.S.bak
@@ -0,0 +1,1307 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define STACKSIZE 64
+#define PREFETCHSIZE 32
+
+#define M $16
+#define N $17
+#define A $20
+#define LDA $21
+
+#define X $18
+#define INCX $19
+#define Y $22
+#define INCY $23
+
+#define BUFFER $24
+
+#define I $25
+#define J $27
+
+#define Y1 $4
+
+#define A1 $5
+#define A2 $6
+#define A3 $7
+#define A4 $8
+
+#define alpha $f19
+
+#define alpha1 $f0
+#define alpha2 $f1
+#define alpha3 $f10
+#define alpha4 $f11
+
+#define y0 $f12
+#define y1 $f13
+#define y2 $f14
+#define y3 $f15
+
+#define y4 $f16
+#define y5 $f17
+#define y6 $f18
+#define y7 $f21
+
+#define a0 $f22
+#define a1 $f23
+#define a2 $f24
+#define a3 $f25
+#define a4 $f26
+#define a5 $f27
+#define a6 $f28
+#define a7 $f29
+
+#define a8 $f2
+#define a9 $f3
+#define a10 $f4
+#define a11 $f5
+#define a12 $f6
+#define a13 $f7
+#define a14 $f8
+#define a15 $f9
+
+ PROLOGUE
+
+ ldi $sp, -STACKSIZE($sp)
+ ldl X, 0 + STACKSIZE($sp)
+ ldl INCX, 8 + STACKSIZE($sp)
+ ldl Y, 16 + STACKSIZE($sp)
+ ldl INCY, 24 + STACKSIZE($sp)
+ ldl BUFFER, 32 + STACKSIZE($sp)
+
+ fstd $f2, 0($sp)
+ fstd $f3, 8($sp)
+ fstd $f4, 16($sp)
+ fstd $f5, 24($sp)
+ fstd $f6, 32($sp)
+ fstd $f7, 40($sp)
+ fstd $f8, 48($sp)
+ fstd $f9, 56($sp)
+
+ PROFCODE
+
+ cmple M, 0, $0
+ SXADDQ INCX, 0, INCX
+ cmple N, 0, $1
+ SXADDQ INCY, 0, INCY
+
+ or $0, $1, $0
+ bne $0, $L999
+
+ SXADDQ LDA, 0, LDA
+
+ cmpeq INCY, SIZE, $0
+ bne $0, $L10
+
+ mov BUFFER, Y1
+
+ mov Y, BUFFER
+ mov Y1, Y
+
+ sra M, 3, I
+ ble I, $L05
+ .align 4
+
+$L02:
+ ST $f31, 0 * SIZE(Y1)
+ ST $f31, 1 * SIZE(Y1)
+ ST $f31, 2 * SIZE(Y1)
+ ST $f31, 3 * SIZE(Y1)
+ ST $f31, 4 * SIZE(Y1)
+ ST $f31, 5 * SIZE(Y1)
+ ST $f31, 6 * SIZE(Y1)
+ ST $f31, 7 * SIZE(Y1)
+
+ ldi Y1, 8 * SIZE(Y1)
+ ldi I, -1(I)
+ bgt I, $L02
+ .align 4
+
+$L05:
+ and M, 7, I
+ ble I, $L10
+ .align 4
+
+$L06:
+ ST $f31, 0 * SIZE(Y1)
+ addl Y1, SIZE, Y1
+
+ ldi I, -1(I)
+ bgt I, $L06
+ .align 4
+
+$L10:
+ sra N, 2, J
+ ble J, $L20
+ .align 4
+
+$L11:
+ LD alpha1, 0 * SIZE(X)
+ addl X, INCX, X
+ LD alpha2, 0 * SIZE(X)
+ addl X, INCX, X
+ LD alpha3, 0 * SIZE(X)
+ addl X, INCX, X
+ LD alpha4, 0 * SIZE(X)
+ addl X, INCX, X
+
+ MUL alpha, alpha1, alpha1
+ MUL alpha, alpha2, alpha2
+ MUL alpha, alpha3, alpha3
+ MUL alpha, alpha4, alpha4
+
+ mov A, A1
+ addl A, LDA, A2
+ addl A2, LDA, A3
+ addl A3, LDA, A4
+ s4addl LDA, A, A
+
+ mov Y, Y1
+ fillcs 4 * SIZE(X)
+
+ sra M, 3, I
+ ble I, $L15
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+ LD a2, 2 * SIZE(A1)
+ LD a3, 3 * SIZE(A1)
+
+ LD a4, 0 * SIZE(A2)
+ LD a5, 1 * SIZE(A2)
+ LD a6, 2 * SIZE(A2)
+ LD a7, 3 * SIZE(A2)
+
+ LD y0, 0 * SIZE(Y1)
+ LD y1, 1 * SIZE(Y1)
+ LD y2, 2 * SIZE(Y1)
+ LD y3, 3 * SIZE(Y1)
+
+ LD a8, 0 * SIZE(A3)
+ LD a9, 1 * SIZE(A3)
+ LD a10, 2 * SIZE(A3)
+ LD a11, 3 * SIZE(A3)
+
+ LD y4, 4 * SIZE(Y1)
+ LD y5, 5 * SIZE(Y1)
+ LD y6, 6 * SIZE(Y1)
+ LD y7, 7 * SIZE(Y1)
+
+ MUL alpha1, a0, a0
+ LD a12, 0 * SIZE(A4)
+ MUL alpha1, a1, a1
+ LD a13, 1 * SIZE(A4)
+ MUL alpha1, a2, a2
+ LD a14, 2 * SIZE(A4)
+ MUL alpha1, a3, a3
+ LD a15, 3 * SIZE(A4)
+
+ ADD y0, a0, y0
+ LD a0, 4 * SIZE(A1)
+ MUL alpha2, a4, a4
+ unop
+
+ ADD y1, a1, y1
+ LD a1, 5 * SIZE(A1)
+ MUL alpha2, a5, a5
+ unop
+
+ ADD y2, a2, y2
+ LD a2, 6 * SIZE(A1)
+ MUL alpha2, a6, a6
+ unop
+
+ ADD y3, a3, y3
+ LD a3, 7 * SIZE(A1)
+ MUL alpha2, a7, a7
+ unop
+
+ ADD y0, a4, y0
+ LD a4, 4 * SIZE(A2)
+ MUL alpha3, a8, a8
+ unop
+
+ ADD y1, a5, y1
+ LD a5, 5 * SIZE(A2)
+ MUL alpha3, a9, a9
+ ldi I, -1(I)
+
+ ADD y2, a6, y2
+ LD a6, 6 * SIZE(A2)
+ MUL alpha3, a10, a10
+ unop
+
+ ADD y3, a7, y3
+ LD a7, 7 * SIZE(A2)
+ MUL alpha3, a11, a11
+ unop
+
+ ADD y0, a8, y0
+ LD a8, 4 * SIZE(A3)
+ MUL alpha4, a12, a12
+ ble I, $L13
+ .align 4
+
+$L12:
+ ADD y1, a9, y1
+ LD a9, 5 * SIZE(A3)
+ MUL alpha4, a13, a13
+ fillcs (PREFETCHSIZE + 0) * SIZE(A1)
+
+ ADD y2, a10, y2
+ LD a10, 6 * SIZE(A3)
+ MUL alpha4, a14, a14
+ unop
+
+ ADD y3, a11, y3
+ LD a11, 7 * SIZE(A3)
+ MUL alpha4, a15, a15
+ ldi I, -1(I)
+
+ ADD y0, a12, y0
+ LD a12, 4 * SIZE(A4)
+ MUL alpha1, a0, a0
+ fillcs (PREFETCHSIZE + 0) * SIZE(Y1)
+
+ ADD y1, a13, y1
+ LD a13, 5 * SIZE(A4)
+ MUL alpha1, a1, a1
+ unop
+
+ ADD y2, a14, y2
+ LD a14, 6 * SIZE(A4)
+ MUL alpha1, a2, a2
+ unop
+
+ ADD y3, a15, y3
+ LD a15, 7 * SIZE(A4)
+ MUL alpha1, a3, a3
+ fillcs (PREFETCHSIZE + 0) * SIZE(A2)
+
+ ADD y4, a0, y4
+ ST y0, 0 * SIZE(Y1)
+ MUL alpha2, a4, a4
+ LD a0, 8 * SIZE(A1)
+
+ ADD y5, a1, y5
+ ST y1, 1 * SIZE(Y1)
+ MUL alpha2, a5, a5
+ LD a1, 9 * SIZE(A1)
+
+ ADD y6, a2, y6
+ ST y2, 2 * SIZE(Y1)
+ MUL alpha2, a6, a6
+ LD a2, 10 * SIZE(A1)
+
+ ADD y7, a3, y7
+ ST y3, 3 * SIZE(Y1)
+ MUL alpha2, a7, a7
+ LD a3, 11 * SIZE(A1)
+
+ ADD y4, a4, y4
+ LD a4, 8 * SIZE(A2)
+ MUL alpha3, a8, a8
+ LD y0, 8 * SIZE(Y1)
+
+ ADD y5, a5, y5
+ LD a5, 9 * SIZE(A2)
+ MUL alpha3, a9, a9
+ LD y1, 9 * SIZE(Y1)
+
+ ADD y6, a6, y6
+ LD a6, 10 * SIZE(A2)
+ MUL alpha3, a10, a10
+ LD y2, 10 * SIZE(Y1)
+
+ ADD y7, a7, y7
+ LD a7, 11 * SIZE(A2)
+ MUL alpha3, a11, a11
+ LD y3, 11 * SIZE(Y1)
+
+ ADD y4, a8, y4
+ LD a8, 8 * SIZE(A3)
+ MUL alpha4, a12, a12
+ fillcs (PREFETCHSIZE + 0) * SIZE(A3)
+
+ ADD y5, a9, y5
+ LD a9, 9 * SIZE(A3)
+ MUL alpha4, a13, a13
+ ldi A1, 8 * SIZE(A1)
+
+ ADD y6, a10, y6
+ LD a10, 10 * SIZE(A3)
+ MUL alpha4, a14, a14
+ ldi A2, 8 * SIZE(A2)
+
+ ADD y7, a11, y7
+ LD a11, 11 * SIZE(A3)
+ MUL alpha4, a15, a15
+ ldi Y1, 8 * SIZE(Y1)
+
+ ADD y4, a12, y4
+ LD a12, 8 * SIZE(A4)
+ MUL alpha1, a0, a0
+ unop
+
+ ADD y5, a13, y5
+ LD a13, 9 * SIZE(A4)
+ MUL alpha1, a1, a1
+ ldi A3, 8 * SIZE(A3)
+
+ ADD y6, a14, y6
+ LD a14, 10 * SIZE(A4)
+ MUL alpha1, a2, a2
+ fillcs (PREFETCHSIZE + 0) * SIZE(A4)
+
+ ADD y7, a15, y7
+ LD a15, 11 * SIZE(A4)
+ MUL alpha1, a3, a3
+ ldi A4, 8 * SIZE(A4)
+
+ ADD y0, a0, y0
+ LD a0, 4 * SIZE(A1)
+ MUL alpha2, a4, a4
+ ST y4, -4 * SIZE(Y1)
+
+ ADD y1, a1, y1
+ LD a1, 5 * SIZE(A1)
+ MUL alpha2, a5, a5
+ ST y5, -3 * SIZE(Y1)
+
+ ADD y2, a2, y2
+ LD a2, 6 * SIZE(A1)
+ MUL alpha2, a6, a6
+ ST y6, -2 * SIZE(Y1)
+
+ ADD y3, a3, y3
+ LD a3, 7 * SIZE(A1)
+ MUL alpha2, a7, a7
+ ST y7, -1 * SIZE(Y1)
+
+ ADD y0, a4, y0
+ LD a4, 4 * SIZE(A2)
+ MUL alpha3, a8, a8
+ LD y4, 4 * SIZE(Y1)
+
+ ADD y1, a5, y1
+ LD a5, 5 * SIZE(A2)
+ MUL alpha3, a9, a9
+ LD y5, 5 * SIZE(Y1)
+
+ ADD y2, a6, y2
+ LD a6, 6 * SIZE(A2)
+ MUL alpha3, a10, a10
+ LD y6, 6 * SIZE(Y1)
+
+ ADD y3, a7, y3
+ LD a7, 7 * SIZE(A2)
+ MUL alpha3, a11, a11
+ LD y7, 7 * SIZE(Y1)
+
+ ADD y0, a8, y0
+ LD a8, 4 * SIZE(A3)
+ MUL alpha4, a12, a12
+ bgt I, $L12
+ .align 4
+
+$L13:
+ ADD y1, a9, y1
+ LD a9, 5 * SIZE(A3)
+ MUL alpha4, a13, a13
+ unop
+
+ ADD y2, a10, y2
+ LD a10, 6 * SIZE(A3)
+ MUL alpha4, a14, a14
+ unop
+
+ ADD y3, a11, y3
+ LD a11, 7 * SIZE(A3)
+ MUL alpha4, a15, a15
+ unop
+
+ ADD y0, a12, y0
+ LD a12, 4 * SIZE(A4)
+ MUL alpha1, a0, a0
+ unop
+
+ ADD y1, a13, y1
+ LD a13, 5 * SIZE(A4)
+ MUL alpha1, a1, a1
+ unop
+
+ ADD y2, a14, y2
+ LD a14, 6 * SIZE(A4)
+ MUL alpha1, a2, a2
+ unop
+
+ ADD y3, a15, y3
+ LD a15, 7 * SIZE(A4)
+ MUL alpha1, a3, a3
+ unop
+
+ ST y0, 0 * SIZE(Y1)
+ ADD y4, a0, y4
+ unop
+ MUL alpha2, a4, a4
+
+ ST y1, 1 * SIZE(Y1)
+ ADD y5, a1, y5
+ unop
+ MUL alpha2, a5, a5
+
+ ST y2, 2 * SIZE(Y1)
+ ADD y6, a2, y6
+ unop
+ MUL alpha2, a6, a6
+
+ ST y3, 3 * SIZE(Y1)
+ ADD y7, a3, y7
+ ldi Y1, 8 * SIZE(Y1)
+ MUL alpha2, a7, a7
+
+ ADD y4, a4, y4
+ MUL alpha3, a8, a8
+ ADD y5, a5, y5
+ MUL alpha3, a9, a9
+ ADD y6, a6, y6
+ MUL alpha3, a10, a10
+ ADD y7, a7, y7
+ MUL alpha3, a11, a11
+
+ ADD y4, a8, y4
+ MUL alpha4, a12, a12
+ ADD y5, a9, y5
+ MUL alpha4, a13, a13
+ ADD y6, a10, y6
+ MUL alpha4, a14, a14
+ ADD y7, a11, y7
+ MUL alpha4, a15, a15
+
+ ADD y4, a12, y4
+ ADD y5, a13, y5
+ ADD y6, a14, y6
+ ADD y7, a15, y7
+
+ ST y4, -4 * SIZE(Y1)
+ ldi A1, 8 * SIZE(A1)
+ ST y5, -3 * SIZE(Y1)
+ ldi A2, 8 * SIZE(A2)
+ ST y6, -2 * SIZE(Y1)
+ ldi A3, 8 * SIZE(A3)
+ ST y7, -1 * SIZE(Y1)
+ ldi A4, 8 * SIZE(A4)
+ .align 4
+
+$L15:
+ and M, 4, I
+ ble I, $L16
+
+ LD y0, 0 * SIZE(Y1)
+ LD y1, 1 * SIZE(Y1)
+ LD y2, 2 * SIZE(Y1)
+ LD y3, 3 * SIZE(Y1)
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+ LD a2, 2 * SIZE(A1)
+ LD a3, 3 * SIZE(A1)
+
+ LD a4, 0 * SIZE(A2)
+ LD a5, 1 * SIZE(A2)
+ LD a6, 2 * SIZE(A2)
+ LD a7, 3 * SIZE(A2)
+
+ LD a8, 0 * SIZE(A3)
+ LD a9, 1 * SIZE(A3)
+ LD a10, 2 * SIZE(A3)
+ LD a11, 3 * SIZE(A3)
+
+ MUL alpha1, a0, a0
+ LD a12, 0 * SIZE(A4)
+ MUL alpha1, a1, a1
+ LD a13, 1 * SIZE(A4)
+ MUL alpha1, a2, a2
+ LD a14, 2 * SIZE(A4)
+ MUL alpha1, a3, a3
+ LD a15, 3 * SIZE(A4)
+
+ ADD y0, a0, y0
+ MUL alpha2, a4, a4
+ ADD y1, a1, y1
+ MUL alpha2, a5, a5
+ ADD y2, a2, y2
+ MUL alpha2, a6, a6
+ ADD y3, a3, y3
+ MUL alpha2, a7, a7
+
+ ADD y0, a4, y0
+ MUL alpha3, a8, a8
+ ADD y1, a5, y1
+ MUL alpha3, a9, a9
+ ADD y2, a6, y2
+ MUL alpha3, a10, a10
+ ADD y3, a7, y3
+ MUL alpha3, a11, a11
+
+ ADD y0, a8, y0
+ MUL alpha4, a12, a12
+ ADD y1, a9, y1
+ MUL alpha4, a13, a13
+ ADD y2, a10, y2
+ MUL alpha4, a14, a14
+ ADD y3, a11, y3
+ MUL alpha4, a15, a15
+
+ ADD y0, a12, y0
+ ldi Y1, 4 * SIZE(Y1)
+ ADD y1, a13, y1
+ unop
+
+ ADD y2, a14, y2
+ unop
+ ADD y3, a15, y3
+ unop
+
+ ST y0, -4 * SIZE(Y1)
+ ldi A1, 4 * SIZE(A1)
+ ST y1, -3 * SIZE(Y1)
+ ldi A2, 4 * SIZE(A2)
+ ST y2, -2 * SIZE(Y1)
+ ldi A3, 4 * SIZE(A3)
+ ST y3, -1 * SIZE(Y1)
+ ldi A4, 4 * SIZE(A4)
+ .align 4
+
+$L16:
+ and M, 2, I
+ ble I, $L17
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+ LD a2, 0 * SIZE(A2)
+ LD a3, 1 * SIZE(A2)
+
+ LD y0, 0 * SIZE(Y1)
+ LD y1, 1 * SIZE(Y1)
+
+ LD a4, 0 * SIZE(A3)
+ MUL alpha1, a0, a0
+ LD a5, 1 * SIZE(A3)
+ MUL alpha1, a1, a1
+ LD a6, 0 * SIZE(A4)
+ MUL alpha2, a2, a2
+ LD a7, 1 * SIZE(A4)
+ MUL alpha2, a3, a3
+
+ ADD y0, a0, y0
+ MUL alpha3, a4, a4
+ ADD y1, a1, y1
+ MUL alpha3, a5, a5
+ ADD y0, a2, y0
+ MUL alpha4, a6, a6
+ ADD y1, a3, y1
+ MUL alpha4, a7, a7
+
+ ADD y0, a4, y0
+ ldi A1, 2 * SIZE(A1)
+ ADD y1, a5, y1
+ ldi A2, 2 * SIZE(A2)
+ ADD y0, a6, y0
+ ldi A3, 2 * SIZE(A3)
+ ADD y1, a7, y1
+ ldi A4, 2 * SIZE(A4)
+
+ ST y0, 0 * SIZE(Y1)
+ unop
+ ST y1, 1 * SIZE(Y1)
+ ldi Y1, 2 * SIZE(Y1)
+ .align 4
+
+$L17:
+ blbc M, $L18
+
+ LD y0, 0 * SIZE(Y1)
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 0 * SIZE(A2)
+ LD a2, 0 * SIZE(A3)
+ LD a3, 0 * SIZE(A4)
+
+ MUL alpha1, a0, a0
+ MUL alpha2, a1, a1
+ MUL alpha3, a2, a2
+ MUL alpha4, a3, a3
+
+ ADD y0, a0, y0
+ ADD y0, a1, y0
+ ADD y0, a2, y0
+ ADD y0, a3, y0
+
+ ST y0, 0 * SIZE(Y1)
+ .align 4
+
+$L18:
+ ldi J, -1(J)
+ bgt J, $L11
+ .align 4
+
+$L20:
+ and N, 2, J
+ ble J, $L30
+
+ LD alpha1, 0 * SIZE(X)
+ addl X, INCX, X
+ LD alpha2, 0 * SIZE(X)
+ addl X, INCX, X
+
+ mov A, A1
+ MUL alpha, alpha1, alpha1
+ addl A, LDA, A2
+ MUL alpha, alpha2, alpha2
+
+ addl A2, LDA, A
+ mov Y, Y1
+
+ sra M, 3, I
+ ble I, $L25
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+ LD a2, 2 * SIZE(A1)
+ LD a3, 3 * SIZE(A1)
+
+ LD a4, 0 * SIZE(A2)
+ LD a5, 1 * SIZE(A2)
+ LD a6, 2 * SIZE(A2)
+ LD a7, 3 * SIZE(A2)
+
+ LD y0, 0 * SIZE(Y1)
+ LD y1, 1 * SIZE(Y1)
+ LD y2, 2 * SIZE(Y1)
+ LD y3, 3 * SIZE(Y1)
+
+ MUL alpha1, a0, a0
+ LD y4, 4 * SIZE(Y1)
+ MUL alpha1, a1, a1
+ LD y5, 5 * SIZE(Y1)
+ MUL alpha1, a2, a2
+ LD y6, 6 * SIZE(Y1)
+ MUL alpha1, a3, a3
+ LD y7, 7 * SIZE(Y1)
+
+ ADD y0, a0, y0
+ LD a0, 4 * SIZE(A1)
+ MUL alpha2, a4, a4
+
+ ADD y1, a1, y1
+ LD a1, 5 * SIZE(A1)
+ MUL alpha2, a5, a5
+
+ ADD y2, a2, y2
+ LD a2, 6 * SIZE(A1)
+ MUL alpha2, a6, a6
+
+ ADD y3, a3, y3
+ LD a3, 7 * SIZE(A1)
+ MUL alpha2, a7, a7
+
+ ADD y0, a4, y0
+ LD a4, 4 * SIZE(A2)
+ MUL alpha1, a0, a0
+
+ ADD y1, a5, y1
+ LD a5, 5 * SIZE(A2)
+ MUL alpha1, a1, a1
+
+ ADD y2, a6, y2
+ LD a6, 6 * SIZE(A2)
+ MUL alpha1, a2, a2
+
+ ADD y3, a7, y3
+ LD a7, 7 * SIZE(A2)
+ MUL alpha1, a3, a3
+
+ ldi I, -1(I)
+ ble I, $L23
+ .align 4
+
+$L22:
+ fillcs (PREFETCHSIZE + 0) * SIZE(A1)
+ ldi I, -1(I)
+ fillcs (PREFETCHSIZE + 0) * SIZE(A2)
+ ldi A2, 8 * SIZE(A2)
+
+ ADD y4, a0, y4
+ ST y0, 0 * SIZE(Y1)
+ MUL alpha2, a4, a4
+ LD a0, 8 * SIZE(A1)
+
+ ADD y5, a1, y5
+ ST y1, 1 * SIZE(Y1)
+ MUL alpha2, a5, a5
+ LD a1, 9 * SIZE(A1)
+
+ ADD y6, a2, y6
+ ST y2, 2 * SIZE(Y1)
+ MUL alpha2, a6, a6
+ LD a2, 10 * SIZE(A1)
+
+ ADD y7, a3, y7
+ ST y3, 3 * SIZE(Y1)
+ MUL alpha2, a7, a7
+ LD a3, 11 * SIZE(A1)
+
+ ADD y4, a4, y4
+ LD a4, 0 * SIZE(A2)
+ MUL alpha1, a0, a0
+ LD y0, 8 * SIZE(Y1)
+
+ ADD y5, a5, y5
+ LD a5, 1 * SIZE(A2)
+ MUL alpha1, a1, a1
+ LD y1, 9 * SIZE(Y1)
+
+ ADD y6, a6, y6
+ LD a6, 2 * SIZE(A2)
+ MUL alpha1, a2, a2
+ LD y2, 10 * SIZE(Y1)
+
+ ADD y7, a7, y7
+ LD a7, 3 * SIZE(A2)
+ MUL alpha1, a3, a3
+ LD y3, 11 * SIZE(Y1)
+
+ ADD y0, a0, y0
+ ST y4, 4 * SIZE(Y1)
+ MUL alpha2, a4, a4
+ LD a0, 12 * SIZE(A1)
+
+ ADD y1, a1, y1
+ ST y5, 5 * SIZE(Y1)
+ MUL alpha2, a5, a5
+ LD a1, 13 * SIZE(A1)
+
+ ADD y2, a2, y2
+ ST y6, 6 * SIZE(Y1)
+ MUL alpha2, a6, a6
+ LD a2, 14 * SIZE(A1)
+
+ ADD y3, a3, y3
+ ST y7, 7 * SIZE(Y1)
+ MUL alpha2, a7, a7
+ LD a3, 15 * SIZE(A1)
+
+ ADD y0, a4, y0
+ LD a4, 4 * SIZE(A2)
+ MUL alpha1, a0, a0
+ LD y4, 12 * SIZE(Y1)
+
+ ADD y1, a5, y1
+ LD a5, 5 * SIZE(A2)
+ MUL alpha1, a1, a1
+ LD y5, 13 * SIZE(Y1)
+
+ ADD y2, a6, y2
+ LD a6, 6 * SIZE(A2)
+ MUL alpha1, a2, a2
+ LD y6, 14 * SIZE(Y1)
+
+ ADD y3, a7, y3
+ LD a7, 7 * SIZE(A2)
+ MUL alpha1, a3, a3
+ LD y7, 15 * SIZE(Y1)
+
+ fillcs (PREFETCHSIZE + 0) * SIZE(Y1)
+ ldi A1, 8 * SIZE(A1)
+ ldi Y1, 8 * SIZE(Y1)
+ bgt I, $L22
+ .align 4
+
+$L23:
+ ADD y4, a0, y4
+ ST y0, 0 * SIZE(Y1)
+ MUL alpha2, a4, a4
+ unop
+
+ ADD y5, a1, y5
+ ST y1, 1 * SIZE(Y1)
+ MUL alpha2, a5, a5
+ unop
+
+ ADD y6, a2, y6
+ ST y2, 2 * SIZE(Y1)
+ MUL alpha2, a6, a6
+ unop
+
+ ADD y7, a3, y7
+ ST y3, 3 * SIZE(Y1)
+ MUL alpha2, a7, a7
+ unop
+
+ ADD y4, a4, y4
+ ADD y5, a5, y5
+ ADD y6, a6, y6
+ ADD y7, a7, y7
+
+ ST y4, 4 * SIZE(Y1)
+ ldi A1, 8 * SIZE(A1)
+ ST y5, 5 * SIZE(Y1)
+ ldi A2, 8 * SIZE(A2)
+
+ ST y6, 6 * SIZE(Y1)
+ unop
+ ST y7, 7 * SIZE(Y1)
+ ldi Y1, 8 * SIZE(Y1)
+ .align 4
+
+$L25:
+ and M, 4, I
+ ble I, $L26
+
+ LD y0, 0 * SIZE(Y1)
+ LD y1, 1 * SIZE(Y1)
+ LD y2, 2 * SIZE(Y1)
+ LD y3, 3 * SIZE(Y1)
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+ LD a2, 2 * SIZE(A1)
+ LD a3, 3 * SIZE(A1)
+
+ MUL alpha1, a0, a0
+ LD a4, 0 * SIZE(A2)
+ MUL alpha1, a1, a1
+ LD a5, 1 * SIZE(A2)
+ MUL alpha1, a2, a2
+ LD a6, 2 * SIZE(A2)
+ MUL alpha1, a3, a3
+ LD a7, 3 * SIZE(A2)
+
+ ADD y0, a0, y0
+ MUL alpha2, a4, a4
+ ADD y1, a1, y1
+ MUL alpha2, a5, a5
+ ADD y2, a2, y2
+ MUL alpha2, a6, a6
+ ADD y3, a3, y3
+ MUL alpha2, a7, a7
+
+ ADD y0, a4, y0
+ ldi Y1, 4 * SIZE(Y1)
+ ADD y1, a5, y1
+ unop
+ ADD y2, a6, y2
+ unop
+ ADD y3, a7, y3
+ unop
+
+ ST y0, -4 * SIZE(Y1)
+ ldi A1, 4 * SIZE(A1)
+ ST y1, -3 * SIZE(Y1)
+ ldi A2, 4 * SIZE(A2)
+ ST y2, -2 * SIZE(Y1)
+ ldi A3, 4 * SIZE(A3)
+ ST y3, -1 * SIZE(Y1)
+ ldi A4, 4 * SIZE(A4)
+ .align 4
+
+$L26:
+ and M, 2, I
+ ble I, $L27
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+ LD a2, 0 * SIZE(A2)
+ LD a3, 1 * SIZE(A2)
+
+ LD y0, 0 * SIZE(Y1)
+ LD y1, 1 * SIZE(Y1)
+
+ MUL alpha1, a0, a0
+ MUL alpha1, a1, a1
+ MUL alpha2, a2, a2
+ MUL alpha2, a3, a3
+
+ ADD y0, a0, y0
+ ldi A1, 2 * SIZE(A1)
+ ADD y1, a1, y1
+ ldi A2, 2 * SIZE(A2)
+ ADD y0, a2, y0
+ unop
+ ADD y1, a3, y1
+ unop
+
+ ST y0, 0 * SIZE(Y1)
+ unop
+ ST y1, 1 * SIZE(Y1)
+ ldi Y1, 2 * SIZE(Y1)
+ .align 4
+
+$L27:
+ blbc M, $L30
+
+ LD y0, 0 * SIZE(Y1)
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 0 * SIZE(A2)
+
+ MUL alpha1, a0, a0
+ MUL alpha2, a1, a1
+
+ ADD y0, a0, y0
+ ADD y0, a1, y0
+
+ ST y0, 0 * SIZE(Y1)
+ .align 4
+
+$L30:
+ blbc N, $L990
+
+ LD alpha1, 0 * SIZE(X)
+ mov A, A1
+ MUL alpha, alpha1, alpha1
+ mov Y, Y1
+
+ sra M, 3, I
+ ble I, $L35
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+ LD a2, 2 * SIZE(A1)
+ LD a3, 3 * SIZE(A1)
+ LD a4, 4 * SIZE(A1)
+ LD a5, 5 * SIZE(A1)
+ LD a6, 6 * SIZE(A1)
+ LD a7, 7 * SIZE(A1)
+
+ LD y0, 0 * SIZE(Y1)
+ LD y1, 1 * SIZE(Y1)
+ LD y2, 2 * SIZE(Y1)
+ LD y3, 3 * SIZE(Y1)
+ LD y4, 4 * SIZE(Y1)
+ LD y5, 5 * SIZE(Y1)
+ LD y6, 6 * SIZE(Y1)
+ LD y7, 7 * SIZE(Y1)
+
+ MUL alpha1, a0, a0
+ MUL alpha1, a1, a1
+ MUL alpha1, a2, a2
+ MUL alpha1, a3, a3
+
+ ldi I, -1(I)
+ ble I, $L33
+ .align 4
+
+$L32:
+ ADD y0, a0, y0
+ LD y4, 4 * SIZE(Y1)
+ MUL alpha1, a4, a4
+ LD a0, 8 * SIZE(A1)
+
+ ADD y1, a1, y1
+ LD y5, 5 * SIZE(Y1)
+ MUL alpha1, a5, a5
+ LD a1, 9 * SIZE(A1)
+
+ ADD y2, a2, y2
+ LD y6, 6 * SIZE(Y1)
+ MUL alpha1, a6, a6
+ LD a2, 10 * SIZE(A1)
+
+ ADD y3, a3, y3
+ LD y7, 7 * SIZE(Y1)
+ MUL alpha1, a7, a7
+ LD a3, 11 * SIZE(A1)
+
+ ST y0, 0 * SIZE(Y1)
+ ST y1, 1 * SIZE(Y1)
+ ST y2, 2 * SIZE(Y1)
+ ST y3, 3 * SIZE(Y1)
+
+ ADD y4, a4, y4
+ LD y0, 8 * SIZE(Y1)
+ MUL alpha1, a0, a0
+ LD a4, 12 * SIZE(A1)
+
+ ADD y5, a5, y5
+ LD y1, 9 * SIZE(Y1)
+ MUL alpha1, a1, a1
+ LD a5, 13 * SIZE(A1)
+
+ ADD y6, a6, y6
+ LD y2, 10 * SIZE(Y1)
+ MUL alpha1, a2, a2
+ LD a6, 14 * SIZE(A1)
+
+ ADD y7, a7, y7
+ LD y3, 11 * SIZE(Y1)
+ MUL alpha1, a3, a3
+ LD a7, 15 * SIZE(A1)
+
+ ST y4, 4 * SIZE(Y1)
+ ldi I, -1(I)
+ ST y5, 5 * SIZE(Y1)
+ ldi A1, 8 * SIZE(A1)
+
+ ST y6, 6 * SIZE(Y1)
+ fillcs (PREFETCHSIZE + 0) * SIZE(A1)
+ ST y7, 7 * SIZE(Y1)
+ fillcs (PREFETCHSIZE + 0) * SIZE(Y1)
+
+ ldi Y1, 8 * SIZE(Y1)
+ bgt I, $L32
+ .align 4
+
+$L33:
+ ADD y0, a0, y0
+ LD y4, 4 * SIZE(Y1)
+ MUL alpha1, a4, a4
+ unop
+
+ ADD y1, a1, y1
+ LD y5, 5 * SIZE(Y1)
+ MUL alpha1, a5, a5
+ unop
+
+ ADD y2, a2, y2
+ LD y6, 6 * SIZE(Y1)
+ MUL alpha1, a6, a6
+ unop
+
+ ADD y3, a3, y3
+ LD y7, 7 * SIZE(Y1)
+ MUL alpha1, a7, a7
+ unop
+
+ ADD y4, a4, y4
+ ST y0, 0 * SIZE(Y1)
+ ADD y5, a5, y5
+ ST y1, 1 * SIZE(Y1)
+ ADD y6, a6, y6
+ ST y2, 2 * SIZE(Y1)
+ ADD y7, a7, y7
+ ST y3, 3 * SIZE(Y1)
+
+ ST y4, 4 * SIZE(Y1)
+ unop
+ ST y5, 5 * SIZE(Y1)
+ unop
+
+ ST y6, 6 * SIZE(Y1)
+ ldi A1, 8 * SIZE(A1)
+ ST y7, 7 * SIZE(Y1)
+ ldi Y1, 8 * SIZE(Y1)
+ .align 4
+
+$L35:
+ and M, 4, I
+ ble I, $L36
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+ LD a2, 2 * SIZE(A1)
+ LD a3, 3 * SIZE(A1)
+
+ MUL alpha1, a0, a0
+ LD y0, 0 * SIZE(Y1)
+ MUL alpha1, a1, a1
+ LD y1, 1 * SIZE(Y1)
+ MUL alpha1, a2, a2
+ LD y2, 2 * SIZE(Y1)
+ MUL alpha1, a3, a3
+ LD y3, 3 * SIZE(Y1)
+
+ ADD y0, a0, y0
+ ADD y1, a1, y1
+ ADD y2, a2, y2
+ ADD y3, a3, y3
+
+ ST y0, 0 * SIZE(Y1)
+ ldi A1, 4 * SIZE(A1)
+ ST y1, 1 * SIZE(Y1)
+ ldi A2, 4 * SIZE(A2)
+ ST y2, 2 * SIZE(Y1)
+ unop
+ ST y3, 3 * SIZE(Y1)
+ ldi Y1, 4 * SIZE(Y1)
+ .align 4
+
+$L36:
+ and M, 2, I
+ ble I, $L37
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+
+ LD y0, 0 * SIZE(Y1)
+ MUL alpha1, a0, a0
+ LD y1, 1 * SIZE(Y1)
+ MUL alpha1, a1, a1
+
+ ADD y0, a0, y0
+ ADD y1, a1, y1
+
+ ST y0, 0 * SIZE(Y1)
+ ldi A1, 2 * SIZE(A1)
+ ST y1, 1 * SIZE(Y1)
+ ldi Y1, 2 * SIZE(Y1)
+ .align 4
+
+$L37:
+ blbc M, $L990
+
+ LD y0, 0 * SIZE(Y1)
+ LD a0, 0 * SIZE(A1)
+
+ MUL alpha1, a0, a0
+
+ ADD y0, a0, y0
+ ST y0, 0 * SIZE(Y1)
+ .align 4
+
+$L990:
+ cmpeq INCY, SIZE, $0
+ bne $0, $L999
+
+ mov BUFFER, Y1
+
+ sra M, 3, I
+ ble I, $L995
+ .align 4
+
+$L992:
+ LD a0, 0 * SIZE(BUFFER)
+ addl BUFFER, INCY, BUFFER
+ LD a1, 0 * SIZE(BUFFER)
+ addl BUFFER, INCY, BUFFER
+ LD a2, 0 * SIZE(BUFFER)
+ addl BUFFER, INCY, BUFFER
+ LD a3, 0 * SIZE(BUFFER)
+ addl BUFFER, INCY, BUFFER
+
+ LD y0, 0 * SIZE(Y)
+ LD y1, 1 * SIZE(Y)
+ LD y2, 2 * SIZE(Y)
+ LD y3, 3 * SIZE(Y)
+
+ LD a4, 0 * SIZE(BUFFER)
+ addl BUFFER, INCY, BUFFER
+ LD a5, 0 * SIZE(BUFFER)
+ addl BUFFER, INCY, BUFFER
+ LD a6, 0 * SIZE(BUFFER)
+ addl BUFFER, INCY, BUFFER
+ LD a7, 0 * SIZE(BUFFER)
+ addl BUFFER, INCY, BUFFER
+
+ LD y4, 4 * SIZE(Y)
+ LD y5, 5 * SIZE(Y)
+ LD y6, 6 * SIZE(Y)
+ LD y7, 7 * SIZE(Y)
+
+ ADD a0, y0, a0
+ ADD a1, y1, a1
+ ADD a2, y2, a2
+ ADD a3, y3, a3
+ ADD a4, y4, a4
+ ADD a5, y5, a5
+ ADD a6, y6, a6
+ ADD a7, y7, a7
+
+ ST a0, 0 * SIZE(Y1)
+ addl Y1, INCY, Y1
+ ST a1, 0 * SIZE(Y1)
+ addl Y1, INCY, Y1
+ ST a2, 0 * SIZE(Y1)
+ addl Y1, INCY, Y1
+ ST a3, 0 * SIZE(Y1)
+ addl Y1, INCY, Y1
+
+ ST a4, 0 * SIZE(Y1)
+ addl Y1, INCY, Y1
+ ST a5, 0 * SIZE(Y1)
+ addl Y1, INCY, Y1
+ ST a6, 0 * SIZE(Y1)
+ addl Y1, INCY, Y1
+ ST a7, 0 * SIZE(Y1)
+ addl Y1, INCY, Y1
+
+ ldi I, -1(I)
+ ldi Y, 8 * SIZE(Y)
+ bgt I, $L992
+ .align 4
+
+$L995:
+ and M, 7, I
+ ble I, $L999
+ .align 4
+
+$L996:
+ LD a0, 0 * SIZE(BUFFER)
+ addl BUFFER, INCY, BUFFER
+
+ LD y0, 0 * SIZE(Y)
+ ldi Y, 1 * SIZE(Y)
+
+ ADD a0, y0, a0
+
+ ST a0, 0 * SIZE(Y1)
+ addl Y1, INCY, Y1
+
+ ldi I, -1(I)
+ bgt I, $L996
+ .align 4
+
+$L999:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+
+ ldi $sp, STACKSIZE($sp)
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/gemv_t.S b/kernel/sw_64/gemv_t.S
new file mode 100644
index 0000000..4d8f130
--- /dev/null
+++ b/kernel/sw_64/gemv_t.S
@@ -0,0 +1,1222 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define STACKSIZE 72
+#define PREFETCHSIZE 32
+
+#define M $16
+#define N $17
+#define A $20
+#define LDA $21
+
+#define X $18
+#define INCX $19
+#define Y $22
+#define INCY $23
+
+#define BUFFER $24
+
+#define I $25
+#define J $27
+
+#define X1 $3
+#define Y1 $4
+
+#define A1 $5
+#define A2 $6
+#define A3 $7
+#define A4 $8
+
+#define alpha $f19
+#define f20 $f20
+
+#define s0 $f0
+#define s1 $f1
+#define s2 $f10
+#define s3 $f11
+
+#define t0 $f12
+#define t1 $f13
+#define t2 $f14
+#define t3 $f15
+
+#define x0 $f16
+#define x1 $f17
+#define x2 $f18
+#define x3 $f21
+
+#define a0 $f22
+#define a1 $f23
+#define a2 $f24
+#define a3 $f25
+#define a4 $f26
+#define a5 $f27
+#define a6 $f28
+#define a7 $f29
+
+#define a8 $f2
+#define a9 $f3
+#define a10 $f4
+#define a11 $f5
+#define a12 $f6
+#define a13 $f7
+#define a14 $f8
+#define a15 $f9
+
+ PROLOGUE
+
+ ldi $sp, -STACKSIZE($sp)
+ ldl X, 0 + STACKSIZE($sp)
+ ldl INCX, 8 + STACKSIZE($sp)
+ ldl Y, 16 + STACKSIZE($sp)
+ ldl INCY, 24 + STACKSIZE($sp)
+ ldl BUFFER, 32 + STACKSIZE($sp)
+
+ fstd $f2, 0($sp)
+ fstd $f3, 8($sp)
+ fstd $f4, 16($sp)
+ fstd $f5, 24($sp)
+ fstd $f6, 32($sp)
+ fstd $f7, 40($sp)
+ fstd $f8, 48($sp)
+ fstd $f9, 56($sp)
+ fstd f20, 64($sp)
+
+ PROFCODE
+
+ cmple M, 0, $0
+ SXADDQ INCX, 0, INCX
+ cmple N, 0, $1
+ SXADDQ INCY, 0, INCY
+
+ or $0, $1, $0
+ bne $0, $L999
+
+ cmpeq INCX, SIZE, $0
+ mov X, X1
+ SXADDQ LDA, 0, LDA
+ bne $0, $L10
+
+ sra M, 3, I
+ mov BUFFER, Y1
+ mov BUFFER, X
+ ble I, $L05
+ .align 4
+
+$L02:
+ ldw $31, (PREFETCHSIZE + 0) * SIZE(X1)
+ ldi I, -1(I)
+
+ LD a0, 0 * SIZE(X1)
+ addl X1, INCX, X1
+ LD a1, 0 * SIZE(X1)
+ addl X1, INCX, X1
+ LD a2, 0 * SIZE(X1)
+ addl X1, INCX, X1
+ LD a3, 0 * SIZE(X1)
+ addl X1, INCX, X1
+
+ ST a0, 0 * SIZE(Y1)
+ ST a1, 1 * SIZE(Y1)
+ ST a2, 2 * SIZE(Y1)
+ ST a3, 3 * SIZE(Y1)
+
+ LD a4, 0 * SIZE(X1)
+ addl X1, INCX, X1
+ LD a5, 0 * SIZE(X1)
+ addl X1, INCX, X1
+ LD a6, 0 * SIZE(X1)
+ addl X1, INCX, X1
+ LD a7, 0 * SIZE(X1)
+ addl X1, INCX, X1
+
+ ST a4, 4 * SIZE(Y1)
+ ST a5, 5 * SIZE(Y1)
+ ST a6, 6 * SIZE(Y1)
+ ST a7, 7 * SIZE(Y1)
+
+ ldi Y1, 8 * SIZE(Y1)
+ bgt I, $L02
+ .align 4
+
+$L05:
+ and M, 7, I
+ ble I, $L10
+ .align 4
+
+$L06:
+ LD a0, 0 * SIZE(X1)
+ addl X1, INCX, X1
+ ST a0, 0 * SIZE(Y1)
+ addl Y1, SIZE, Y1
+
+ ldi I, -1(I)
+ bgt I, $L06
+ .align 4
+
+$L10:
+ mov Y, Y1
+ fclr t0
+ unop
+ fclr t1
+
+ sra N, 2, J
+ fclr t2
+ fclr t3
+ ble J, $L20
+ .align 4
+
+$L11:
+ mov A, A1
+ fclr s0
+ addl A, LDA, A2
+ fclr s1
+
+ addl A2, LDA, A3
+ fclr s2
+ addl A3, LDA, A4
+ fclr s3
+
+ s4addl LDA, A, A
+ unop
+ mov X, X1
+ flds $f31, 3 * SIZE(Y)
+
+ sra M, 3, I
+ ble I, $L15
+
+ LD x0, 0 * SIZE(X1)
+ LD x1, 1 * SIZE(X1)
+ LD x2, 2 * SIZE(X1)
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 0 * SIZE(A2)
+ LD a2, 0 * SIZE(A3)
+ LD a3, 0 * SIZE(A4)
+ LD a4, 1 * SIZE(A1)
+ LD a5, 1 * SIZE(A2)
+ LD a6, 1 * SIZE(A3)
+ LD a7, 1 * SIZE(A4)
+ LD a8, 2 * SIZE(A1)
+ LD a9, 2 * SIZE(A2)
+ LD a10, 2 * SIZE(A3)
+ LD a11, 2 * SIZE(A4)
+ LD a12, 3 * SIZE(A1)
+ LD a13, 3 * SIZE(A2)
+ LD a14, 3 * SIZE(A3)
+ LD a15, 3 * SIZE(A4)
+
+ ldi I, -1(I)
+ ble I, $L13
+ .align 4
+
+$L12:
+ ADD s0, t0, f20
+ fmov f20,s0
+ LD x3, 3 * SIZE(X1)
+ MUL x0, a0, t0
+
+ ADD s1, t1, f20
+ fmov f20,s1
+ ldw $31, (PREFETCHSIZE + 0) * SIZE(A1)
+ MUL x0, a1, t1
+ LD a1, 4 * SIZE(A2)
+
+ ADD s2, t2, f20
+ fmov f20,s2
+ unop
+ MUL x0, a2, t2
+ LD a2, 4 * SIZE(A3)
+
+ ADD s3, t3, f20
+ fmov f20, s3
+ LD a0, 4 * SIZE(A1)
+ unop
+ MUL x0, a3, t3
+ LD a3, 4 * SIZE(A4)
+
+ ADD s0, t0, f20
+ fmov f20,s0
+ LD x0, 4 * SIZE(X1)
+ MUL x1, a4, t0
+
+ ADD s1, t1, f20
+ fmov f20, s1
+ LD a4, 5 * SIZE(A1)
+ ldi A1, 8 * SIZE(A1)
+ MUL x1, a5, t1
+ LD a5, 5 * SIZE(A2)
+
+ ADD s2, t2, f20
+ fmov f20,s2
+ #unop
+ MUL x1, a6, t2
+ LD a6, 5 * SIZE(A3)
+
+ ADD s3, t3, f20
+ fmov f20,s3
+ #unop
+ MUL x1, a7, t3
+ LD a7, 5 * SIZE(A4)
+
+ ADD s0, t0, f20
+ fmov f20,s0
+ LD x1, 5 * SIZE(X1)
+ MUL x2, a8, t0
+ LD a8, -2 * SIZE(A1)
+
+ ADD s1, t1, f20
+ fmov f20,s1
+ ldw $31, (PREFETCHSIZE + 0) * SIZE(A2)
+ MUL x2, a9, t1
+ LD a9, 6 * SIZE(A2)
+
+ ADD s2, t2, f20
+ fmov f20,s2
+ ldi A2, 8 * SIZE(A2)
+ MUL x2, a10, t2
+ LD a10, 6 * SIZE(A3)
+
+ ADD s3, t3, f20
+ fmov f20,s3
+ ldi A3, 8 * SIZE(A3)
+ MUL x2, a11, t3
+ LD a11, 6 * SIZE(A4)
+
+ ADD s0, t0, f20
+ fmov f20,s0
+ LD x2, 6 * SIZE(X1)
+ MUL x3, a12, t0
+ LD a12, -1 * SIZE(A1)
+
+ ADD s1, t1, f20
+ fmov f20,s1
+ ldi A4, 8 * SIZE(A4)
+ MUL x3, a13, t1
+ LD a13, -1 * SIZE(A2)
+
+ ADD s2, t2, f20
+ fmov f20,s2
+ unop
+ MUL x3, a14, t2
+ LD a14, -1 * SIZE(A3)
+
+ ADD s3, t3, f20
+ fmov f20,s3
+ unop
+ MUL x3, a15, t3
+ LD a15, -1 * SIZE(A4)
+
+ ADD s0, t0, f20
+ fmov f20,s0
+ LD x3, 7 * SIZE(X1)
+ MUL x0, a0, t0
+ LD a0, 0 * SIZE(A1)
+
+ ADD s1, t1, f20
+ fmov f20,s1
+ ldw $31, (PREFETCHSIZE - 8) * SIZE(A3)
+ MUL x0, a1, t1
+ LD a1, 0 * SIZE(A2)
+
+ ADD s2, t2, f20
+ fmov f20,s2
+ unop
+ MUL x0, a2, t2
+ LD a2, 0 * SIZE(A3)
+
+ ADD s3, t3, f20
+ fmov f20,s3
+ unop
+ MUL x0, a3, t3
+ LD a3, 0 * SIZE(A4)
+
+ ADD s0, t0, f20
+ fmov f20,s0
+ LD x0, 8 * SIZE(X1)
+ MUL x1, a4, t0
+ LD a4, 1 * SIZE(A1)
+
+ ADD s1, t1, f20
+ fmov f20,s1
+ unop
+ MUL x1, a5, t1
+ LD a5, 1 * SIZE(A2)
+
+ ADD s2, t2, f20
+ fmov f20,s2
+ unop
+ MUL x1, a6, t2
+ LD a6, 1 * SIZE(A3)
+
+ ADD s3, t3, f20
+ fmov f20,s3
+ unop
+ MUL x1, a7, t3
+ LD a7, 1 * SIZE(A4)
+
+ ADD s0, t0, f20
+ fmov f20,s0
+ LD x1, 9 * SIZE(X1)
+ MUL x2, a8, t0
+ LD a8, 2 * SIZE(A1)
+
+ ADD s1, t1, f20
+ fmov f20,s1
+ ldw $31, (PREFETCHSIZE - 8) * SIZE(A4)
+ MUL x2, a9, t1
+ LD a9, 2 * SIZE(A2)
+
+ ADD s2, t2, f20
+ fmov f20,s2
+ ldi X1, 8 * SIZE(X1)
+ MUL x2, a10, t2
+ LD a10, 2 * SIZE(A3)
+
+ ADD s3, t3, f20
+ fmov f20,s3
+ ldi I, -1(I)
+ MUL x2, a11, t3
+ LD a11, 2 * SIZE(A4)
+
+ ADD s0, t0, f20
+ fmov f20,s0
+ LD x2, 2 * SIZE(X1)
+ MUL x3, a12, t0
+ LD a12, 3 * SIZE(A1)
+
+ ADD s1, t1, f20
+ fmov f20,s1
+ ldw $31, (PREFETCHSIZE - 8) * SIZE(X1)
+ MUL x3, a13, t1
+ LD a13, 3 * SIZE(A2)
+
+ ADD s2, t2, f20
+ fmov f20,s2
+ unop
+ MUL x3, a14, t2
+ LD a14, 3 * SIZE(A3)
+
+ ADD s3, t3, f20
+ fmov f20,s3
+ MUL x3, a15, t3
+ LD a15, 3 * SIZE(A4)
+ bgt I, $L12
+ .align 4
+
+$L13:
+ ADD s0, t0, f20
+ fmov f20,s0
+ LD x3, 3 * SIZE(X1)
+ MUL x0, a0, t0
+ LD a0, 4 * SIZE(A1)
+
+ ADD s1, t1, f20
+ fmov f20,s1
+ #unop
+ MUL x0, a1, t1
+ LD a1, 4 * SIZE(A2)
+
+ ADD s2, t2, f20
+ fmov f20,s2
+ #unop
+ MUL x0, a2, t2
+ LD a2, 4 * SIZE(A3)
+
+ ADD s3, t3, f20
+ fmov f20,s3
+ #unop
+ MUL x0, a3, t3
+ LD a3, 4 * SIZE(A4)
+
+ ADD s0, t0, x0
+ fmov x0,s0
+ LD x0, 4 * SIZE(X1)
+ MUL x1, a4, t0
+ LD a4, 5 * SIZE(A1)
+
+ ADD s1, t1, f20
+ fmov f20,s1
+ #unop
+ MUL x1, a5, t1
+ LD a5, 5 * SIZE(A2)
+
+ ADD s2, t2, f20
+ fmov f20,s2
+ #unop
+ MUL x1, a6, t2
+ LD a6, 5 * SIZE(A3)
+
+ ADD s3, t3, f20
+ fmov f20,s3
+ #unop
+ MUL x1, a7, t3
+ LD a7, 5 * SIZE(A4)
+
+ ADD s0, t0, f20
+ fmov f20,s0
+ LD x1, 5 * SIZE(X1)
+ MUL x2, a8, t0
+ LD a8, 6 * SIZE(A1)
+
+ ADD s1, t1, f20
+ fmov f20,s1
+ #unop
+ MUL x2, a9, t1
+ LD a9, 6 * SIZE(A2)
+
+ ADD s2, t2, f20
+ fmov f20,s2
+ #unop
+ MUL x2, a10, t2
+ LD a10, 6 * SIZE(A3)
+
+ ADD s3, t3, f20
+ fmov f20,s3
+ #unop
+ MUL x2, a11, t3
+ LD a11, 6 * SIZE(A4)
+
+ ADD s0, t0, f20
+ fmov f20,s0
+ LD x2, 6 * SIZE(X1)
+ MUL x3, a12, t0
+ LD a12, 7 * SIZE(A1)
+
+ ADD s1, t1, f20
+ fmov f20,s1
+ ldi A1, 8 * SIZE(A1)
+ MUL x3, a13, t1
+ LD a13, 7 * SIZE(A2)
+
+ ADD s2, t2, f20
+ fmov f20,s2
+ ldi A2, 8 * SIZE(A2)
+ MUL x3, a14, t2
+ LD a14, 7 * SIZE(A3)
+
+ ADD s3, t3, f20
+ fmov f20,s3
+ ldi A3, 8 * SIZE(A3)
+ MUL x3, a15, t3
+ LD a15, 7 * SIZE(A4)
+
+ ADD s0, t0, f20
+ fmov f20,s0
+ LD x3, 7 * SIZE(X1)
+ MUL x0, a0, t0
+ unop
+
+ ADD s1, t1, f20
+ fmov f20,s1
+ ldi X1, 8 * SIZE(X1)
+ MUL x0, a1, t1
+ ldi A4, 8 * SIZE(A4)
+
+ ADD s2, t2, f20
+ fmov f20,s2
+ MUL x0, a2, t2
+ ADD s3, t3, f20
+ fmov f20,s3
+ MUL x0, a3, t3
+
+ ADD s0, t0, f20
+ fmov f20,s0
+ MUL x1, a4, t0
+ ADD s1, t1, f20
+ fmov f20,s1
+ MUL x1, a5, t1
+
+ ADD s2, t2, f20
+ fmov f20,s2
+ MUL x1, a6, t2
+ ADD s3, t3, f20
+ fmov f20,s3
+ MUL x1, a7, t3
+
+ ADD s0, t0, f20
+ fmov f20,s0
+ MUL x2, a8, t0
+ ADD s1, t1, f20
+ fmov f20,s1
+ MUL x2, a9, t1
+
+ ADD s2, t2, f20
+ fmov f20,s2
+ MUL x2, a10, t2
+ ADD s3, t3, f20
+ fmov f20,s3
+ MUL x2, a11, t3
+
+ ADD s0, t0, f20
+ fmov f20,s0
+ MUL x3, a12, t0
+ ADD s1, t1, f20
+ fmov f20,s1
+ MUL x3, a13, t1
+
+ ADD s2, t2, f20
+ fmov f20,s2
+ MUL x3, a14, t2
+ ADD s3, t3, f20
+ fmov f20,s3
+ MUL x3, a15, t3
+ .align 4
+
+$L15:
+ and M, 7, I
+ ble I, $L18
+
+ LD x0, 0 * SIZE(X1)
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 0 * SIZE(A2)
+ LD a2, 0 * SIZE(A3)
+ LD a3, 0 * SIZE(A4)
+
+ ldi I, -1(I)
+ ble I, $L17
+ .align 4
+
+$L16:
+	ADD s0, t0, f20
+ fmov f20,s0
+ ldi A4, 1 * SIZE(A4)
+ MUL x0, a0, t0
+ LD a0, 1 * SIZE(A1)
+
+ ADD s1, t1, f20
+ fmov f20,s1
+ ldi A1, 1 * SIZE(A1)
+ MUL x0, a1, t1
+ LD a1, 1 * SIZE(A2)
+
+ ADD s2, t2, f20
+ fmov f20,s2
+ ldi A2, 1 * SIZE(A2)
+ MUL x0, a2, t2
+ LD a2, 1 * SIZE(A3)
+
+ ADD s3, t3, f20
+ fmov f20,s3
+ ldi A3, 1 * SIZE(A3)
+ MUL x0, a3, t3
+ LD a3, 0 * SIZE(A4)
+
+ LD x0, 1 * SIZE(X1)
+ ldi X1, 1 * SIZE(X1)
+ ldi I, -1(I)
+ bgt I, $L16
+ .align 4
+
+$L17:
+	ADD s0, t0, f20
+ fmov f20,s0
+ MUL x0, a0, t0
+ ADD s1, t1, f20
+ fmov f20,s1
+ MUL x0, a1, t1
+
+ ADD s2, t2, f20
+ fmov f20,s2
+ MUL x0, a2, t2
+ ADD s3, t3, f20
+ fmov f20,s3
+ MUL x0, a3, t3
+ .align 4
+
+$L18:
+ LD a0, 0 * SIZE(Y)
+ addl Y, INCY, Y
+ LD a1, 0 * SIZE(Y)
+ addl Y, INCY, Y
+ LD a2, 0 * SIZE(Y)
+ addl Y, INCY, Y
+ LD a3, 0 * SIZE(Y)
+ addl Y, INCY, Y
+
+	ADD s0, t0, f20
+ fmov f20,s0
+ ADD s1, t1, f20
+ fmov f20,s1
+ ADD s2, t2, f20
+ fmov f20,s2
+ ADD s3, t3, f20
+ fmov f20,s3
+
+	MUL alpha, s0, f20
+ fmov f20,s0
+ MUL alpha, s1, f20
+ fmov f20,s1
+ MUL alpha, s2, f20
+ fmov f20,s2
+ MUL alpha, s3, f20
+ fmov f20,s3
+
+	ADD a0, s0, f20
+ fmov f20,a0
+ fclr t0
+ ADD a1, s1, f20
+ fmov f20,a1
+ fclr t1
+ ADD a2, s2, f20
+ fmov f20,a2
+ fclr t2
+ ADD a3, s3, f20
+ fmov f20,a3
+ fclr t3
+
+ ST a0, 0 * SIZE(Y1)
+ addl Y1, INCY, Y1
+ ST a1, 0 * SIZE(Y1)
+ addl Y1, INCY, Y1
+ ST a2, 0 * SIZE(Y1)
+ addl Y1, INCY, Y1
+ ST a3, 0 * SIZE(Y1)
+ addl Y1, INCY, Y1
+
+ ldi J, -1(J)
+ bgt J, $L11
+ .align 4
+
+$L20:
+ and N, 2, J
+ ble J, $L30
+ mov A, A1
+ addl A, LDA, A2
+
+ addl A2, LDA, A
+ fclr s0
+ mov X, X1
+ fclr s1
+
+ sra M, 3, I
+ fclr s2
+ fclr s3
+ ble I, $L25
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 0 * SIZE(A2)
+ LD a2, 1 * SIZE(A1)
+ LD a3, 1 * SIZE(A2)
+ LD a4, 2 * SIZE(A1)
+ LD a5, 2 * SIZE(A2)
+ LD a6, 3 * SIZE(A1)
+ LD a7, 3 * SIZE(A2)
+
+ LD a8, 4 * SIZE(A1)
+ LD a9, 4 * SIZE(A2)
+ LD a10, 5 * SIZE(A1)
+ LD a11, 5 * SIZE(A2)
+ LD a12, 6 * SIZE(A1)
+ LD a13, 6 * SIZE(A2)
+ LD a14, 7 * SIZE(A1)
+ LD a15, 7 * SIZE(A2)
+
+ LD x0, 0 * SIZE(X1)
+ LD x1, 1 * SIZE(X1)
+ LD x2, 2 * SIZE(X1)
+
+ ldi I, -1(I)
+ ble I, $L23
+ .align 4
+
+$L22:
+ ADD s0, t0, x3
+ fmov x3,s0
+ LD x3, 3 * SIZE(X1)
+ MUL x0, a0, t0
+ LD a0, 8 * SIZE(A1)
+
+ ADD s1, t1, f20
+ fmov f20,s1
+ ldw $31, (PREFETCHSIZE + 0) * SIZE(A1)
+ MUL x0, a1, t1
+ LD a1, 8 * SIZE(A2)
+
+ ADD s0, t2, x0
+ fmov x0,s0
+ LD x0, 4 * SIZE(X1)
+ MUL x1, a2, t2
+ LD a2, 9 * SIZE(A1)
+
+ ADD s1, t3, f20
+ fmov f20,s1
+ #unop
+ MUL x1, a3, t3
+ LD a3, 9 * SIZE(A2)
+
+ ADD s0, t0, f20
+ fmov f20,s0
+ LD x1, 5 * SIZE(X1)
+ MUL x2, a4, t0
+ LD a4, 10 * SIZE(A1)
+
+ ADD s1, t1, f20
+ fmov f20,s1
+ ldi I, -1(I)
+ MUL x2, a5, t1
+ LD a5, 10 * SIZE(A2)
+
+ ADD s0, t2, f20
+ fmov f20,s0
+ LD x2, 6 * SIZE(X1)
+ MUL x3, a6, t2
+ LD a6, 11 * SIZE(A1)
+
+ ADD s1, t3, f20
+ fmov f20,s1
+ ldi X1, 8 * SIZE(X1)
+ MUL x3, a7, t3
+ LD a7, 11 * SIZE(A2)
+
+ ADD s0, t0, f20
+ fmov f20,s0
+ LD x3, -1 * SIZE(X1)
+ MUL x0, a8, t0
+ LD a8, 12 * SIZE(A1)
+
+ ADD s1, t1, f20
+ fmov f20,s1
+ ldw $31, (PREFETCHSIZE + 0) * SIZE(A2)
+ MUL x0, a9, t1
+ LD a9, 12 * SIZE(A2)
+
+ ADD s0, t0, f20
+ fmov f20,s0
+ LD x0, 0 * SIZE(X1)
+ MUL x1, a10, t0
+ LD a10, 13 * SIZE(A1)
+
+ ADD s1, t1, f20
+ fmov f20,s1
+ ldi A1, 8 * SIZE(A1)
+ MUL x1, a11, t1
+ LD a11, 13 * SIZE(A2)
+
+ ADD s0, t0, f20
+ fmov f20,s0
+ LD x1, 1 * SIZE(X1)
+ MUL x2, a12, t0
+ LD a12, 6 * SIZE(A1)
+
+ ADD s1, t1, f20
+ fmov f20,s1
+ MUL x2, a13, t1
+ LD a13, 14 * SIZE(A2)
+ ldi A2, 8 * SIZE(A2)
+
+ ADD s0, t0, f20
+ fmov f20,s0
+ LD x2, 2 * SIZE(X1)
+ MUL x3, a14, t0
+ LD a14, 7 * SIZE(A1)
+
+ ADD s1, t1, f20
+ fmov f20,s1
+ MUL x3, a15, t1
+ LD a15, 7 * SIZE(A2)
+ bgt I, $L22
+ .align 4
+
+$L23:
+ ADD s0, t0, f20
+ fmov f20,s0
+ LD x3, 3 * SIZE(X1)
+ MUL x0, a0, t0
+ ldi A1, 8 * SIZE(A1)
+
+ ADD s1, t1, f20
+ fmov f20,s1
+ unop
+ MUL x0, a1, t1
+ unop
+
+ ADD s0, t2, f20
+ fmov f20,s0
+ LD x0, 4 * SIZE(X1)
+ MUL x1, a2, t2
+ ldi A2, 8 * SIZE(A2)
+
+ ADD s1, t3, f20
+ fmov f20,s1
+ unop
+ MUL x1, a3, t3
+ unop
+
+ ADD s0, t0, f20
+ fmov f20,s0
+ LD x1, 5 * SIZE(X1)
+ MUL x2, a4, t0
+ unop
+
+ ADD s1, t1, f20
+ fmov f20,s1
+ unop
+ MUL x2, a5, t1
+ unop
+
+ ADD s0, t2, f20
+ fmov f20,s0
+ LD x2, 6 * SIZE(X1)
+ MUL x3, a6, t2
+ unop
+
+ ADD s1, t3, f20
+ fmov f20,s1
+ unop
+ MUL x3, a7, t3
+ unop
+
+ ADD s0, t0, f20
+ fmov f20,s0
+ LD x3, 7 * SIZE(X1)
+ MUL x0, a8, t0
+ ldi X1, 8 * SIZE(X1)
+
+ ADD s1, t1, f20
+ fmov f20,s1
+ unop
+ MUL x0, a9, t1
+ unop
+
+ ADD s0, t0, f20
+ fmov f20,s0
+ MUL x1, a10, t0
+ ADD s1, t1, f20
+ fmov f20,s1
+ MUL x1, a11, t1
+
+ ADD s0, t0, f20
+ fmov f20,s0
+ MUL x2, a12, t0
+ ADD s1, t1, f20
+ fmov f20,s1
+ MUL x2, a13, t1
+
+ ADD s0, t0, f20
+ fmov f20,s0
+ MUL x3, a14, t0
+ ADD s1, t1, f20
+ fmov f20,s1
+ MUL x3, a15, t1
+ .align 4
+
+$L25:
+ and M, 7, I
+ ble I, $L28
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 0 * SIZE(A2)
+ LD x0, 0 * SIZE(X1)
+
+ ldi I, -1(I)
+ ble I, $L27
+ .align 4
+
+$L26:
+	ADD s0, t0, f20
+ fmov f20,s0
+ ldi A2, 1 * SIZE(A2)
+ MUL x0, a0, t0
+ LD a0, 1 * SIZE(A1)
+
+	ADD s1, t1, f20
+ fmov f20,s1
+ ldi A1, 1 * SIZE(A1)
+ MUL x0, a1, t1
+ LD a1, 0 * SIZE(A2)
+
+ LD x0, 1 * SIZE(X1)
+ ldi X1, 1 * SIZE(X1)
+ ldi I, -1(I)
+ bgt I, $L26
+ .align 4
+
+$L27:
+ ADD s0, t0, f20
+ fmov f20,s0
+ MUL x0, a0, t0
+ ADD s1, t1, f20
+ fmov f20,s1
+ MUL x0, a1, t1
+ .align 4
+
+$L28:
+ LD a0, 0 * SIZE(Y)
+ addl Y, INCY, Y
+ LD a1, 0 * SIZE(Y)
+ addl Y, INCY, Y
+
+ ADD s0, t0, f20
+ fmov f20,s0
+ ADD s1, t1, f20
+ fmov f20,s1
+ ADD s2, t2, f20
+ fmov f20,s2
+ ADD s3, t3, f20
+ fmov f20,s3
+
+ ADD s0, s2, f20
+ fmov f20,s0
+ ADD s1, s3, f20
+ fmov f20,s1
+
+ MUL alpha, s0, f20
+ fmov f20,s0
+	MUL alpha, s1, f20
+ fmov f20,s1
+
+ ADD a0, s0, f20
+ fmov f20,a0
+ ADD a1, s1, f20
+ fmov f20,a1
+
+ ST a0, 0 * SIZE(Y1)
+ fclr t0
+ addl Y1, INCY, Y1
+ fclr t1
+
+ ST a1, 0 * SIZE(Y1)
+ fclr t2
+ addl Y1, INCY, Y1
+ fclr t3
+ .align 4
+
+$L30:
+ blbc N, $L999
+
+ mov A, A1
+ fclr s0
+ mov X, X1
+ fclr s1
+
+ sra M, 3, I
+ fclr s2
+ fclr s3
+ ble I, $L35
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+ LD a8, 0 * SIZE(X1)
+ LD a9, 1 * SIZE(X1)
+
+ LD a2, 2 * SIZE(A1)
+ LD a3, 3 * SIZE(A1)
+ LD a10, 2 * SIZE(X1)
+ LD a11, 3 * SIZE(X1)
+
+ LD a4, 4 * SIZE(A1)
+ LD a5, 5 * SIZE(A1)
+ LD a12, 4 * SIZE(X1)
+ LD a13, 5 * SIZE(X1)
+
+ LD a6, 6 * SIZE(A1)
+ LD a7, 7 * SIZE(A1)
+ LD a14, 6 * SIZE(X1)
+
+ ldi I, -1(I)
+ ble I, $L33
+ .align 4
+
+$L32:
+ ADD s0, t0, f20
+ fmov f20,s0
+ LD a15, 7 * SIZE(X1)
+ MUL a0, a8, f20
+ fmov f20,t0
+ LD a0, 8 * SIZE(A1)
+
+ ADD s1, t1, f20
+ fmov f20,s1
+ LD a8, 8 * SIZE(X1)
+ MUL a1, a9, t1
+ LD a1, 9 * SIZE(A1)
+
+ ADD s2, t2, f20
+ fmov f20,s2
+ LD a9, 9 * SIZE(X1)
+ MUL a2, a10, t2
+ LD a2, 10 * SIZE(A1)
+
+ ADD s3, t3, f20
+ fmov f20,s3
+ LD a10, 10 * SIZE(X1)
+ MUL a3, a11, t3
+ LD a3, 11 * SIZE(A1)
+
+ ADD s0, t0, f20
+ fmov f20,s0
+ LD a11, 11 * SIZE(X1)
+ MUL a4, a12, t0
+ LD a4, 12 * SIZE(A1)
+
+ ADD s1, t1, f20
+ fmov f20,s1
+ LD a12, 12 * SIZE(X1)
+ MUL a5, a13, t1
+ LD a5, 13 * SIZE(A1)
+
+ ADD s2, t2, f20
+ fmov f20,s2
+ LD a13, 13 * SIZE(X1)
+ MUL a6, a14, t2
+ LD a6, 14 * SIZE(A1)
+
+ ADD s3, t3, f20
+ fmov f20,s3
+ LD a14, 14 * SIZE(X1)
+ MUL a7, a15, t3
+ LD a7, 15 * SIZE(A1)
+
+ ldi A1, 8 * SIZE(A1)
+ ldi I, -1(I)
+ ldi X1, 8 * SIZE(X1)
+ bgt I, $L32
+ .align 4
+
+$L33:
+ ADD s0, t0, f20
+ fmov f20,s0
+ LD a15, 7 * SIZE(X1)
+ MUL a0, a8, t0
+ ldi A1, 8 * SIZE(A1)
+
+ ADD s1, t1, f20
+ fmov f20,s1
+ unop
+ MUL a1, a9, t1
+ ldi X1, 8 * SIZE(X1)
+
+ ADD s2, t2, f20
+ fmov f20,s2
+ MUL a2, a10, t2
+ ADD s3, t3, f20
+ fmov f20,s3
+ MUL a3, a11, t3
+
+ ADD s0, t0, f20
+ fmov f20,s0
+ MUL a4, a12, t0
+ ADD s1, t1, f20
+ fmov f20,s1
+ MUL a5, a13, t1
+
+ ADD s2, t2, f20
+ fmov f20,s2
+ MUL a6, a14, t2
+ ADD s3, t3, f20
+ fmov f20,s3
+ MUL a7, a15, t3
+ .align 4
+
+$L35:
+ and M, 7, I
+ ble I, $L38
+
+ LD a0, 0 * SIZE(A1)
+ LD x0, 0 * SIZE(X1)
+
+ ldi I, -1(I)
+ ble I, $L37
+ .align 4
+
+$L36:
+	ADD s0, t0, f20
+ fmov f20,s0
+ MUL x0, a0, t0
+ LD a0, 1 * SIZE(A1)
+ LD x0, 1 * SIZE(X1)
+
+ ldi A1, 1 * SIZE(A1)
+ ldi X1, 1 * SIZE(X1)
+ ldi I, -1(I)
+ bgt I, $L36
+ .align 4
+
+$L37:
+	ADD s0, t0, f20
+ fmov f20,s0
+ MUL x0, a0, t0
+ .align 4
+
+$L38:
+ LD a0, 0 * SIZE(Y)
+
+	ADD s0, t0, f20
+ fmov f20,s0
+ ADD s1, t1, f20
+ fmov f20,s1
+ ADD s2, t2, f20
+ fmov f20,s2
+ ADD s3, t3, f20
+ fmov f20,s3
+
+ ADD s0, s2, f20
+ fmov f20,s0
+ ADD s1, s3, f20
+ fmov f20,s1
+ ADD s0, s1, f20
+ fmov f20,s0
+
+ MUL alpha, s0, f20
+ fmov f20,s0
+ ADD a0, s0, f20
+ fmov f20,a0
+
+ ST a0, 0 * SIZE(Y1)
+ .align 4
+
+$L999:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+ fldd f20, 64($sp)
+
+ ldi $sp, STACKSIZE($sp)
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/gemv_t.S.bak b/kernel/sw_64/gemv_t.S.bak
new file mode 100644
index 0000000..068e463
--- /dev/null
+++ b/kernel/sw_64/gemv_t.S.bak
@@ -0,0 +1,1061 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define STACKSIZE 64
+#define PREFETCHSIZE 32
+
+#define M $16
+#define N $17
+#define A $20
+#define LDA $21
+
+#define X $18
+#define INCX $19
+#define Y $22
+#define INCY $23
+
+#define BUFFER $24
+
+#define I $25
+#define J $27
+
+#define X1 $3
+#define Y1 $4
+
+#define A1 $5
+#define A2 $6
+#define A3 $7
+#define A4 $8
+
+#define alpha $f19
+
+#define s0 $f0
+#define s1 $f1
+#define s2 $f10
+#define s3 $f11
+
+#define t0 $f12
+#define t1 $f13
+#define t2 $f14
+#define t3 $f15
+
+#define x0 $f16
+#define x1 $f17
+#define x2 $f18
+#define x3 $f21
+
+#define a0 $f22
+#define a1 $f23
+#define a2 $f24
+#define a3 $f25
+#define a4 $f26
+#define a5 $f27
+#define a6 $f28
+#define a7 $f29
+
+#define a8 $f2
+#define a9 $f3
+#define a10 $f4
+#define a11 $f5
+#define a12 $f6
+#define a13 $f7
+#define a14 $f8
+#define a15 $f9
+
+ PROLOGUE
+
+ ldi $sp, -STACKSIZE($sp)
+ ldl X, 0 + STACKSIZE($sp)
+ ldl INCX, 8 + STACKSIZE($sp)
+ ldl Y, 16 + STACKSIZE($sp)
+ ldl INCY, 24 + STACKSIZE($sp)
+ ldl BUFFER, 32 + STACKSIZE($sp)
+
+ fstd $f2, 0($sp)
+ fstd $f3, 8($sp)
+ fstd $f4, 16($sp)
+ fstd $f5, 24($sp)
+ fstd $f6, 32($sp)
+ fstd $f7, 40($sp)
+ fstd $f8, 48($sp)
+ fstd $f9, 56($sp)
+
+ PROFCODE
+
+ cmple M, 0, $0
+ SXADDQ INCX, 0, INCX
+ cmple N, 0, $1
+ SXADDQ INCY, 0, INCY
+
+ or $0, $1, $0
+ bne $0, $L999
+
+ cmpeq INCX, SIZE, $0
+ mov X, X1
+ SXADDQ LDA, 0, LDA
+ bne $0, $L10
+
+ sra M, 3, I
+ mov BUFFER, Y1
+ mov BUFFER, X
+ ble I, $L05
+ .align 4
+
+$L02:
+ fillcs (PREFETCHSIZE + 0) * SIZE(X1)
+ ldi I, -1(I)
+
+ LD a0, 0 * SIZE(X1)
+ addl X1, INCX, X1
+ LD a1, 0 * SIZE(X1)
+ addl X1, INCX, X1
+ LD a2, 0 * SIZE(X1)
+ addl X1, INCX, X1
+ LD a3, 0 * SIZE(X1)
+ addl X1, INCX, X1
+
+ ST a0, 0 * SIZE(Y1)
+ ST a1, 1 * SIZE(Y1)
+ ST a2, 2 * SIZE(Y1)
+ ST a3, 3 * SIZE(Y1)
+
+ LD a4, 0 * SIZE(X1)
+ addl X1, INCX, X1
+ LD a5, 0 * SIZE(X1)
+ addl X1, INCX, X1
+ LD a6, 0 * SIZE(X1)
+ addl X1, INCX, X1
+ LD a7, 0 * SIZE(X1)
+ addl X1, INCX, X1
+
+ ST a4, 4 * SIZE(Y1)
+ ST a5, 5 * SIZE(Y1)
+ ST a6, 6 * SIZE(Y1)
+ ST a7, 7 * SIZE(Y1)
+
+ ldi Y1, 8 * SIZE(Y1)
+ bgt I, $L02
+ .align 4
+
+$L05:
+ and M, 7, I
+ ble I, $L10
+ .align 4
+
+$L06:
+ LD a0, 0 * SIZE(X1)
+ addl X1, INCX, X1
+ ST a0, 0 * SIZE(Y1)
+ addl Y1, SIZE, Y1
+
+ ldi I, -1(I)
+ bgt I, $L06
+ .align 4
+
+$L10:
+ mov Y, Y1
+ fclr t0
+ unop
+ fclr t1
+
+ sra N, 2, J
+ fclr t2
+ fclr t3
+ ble J, $L20
+ .align 4
+
+$L11:
+ mov A, A1
+ fclr s0
+ addl A, LDA, A2
+ fclr s1
+
+ addl A2, LDA, A3
+ fclr s2
+ addl A3, LDA, A4
+ fclr s3
+
+ s4addl LDA, A, A
+ unop
+ mov X, X1
+ fillcs 3 * SIZE(Y)
+
+ sra M, 3, I
+ ble I, $L15
+
+ LD x0, 0 * SIZE(X1)
+ LD x1, 1 * SIZE(X1)
+ LD x2, 2 * SIZE(X1)
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 0 * SIZE(A2)
+ LD a2, 0 * SIZE(A3)
+ LD a3, 0 * SIZE(A4)
+ LD a4, 1 * SIZE(A1)
+ LD a5, 1 * SIZE(A2)
+ LD a6, 1 * SIZE(A3)
+ LD a7, 1 * SIZE(A4)
+ LD a8, 2 * SIZE(A1)
+ LD a9, 2 * SIZE(A2)
+ LD a10, 2 * SIZE(A3)
+ LD a11, 2 * SIZE(A4)
+ LD a12, 3 * SIZE(A1)
+ LD a13, 3 * SIZE(A2)
+ LD a14, 3 * SIZE(A3)
+ LD a15, 3 * SIZE(A4)
+
+ ldi I, -1(I)
+ ble I, $L13
+ .align 4
+
+$L12:
+ ADD s0, t0, s0
+ LD x3, 3 * SIZE(X1)
+ MUL x0, a0, t0
+ LD a0, 4 * SIZE(A1)
+
+ ADD s1, t1, s1
+ fillcs (PREFETCHSIZE + 0) * SIZE(A1)
+ MUL x0, a1, t1
+ LD a1, 4 * SIZE(A2)
+
+ ADD s2, t2, s2
+ unop
+ MUL x0, a2, t2
+ LD a2, 4 * SIZE(A3)
+
+ ADD s3, t3, s3
+ unop
+ MUL x0, a3, t3
+ LD a3, 4 * SIZE(A4)
+
+ ADD s0, t0, s0
+ LD x0, 4 * SIZE(X1)
+ MUL x1, a4, t0
+ LD a4, 5 * SIZE(A1)
+
+ ADD s1, t1, s1
+ ldi A1, 8 * SIZE(A1)
+ MUL x1, a5, t1
+ LD a5, 5 * SIZE(A2)
+
+ ADD s2, t2, s2
+ unop
+ MUL x1, a6, t2
+ LD a6, 5 * SIZE(A3)
+
+ ADD s3, t3, s3
+ unop
+ MUL x1, a7, t3
+ LD a7, 5 * SIZE(A4)
+
+ ADD s0, t0, s0
+ LD x1, 5 * SIZE(X1)
+ MUL x2, a8, t0
+ LD a8, -2 * SIZE(A1)
+
+ ADD s1, t1, s1
+ fillcs (PREFETCHSIZE + 0) * SIZE(A2)
+ MUL x2, a9, t1
+ LD a9, 6 * SIZE(A2)
+
+ ADD s2, t2, s2
+ ldi A2, 8 * SIZE(A2)
+ MUL x2, a10, t2
+ LD a10, 6 * SIZE(A3)
+
+ ADD s3, t3, s3
+ ldi A3, 8 * SIZE(A3)
+ MUL x2, a11, t3
+ LD a11, 6 * SIZE(A4)
+
+ ADD s0, t0, s0
+ LD x2, 6 * SIZE(X1)
+ MUL x3, a12, t0
+ LD a12, -1 * SIZE(A1)
+
+ ADD s1, t1, s1
+ ldi A4, 8 * SIZE(A4)
+ MUL x3, a13, t1
+ LD a13, -1 * SIZE(A2)
+
+ ADD s2, t2, s2
+ unop
+ MUL x3, a14, t2
+ LD a14, -1 * SIZE(A3)
+
+ ADD s3, t3, s3
+ unop
+ MUL x3, a15, t3
+ LD a15, -1 * SIZE(A4)
+
+ ADD s0, t0, s0
+ LD x3, 7 * SIZE(X1)
+ MUL x0, a0, t0
+ LD a0, 0 * SIZE(A1)
+
+ ADD s1, t1, s1
+ fillcs (PREFETCHSIZE - 8) * SIZE(A3)
+ MUL x0, a1, t1
+ LD a1, 0 * SIZE(A2)
+
+ ADD s2, t2, s2
+ unop
+ MUL x0, a2, t2
+ LD a2, 0 * SIZE(A3)
+
+ ADD s3, t3, s3
+ unop
+ MUL x0, a3, t3
+ LD a3, 0 * SIZE(A4)
+
+ ADD s0, t0, s0
+ LD x0, 8 * SIZE(X1)
+ MUL x1, a4, t0
+ LD a4, 1 * SIZE(A1)
+
+ ADD s1, t1, s1
+ unop
+ MUL x1, a5, t1
+ LD a5, 1 * SIZE(A2)
+
+ ADD s2, t2, s2
+ unop
+ MUL x1, a6, t2
+ LD a6, 1 * SIZE(A3)
+
+ ADD s3, t3, s3
+ unop
+ MUL x1, a7, t3
+ LD a7, 1 * SIZE(A4)
+
+ ADD s0, t0, s0
+ LD x1, 9 * SIZE(X1)
+ MUL x2, a8, t0
+ LD a8, 2 * SIZE(A1)
+
+ ADD s1, t1, s1
+ fillcs (PREFETCHSIZE - 8) * SIZE(A4)
+ MUL x2, a9, t1
+ LD a9, 2 * SIZE(A2)
+
+ ADD s2, t2, s2
+ ldi X1, 8 * SIZE(X1)
+ MUL x2, a10, t2
+ LD a10, 2 * SIZE(A3)
+
+ ADD s3, t3, s3
+ ldi I, -1(I)
+ MUL x2, a11, t3
+ LD a11, 2 * SIZE(A4)
+
+ ADD s0, t0, s0
+ LD x2, 2 * SIZE(X1)
+ MUL x3, a12, t0
+ LD a12, 3 * SIZE(A1)
+
+ ADD s1, t1, s1
+ fillcs (PREFETCHSIZE - 8) * SIZE(X1)
+ MUL x3, a13, t1
+ LD a13, 3 * SIZE(A2)
+
+ ADD s2, t2, s2
+ unop
+ MUL x3, a14, t2
+ LD a14, 3 * SIZE(A3)
+
+ ADD s3, t3, s3
+ MUL x3, a15, t3
+ LD a15, 3 * SIZE(A4)
+ bgt I, $L12
+ .align 4
+
+$L13:
+ ADD s0, t0, s0
+ LD x3, 3 * SIZE(X1)
+ MUL x0, a0, t0
+ LD a0, 4 * SIZE(A1)
+
+ ADD s1, t1, s1
+ unop
+ MUL x0, a1, t1
+ LD a1, 4 * SIZE(A2)
+
+ ADD s2, t2, s2
+ unop
+ MUL x0, a2, t2
+ LD a2, 4 * SIZE(A3)
+
+ ADD s3, t3, s3
+ unop
+ MUL x0, a3, t3
+ LD a3, 4 * SIZE(A4)
+
+ ADD s0, t0, s0
+ LD x0, 4 * SIZE(X1)
+ MUL x1, a4, t0
+ LD a4, 5 * SIZE(A1)
+
+ ADD s1, t1, s1
+ unop
+ MUL x1, a5, t1
+ LD a5, 5 * SIZE(A2)
+
+ ADD s2, t2, s2
+ unop
+ MUL x1, a6, t2
+ LD a6, 5 * SIZE(A3)
+
+ ADD s3, t3, s3
+ unop
+ MUL x1, a7, t3
+ LD a7, 5 * SIZE(A4)
+
+ ADD s0, t0, s0
+ LD x1, 5 * SIZE(X1)
+ MUL x2, a8, t0
+ LD a8, 6 * SIZE(A1)
+
+ ADD s1, t1, s1
+ unop
+ MUL x2, a9, t1
+ LD a9, 6 * SIZE(A2)
+
+ ADD s2, t2, s2
+ unop
+ MUL x2, a10, t2
+ LD a10, 6 * SIZE(A3)
+
+ ADD s3, t3, s3
+ unop
+ MUL x2, a11, t3
+ LD a11, 6 * SIZE(A4)
+
+ ADD s0, t0, s0
+ LD x2, 6 * SIZE(X1)
+ MUL x3, a12, t0
+ LD a12, 7 * SIZE(A1)
+
+ ADD s1, t1, s1
+ ldi A1, 8 * SIZE(A1)
+ MUL x3, a13, t1
+ LD a13, 7 * SIZE(A2)
+
+ ADD s2, t2, s2
+ ldi A2, 8 * SIZE(A2)
+ MUL x3, a14, t2
+ LD a14, 7 * SIZE(A3)
+
+ ADD s3, t3, s3
+ ldi A3, 8 * SIZE(A3)
+ MUL x3, a15, t3
+ LD a15, 7 * SIZE(A4)
+
+ ADD s0, t0, s0
+ LD x3, 7 * SIZE(X1)
+ MUL x0, a0, t0
+ unop
+
+ ADD s1, t1, s1
+ ldi X1, 8 * SIZE(X1)
+ MUL x0, a1, t1
+ ldi A4, 8 * SIZE(A4)
+
+ ADD s2, t2, s2
+ MUL x0, a2, t2
+ ADD s3, t3, s3
+ MUL x0, a3, t3
+
+ ADD s0, t0, s0
+ MUL x1, a4, t0
+ ADD s1, t1, s1
+ MUL x1, a5, t1
+
+ ADD s2, t2, s2
+ MUL x1, a6, t2
+ ADD s3, t3, s3
+ MUL x1, a7, t3
+
+ ADD s0, t0, s0
+ MUL x2, a8, t0
+ ADD s1, t1, s1
+ MUL x2, a9, t1
+
+ ADD s2, t2, s2
+ MUL x2, a10, t2
+ ADD s3, t3, s3
+ MUL x2, a11, t3
+
+ ADD s0, t0, s0
+ MUL x3, a12, t0
+ ADD s1, t1, s1
+ MUL x3, a13, t1
+
+ ADD s2, t2, s2
+ MUL x3, a14, t2
+ ADD s3, t3, s3
+ MUL x3, a15, t3
+ .align 4
+
+$L15:
+ and M, 7, I
+ ble I, $L18
+
+ LD x0, 0 * SIZE(X1)
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 0 * SIZE(A2)
+ LD a2, 0 * SIZE(A3)
+ LD a3, 0 * SIZE(A4)
+
+ ldi I, -1(I)
+ ble I, $L17
+ .align 4
+
+$L16:
+ ADD s0, t0, s0
+ ldi A4, 1 * SIZE(A4)
+ MUL x0, a0, t0
+ LD a0, 1 * SIZE(A1)
+
+ ADD s1, t1, s1
+ ldi A1, 1 * SIZE(A1)
+ MUL x0, a1, t1
+ LD a1, 1 * SIZE(A2)
+
+ ADD s2, t2, s2
+ ldi A2, 1 * SIZE(A2)
+ MUL x0, a2, t2
+ LD a2, 1 * SIZE(A3)
+
+ ADD s3, t3, s3
+ ldi A3, 1 * SIZE(A3)
+ MUL x0, a3, t3
+ LD a3, 0 * SIZE(A4)
+
+ LD x0, 1 * SIZE(X1)
+ ldi X1, 1 * SIZE(X1)
+ ldi I, -1(I)
+ bgt I, $L16
+ .align 4
+
+$L17:
+ ADD s0, t0, s0
+ MUL x0, a0, t0
+ ADD s1, t1, s1
+ MUL x0, a1, t1
+
+ ADD s2, t2, s2
+ MUL x0, a2, t2
+ ADD s3, t3, s3
+ MUL x0, a3, t3
+ .align 4
+
+$L18:
+ LD a0, 0 * SIZE(Y)
+ addl Y, INCY, Y
+ LD a1, 0 * SIZE(Y)
+ addl Y, INCY, Y
+ LD a2, 0 * SIZE(Y)
+ addl Y, INCY, Y
+ LD a3, 0 * SIZE(Y)
+ addl Y, INCY, Y
+
+ ADD s0, t0, s0
+ ADD s1, t1, s1
+ ADD s2, t2, s2
+ ADD s3, t3, s3
+
+ MUL alpha, s0, s0
+ MUL alpha, s1, s1
+ MUL alpha, s2, s2
+ MUL alpha, s3, s3
+
+ ADD a0, s0, a0
+ fclr t0
+ ADD a1, s1, a1
+ fclr t1
+ ADD a2, s2, a2
+ fclr t2
+ ADD a3, s3, a3
+ fclr t3
+
+ ST a0, 0 * SIZE(Y1)
+ addl Y1, INCY, Y1
+ ST a1, 0 * SIZE(Y1)
+ addl Y1, INCY, Y1
+ ST a2, 0 * SIZE(Y1)
+ addl Y1, INCY, Y1
+ ST a3, 0 * SIZE(Y1)
+ addl Y1, INCY, Y1
+
+ ldi J, -1(J)
+ bgt J, $L11
+ .align 4
+
+$L20:
+ and N, 2, J
+ ble J, $L30
+ mov A, A1
+ addl A, LDA, A2
+
+ addl A2, LDA, A
+ fclr s0
+ mov X, X1
+ fclr s1
+
+ sra M, 3, I
+ fclr s2
+ fclr s3
+ ble I, $L25
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 0 * SIZE(A2)
+ LD a2, 1 * SIZE(A1)
+ LD a3, 1 * SIZE(A2)
+ LD a4, 2 * SIZE(A1)
+ LD a5, 2 * SIZE(A2)
+ LD a6, 3 * SIZE(A1)
+ LD a7, 3 * SIZE(A2)
+
+ LD a8, 4 * SIZE(A1)
+ LD a9, 4 * SIZE(A2)
+ LD a10, 5 * SIZE(A1)
+ LD a11, 5 * SIZE(A2)
+ LD a12, 6 * SIZE(A1)
+ LD a13, 6 * SIZE(A2)
+ LD a14, 7 * SIZE(A1)
+ LD a15, 7 * SIZE(A2)
+
+ LD x0, 0 * SIZE(X1)
+ LD x1, 1 * SIZE(X1)
+ LD x2, 2 * SIZE(X1)
+
+ ldi I, -1(I)
+ ble I, $L23
+ .align 4
+
+$L22:
+ ADD s0, t0, s0
+ LD x3, 3 * SIZE(X1)
+ MUL x0, a0, t0
+ LD a0, 8 * SIZE(A1)
+
+ ADD s1, t1, s1
+ fillcs (PREFETCHSIZE + 0) * SIZE(A1)
+ MUL x0, a1, t1
+ LD a1, 8 * SIZE(A2)
+
+ ADD s0, t2, s0
+ LD x0, 4 * SIZE(X1)
+ MUL x1, a2, t2
+ LD a2, 9 * SIZE(A1)
+
+ ADD s1, t3, s1
+ unop
+ MUL x1, a3, t3
+ LD a3, 9 * SIZE(A2)
+
+ ADD s0, t0, s0
+ LD x1, 5 * SIZE(X1)
+ MUL x2, a4, t0
+ LD a4, 10 * SIZE(A1)
+
+ ADD s1, t1, s1
+ ldi I, -1(I)
+ MUL x2, a5, t1
+ LD a5, 10 * SIZE(A2)
+
+ ADD s0, t2, s0
+ LD x2, 6 * SIZE(X1)
+ MUL x3, a6, t2
+ LD a6, 11 * SIZE(A1)
+
+ ADD s1, t3, s1
+ ldi X1, 8 * SIZE(X1)
+ MUL x3, a7, t3
+ LD a7, 11 * SIZE(A2)
+
+ ADD s0, t0, s0
+ LD x3, -1 * SIZE(X1)
+ MUL x0, a8, t0
+ LD a8, 12 * SIZE(A1)
+
+ ADD s1, t1, s1
+ fillcs (PREFETCHSIZE + 0) * SIZE(A2)
+ MUL x0, a9, t1
+ LD a9, 12 * SIZE(A2)
+
+ ADD s0, t0, s0
+ LD x0, 0 * SIZE(X1)
+ MUL x1, a10, t0
+ LD a10, 13 * SIZE(A1)
+
+ ADD s1, t1, s1
+ ldi A1, 8 * SIZE(A1)
+ MUL x1, a11, t1
+ LD a11, 13 * SIZE(A2)
+
+ ADD s0, t0, s0
+ LD x1, 1 * SIZE(X1)
+ MUL x2, a12, t0
+ LD a12, 6 * SIZE(A1)
+
+ ADD s1, t1, s1
+ MUL x2, a13, t1
+ LD a13, 14 * SIZE(A2)
+ ldi A2, 8 * SIZE(A2)
+
+ ADD s0, t0, s0
+ LD x2, 2 * SIZE(X1)
+ MUL x3, a14, t0
+ LD a14, 7 * SIZE(A1)
+
+ ADD s1, t1, s1
+ MUL x3, a15, t1
+ LD a15, 7 * SIZE(A2)
+ bgt I, $L22
+ .align 4
+
+$L23:
+ ADD s0, t0, s0
+ LD x3, 3 * SIZE(X1)
+ MUL x0, a0, t0
+ ldi A1, 8 * SIZE(A1)
+
+ ADD s1, t1, s1
+ unop
+ MUL x0, a1, t1
+ unop
+
+ ADD s0, t2, s0
+ LD x0, 4 * SIZE(X1)
+ MUL x1, a2, t2
+ ldi A2, 8 * SIZE(A2)
+
+ ADD s1, t3, s1
+ unop
+ MUL x1, a3, t3
+ unop
+
+ ADD s0, t0, s0
+ LD x1, 5 * SIZE(X1)
+ MUL x2, a4, t0
+ unop
+
+ ADD s1, t1, s1
+ unop
+ MUL x2, a5, t1
+ unop
+
+ ADD s0, t2, s0
+ LD x2, 6 * SIZE(X1)
+ MUL x3, a6, t2
+ unop
+
+ ADD s1, t3, s1
+ unop
+ MUL x3, a7, t3
+ unop
+
+ ADD s0, t0, s0
+ LD x3, 7 * SIZE(X1)
+ MUL x0, a8, t0
+ ldi X1, 8 * SIZE(X1)
+
+ ADD s1, t1, s1
+ unop
+ MUL x0, a9, t1
+ unop
+
+ ADD s0, t0, s0
+ MUL x1, a10, t0
+ ADD s1, t1, s1
+ MUL x1, a11, t1
+
+ ADD s0, t0, s0
+ MUL x2, a12, t0
+ ADD s1, t1, s1
+ MUL x2, a13, t1
+
+ ADD s0, t0, s0
+ MUL x3, a14, t0
+ ADD s1, t1, s1
+ MUL x3, a15, t1
+ .align 4
+
+$L25:
+ and M, 7, I
+ ble I, $L28
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 0 * SIZE(A2)
+ LD x0, 0 * SIZE(X1)
+
+ ldi I, -1(I)
+ ble I, $L27
+ .align 4
+
+$L26:
+ ADD s0, t0, s0
+ ldi A2, 1 * SIZE(A2)
+ MUL x0, a0, t0
+ LD a0, 1 * SIZE(A1)
+
+ ADD s1, t1, s1
+ ldi A1, 1 * SIZE(A1)
+ MUL x0, a1, t1
+ LD a1, 0 * SIZE(A2)
+
+ LD x0, 1 * SIZE(X1)
+ ldi X1, 1 * SIZE(X1)
+ ldi I, -1(I)
+ bgt I, $L26
+ .align 4
+
+$L27:
+ ADD s0, t0, s0
+ MUL x0, a0, t0
+ ADD s1, t1, s1
+ MUL x0, a1, t1
+ .align 4
+
+$L28:
+ LD a0, 0 * SIZE(Y)
+ addl Y, INCY, Y
+ LD a1, 0 * SIZE(Y)
+ addl Y, INCY, Y
+
+ ADD s0, t0, s0
+ ADD s1, t1, s1
+ ADD s2, t2, s2
+ ADD s3, t3, s3
+
+ ADD s0, s2, s0
+ ADD s1, s3, s1
+
+ MUL alpha, s0, s0
+ MUL alpha, s1, s1
+
+ ADD a0, s0, a0
+ ADD a1, s1, a1
+
+ ST a0, 0 * SIZE(Y1)
+ fclr t0
+ addl Y1, INCY, Y1
+ fclr t1
+
+ ST a1, 0 * SIZE(Y1)
+ fclr t2
+ addl Y1, INCY, Y1
+ fclr t3
+ .align 4
+
+$L30:
+ blbc N, $L999
+
+ mov A, A1
+ fclr s0
+ mov X, X1
+ fclr s1
+
+ sra M, 3, I
+ fclr s2
+ fclr s3
+ ble I, $L35
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+ LD a8, 0 * SIZE(X1)
+ LD a9, 1 * SIZE(X1)
+
+ LD a2, 2 * SIZE(A1)
+ LD a3, 3 * SIZE(A1)
+ LD a10, 2 * SIZE(X1)
+ LD a11, 3 * SIZE(X1)
+
+ LD a4, 4 * SIZE(A1)
+ LD a5, 5 * SIZE(A1)
+ LD a12, 4 * SIZE(X1)
+ LD a13, 5 * SIZE(X1)
+
+ LD a6, 6 * SIZE(A1)
+ LD a7, 7 * SIZE(A1)
+ LD a14, 6 * SIZE(X1)
+
+ ldi I, -1(I)
+ ble I, $L33
+ .align 4
+
+$L32:
+ ADD s0, t0, s0
+ LD a15, 7 * SIZE(X1)
+ MUL a0, a8, t0
+ LD a0, 8 * SIZE(A1)
+
+ ADD s1, t1, s1
+ LD a8, 8 * SIZE(X1)
+ MUL a1, a9, t1
+ LD a1, 9 * SIZE(A1)
+
+ ADD s2, t2, s2
+ LD a9, 9 * SIZE(X1)
+ MUL a2, a10, t2
+ LD a2, 10 * SIZE(A1)
+
+ ADD s3, t3, s3
+ LD a10, 10 * SIZE(X1)
+ MUL a3, a11, t3
+ LD a3, 11 * SIZE(A1)
+
+ ADD s0, t0, s0
+ LD a11, 11 * SIZE(X1)
+ MUL a4, a12, t0
+ LD a4, 12 * SIZE(A1)
+
+ ADD s1, t1, s1
+ LD a12, 12 * SIZE(X1)
+ MUL a5, a13, t1
+ LD a5, 13 * SIZE(A1)
+
+ ADD s2, t2, s2
+ LD a13, 13 * SIZE(X1)
+ MUL a6, a14, t2
+ LD a6, 14 * SIZE(A1)
+
+ ADD s3, t3, s3
+ LD a14, 14 * SIZE(X1)
+ MUL a7, a15, t3
+ LD a7, 15 * SIZE(A1)
+
+ ldi A1, 8 * SIZE(A1)
+ ldi I, -1(I)
+ ldi X1, 8 * SIZE(X1)
+ bgt I, $L32
+ .align 4
+
+$L33:
+ ADD s0, t0, s0
+ LD a15, 7 * SIZE(X1)
+ MUL a0, a8, t0
+ ldi A1, 8 * SIZE(A1)
+
+ ADD s1, t1, s1
+ unop
+ MUL a1, a9, t1
+ ldi X1, 8 * SIZE(X1)
+
+ ADD s2, t2, s2
+ MUL a2, a10, t2
+ ADD s3, t3, s3
+ MUL a3, a11, t3
+
+ ADD s0, t0, s0
+ MUL a4, a12, t0
+ ADD s1, t1, s1
+ MUL a5, a13, t1
+
+ ADD s2, t2, s2
+ MUL a6, a14, t2
+ ADD s3, t3, s3
+ MUL a7, a15, t3
+ .align 4
+
+$L35:
+ and M, 7, I
+ ble I, $L38
+
+ LD a0, 0 * SIZE(A1)
+ LD x0, 0 * SIZE(X1)
+
+ ldi I, -1(I)
+ ble I, $L37
+ .align 4
+
+$L36:
+ ADD s0, t0, s0
+ MUL x0, a0, t0
+ LD a0, 1 * SIZE(A1)
+ LD x0, 1 * SIZE(X1)
+
+ ldi A1, 1 * SIZE(A1)
+ ldi X1, 1 * SIZE(X1)
+ ldi I, -1(I)
+ bgt I, $L36
+ .align 4
+
+$L37:
+ ADD s0, t0, s0
+ MUL x0, a0, t0
+ .align 4
+
+$L38:
+ LD a0, 0 * SIZE(Y)
+
+ ADD s0, t0, s0
+ ADD s1, t1, s1
+ ADD s2, t2, s2
+ ADD s3, t3, s3
+
+ ADD s0, s2, s0
+ ADD s1, s3, s1
+ ADD s0, s1, s0
+
+ MUL alpha, s0, s0
+ ADD a0, s0, a0
+
+ ST a0, 0 * SIZE(Y1)
+ .align 4
+
+$L999:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+
+ ldi $sp, STACKSIZE($sp)
+ ret
+ EPILOGUE
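
The kernel above accumulates four column dot products (s0..s3) at a time, scales them by alpha, and adds them to y, i.e. the dot-product (transposed-GEMV) pattern. The plain-C sketch below only illustrates that per-column computation, under the assumption that A is column-major with leading dimension lda; it ignores the 8-way unrolling, prefetching and register blocking done in the assembly, and the name gemv_t_sketch is not part of the patch.

    /* One pass over n columns: y[j*incy] += alpha * dot(A(:,j), x). */
    static void gemv_t_sketch(long m, long n, double alpha,
                              const double *a, long lda,
                              const double *x, double *y, long incy)
    {
        for (long j = 0; j < n; j++) {
            double s = 0.0;
            for (long i = 0; i < m; i++)
                s += a[j * lda + i] * x[i];   /* column j dotted with x */
            y[j * incy] += alpha * s;         /* y updated in place, as above */
        }
    }
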
diff --git a/kernel/sw_64/iamax.S b/kernel/sw_64/iamax.S
new file mode 100644
index 0000000..f3b2909
--- /dev/null
+++ b/kernel/sw_64/iamax.S
@@ -0,0 +1,440 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define N $16
+#define X $17
+#define INCX $18
+#define XX $19
+
+#ifndef USE_MIN
+#define CMPLT(a, b) fcmplt a, b
+#else
+#define CMPLT(a, b) fcmplt b, a
+#endif
+
+#define STACKSIZE 6 * 8
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, STACKSIZE, $26, 0
+
+#ifdef F_INTERFACE
+ ldl N, 0(N) # n
+ ldl INCX, 0(INCX) # incx
+#endif
+ ldi $sp, -STACKSIZE($sp)
+ mov X, XX
+ .align 4
+
+ fstd $f2, 0($sp)
+ fclr $f16
+ cmplt $31, N, $2
+ unop
+
+ fstd $f3, 8($sp)
+ fclr $f17
+ cmplt $31, INCX, $3
+ unop
+
+ fstd $f4, 16($sp)
+ fclr $f18
+ SXADDQ INCX, $31, INCX
+ unop
+
+ fstd $f5, 24($sp)
+ fclr $f19
+ and $2, $3, $2
+ clr $0
+
+ fstd $f6, 32($sp)
+ fclr $f0
+ sra N, 3, $1
+ beq $2, $End # if (n <= 0) or (incx <= 0) return
+ .align 4
+
+ LD $f20, 0 * SIZE(X)
+ unop
+ fabs $f20, $f0
+ ble $1, $L15
+ .align 4
+
+ fabs $f20, $f1
+ unop
+ addl X, INCX, X
+ unop
+
+ LD $f21, 0 * SIZE(X)
+ fabs $f20, $f2
+ addl X, INCX, X
+ unop
+
+ LD $f22, 0 * SIZE(X)
+ fabs $f20, $f3
+ addl X, INCX, X
+ unop
+
+ LD $f23, 0 * SIZE(X)
+ fabs $f20, $f4
+ addl X, INCX, X
+ unop
+
+ LD $f24, 0 * SIZE(X)
+ addl X, INCX, X
+ fabs $f20, $f5
+ unop
+
+ LD $f25, 0 * SIZE(X)
+ fabs $f20, $f6
+ addl X, INCX, X
+ unop
+
+ LD $f26, 0 * SIZE(X)
+ fabs $f20, $f28
+ addl X, INCX, X
+ ldi $1, -1($1)
+
+ LD $f27, 0 * SIZE(X)
+ unop
+ addl X, INCX, X
+ ble $1, $L13
+ .align 4
+
+$L12:
+ fselne $f16, $f12, $f4, $f4
+ unop
+ fabs $f20, $f29
+ fillcs 56 * SIZE(X)
+
+ fselne $f17, $f13, $f5, $f5
+ LD $f20, 0 * SIZE(X)
+ fabs $f21, $f30
+ addl X, INCX, X
+
+ fselne $f18, $f14, $f6, $f6
+ LD $f21, 0 * SIZE(X)
+ fabs $f22, $f10
+ addl X, INCX, X
+
+ fselne $f19, $f15, $f28, $f28
+ LD $f22, 0 * SIZE(X)
+ fabs $f23, $f11
+ addl X, INCX, X
+
+ fabs $f24, $f12
+ LD $f23, 0 * SIZE(X)
+ CMPLT($f0, $f29), $f16
+ addl X, INCX, X
+
+ fabs $f25, $f13
+ LD $f24, 0 * SIZE(X)
+ CMPLT($f1, $f30), $f17
+ addl X, INCX, X
+
+ fabs $f26, $f14
+ LD $f25, 0 * SIZE(X)
+ CMPLT($f2, $f10), $f18
+ addl X, INCX, X
+
+ fabs $f27, $f15
+ LD $f26, 0 * SIZE(X)
+ CMPLT($f3, $f11), $f19
+ addl X, INCX, X
+
+ fselne $f16, $f29, $f0, $f0
+ LD $f27, 0 * SIZE(X)
+ CMPLT($f4, $f12), $f16
+ addl X, INCX, X
+
+ fselne $f17, $f30, $f1, $f1
+ unop
+ CMPLT($f5, $f13), $f17
+ ldi $1, -1($1) # i --
+
+ fselne $f18, $f10, $f2, $f2
+ unop
+ CMPLT($f6, $f14), $f18
+ unop
+
+ fselne $f19, $f11, $f3, $f3
+ unop
+ CMPLT($f28, $f15), $f19
+ bgt $1,$L12
+ .align 4
+
+$L13:
+ fselne $f16, $f12, $f4, $f4
+ fabs $f20, $f29
+ fselne $f17, $f13, $f5, $f5
+ fabs $f21, $f30
+
+ fselne $f18, $f14, $f6, $f6
+ fabs $f22, $f10
+ fselne $f19, $f15, $f28, $f28
+ fabs $f23, $f11
+
+ fabs $f24, $f12
+ CMPLT($f0, $f29), $f16
+ fabs $f25, $f13
+ CMPLT($f1, $f30), $f17
+
+ fabs $f26, $f14
+ CMPLT($f2, $f10), $f18
+ fabs $f27, $f15
+ CMPLT($f3, $f11), $f19
+
+ fselne $f16, $f29, $f0, $f0
+ CMPLT($f4, $f12), $f16
+ fselne $f17, $f30, $f1, $f1
+ CMPLT($f5, $f13), $f17
+
+ fselne $f18, $f10, $f2, $f2
+ CMPLT($f6, $f14), $f18
+ fselne $f19, $f11, $f3, $f3
+ CMPLT($f28, $f15), $f19
+
+ fselne $f16, $f12, $f4, $f4
+ CMPLT($f0, $f1), $f16
+ fselne $f17, $f13, $f5, $f5
+ CMPLT($f2, $f3), $f17
+
+ fselne $f18, $f14, $f6, $f6
+ CMPLT($f4, $f5), $f18
+ fselne $f19, $f15, $f28, $f28
+ CMPLT($f6, $f28), $f19
+
+ fselne $f16, $f1, $f0, $f0
+ fselne $f17, $f3, $f2, $f2
+ fselne $f18, $f5, $f4, $f4
+ fselne $f19, $f28, $f6, $f6
+
+ CMPLT($f0, $f2), $f16
+ CMPLT($f4, $f6), $f17
+
+ fselne $f16, $f2, $f0, $f0
+ fselne $f17, $f6, $f4, $f4
+
+ CMPLT($f0, $f4), $f16
+ fselne $f16, $f4, $f0, $f0
+ .align 4
+
+$L15:
+ and N, 7, $1
+ unop
+ unop
+ ble $1, $L20
+ .align 4
+
+$L16:
+ LD $f20, 0 * SIZE(X)
+ addl X, INCX, X
+
+ fabs $f20, $f29
+ CMPLT($f0, $f29), $f16
+ fselne $f16, $f29, $f0, $f0
+
+ ldi $1, -1($1) # i --
+ bgt $1, $L16
+ .align 4
+
+$L20:
+ sra N, 3, $1
+ ble $1, $L40
+ .align 4
+
+ LD $f10, 0 * SIZE(XX)
+ addl XX, INCX, XX
+ LD $f11, 0 * SIZE(XX)
+ addl XX, INCX, XX
+
+ LD $f12, 0 * SIZE(XX)
+ addl XX, INCX, XX
+ LD $f13, 0 * SIZE(XX)
+ addl XX, INCX, XX
+
+ LD $f14, 0 * SIZE(XX)
+ addl XX, INCX, XX
+ LD $f15, 0 * SIZE(XX)
+ addl XX, INCX, XX
+
+ LD $f16, 0 * SIZE(XX)
+ addl XX, INCX, XX
+ LD $f17, 0 * SIZE(XX)
+ addl XX, INCX, XX
+
+ fabs $f10, $f18
+ fabs $f11, $f19
+ fabs $f12, $f20
+ fabs $f13, $f21
+
+ ldi $1, -1($1)
+ ble $1, $L23
+ .align 4
+
+$L22:
+ LD $f10, 0 * SIZE(XX)
+ fabs $f14, $f22
+ addl XX, INCX, XX
+ fcmpeq $f0, $f18, $f2
+
+ LD $f11, 0 * SIZE(XX)
+ fabs $f15, $f23
+ addl XX, INCX, XX
+ fcmpeq $f0, $f19, $f3
+
+ LD $f12, 0 * SIZE(XX)
+ fabs $f16, $f24
+ addl XX, INCX, XX
+ fcmpeq $f0, $f20, $f4
+
+ LD $f13, 0 * SIZE(XX)
+ fabs $f17, $f25
+ addl XX, INCX, XX
+ fcmpeq $f0, $f21, $f5
+
+ LD $f14, 0 * SIZE(XX)
+ ldi $1, -1($1) # i --
+ fcmpeq $f0, $f22, $f26
+ addl XX, INCX, XX
+
+ ldi $0, 1($0)
+ fbne $f2, $End
+
+ LD $f15, 0 * SIZE(XX)
+ fcmpeq $f0, $f23, $f27
+ ldi $0, 1($0)
+ fbne $f3, $End
+
+ addl XX, INCX, XX
+ fcmpeq $f0, $f24, $f28
+ ldi $0, 1($0)
+ fbne $f4, $End
+
+ LD $f16, 0 * SIZE(XX)
+ fcmpeq $f0, $f25, $f29
+ ldi $0, 1($0)
+ fbne $f5, $End
+
+ addl XX, INCX, XX
+ ldi $0, 1($0)
+ fabs $f10, $f18
+ fbne $f26, $End
+
+ LD $f17, 0 * SIZE(XX)
+ ldi $0, 1($0)
+ fabs $f11, $f19
+ fbne $f27, $End
+
+ addl XX, INCX, XX
+ ldi $0, 1($0)
+ fabs $f12, $f20
+ fbne $f28, $End
+
+ ldi $0, 1($0)
+ fabs $f13, $f21
+ fbne $f29, $End
+ bgt $1, $L22
+ .align 4
+
+$L23:
+ fabs $f14, $f22
+ fcmpeq $f0, $f18, $f2
+ fabs $f15, $f23
+ fcmpeq $f0, $f19, $f3
+
+ fabs $f16, $f24
+ fcmpeq $f0, $f20, $f4
+ fabs $f17, $f25
+ fcmpeq $f0, $f21, $f5
+
+ fcmpeq $f0, $f22, $f26
+ ldi $0, 1($0)
+ unop
+ fbne $f2, $End
+
+ fcmpeq $f0, $f23, $f27
+ ldi $0, 1($0)
+ unop
+ fbne $f3, $End
+
+ fcmpeq $f0, $f24, $f28
+ ldi $0, 1($0)
+ unop
+ fbne $f4, $End
+
+ fcmpeq $f0, $f25, $f29
+ ldi $0, 1($0)
+ unop
+ fbne $f5, $End
+
+ ldi $0, 1($0)
+ fbne $f26, $End
+ ldi $0, 1($0)
+ fbne $f27, $End
+ ldi $0, 1($0)
+ fbne $f28, $End
+ ldi $0, 1($0)
+ fbne $f29, $End
+ .align 4
+
+$L40:
+ LD $f20, 0 * SIZE(XX)
+ addl XX, INCX, XX
+
+ fabs $f20, $f25
+ fcmpeq $f0, $f25, $f29
+
+ ldi $0, 1($0)
+ fbne $f29, $End
+ br $31, $L40
+ .align 4
+
+$End:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+
+ fldd $f6, 32($sp)
+ ldi $sp, STACKSIZE($sp)
+ ret
+
+ EPILOGUE
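
iamax.S above is built twice: as written it searches for the largest |x[i]|, and with -DUSE_MIN the CMPLT macro swaps its operands so the identical code searches for the smallest. A minimal C analogue of that compile-time trick, for illustration only (the names below are not from the patch):

    #include <math.h>

    #ifndef USE_MIN
    #define BETTER(cur, cand) ((cur) < (cand))   /* keep the larger |x[i]|  */
    #else
    #define BETTER(cur, cand) ((cand) < (cur))   /* keep the smaller |x[i]| */
    #endif

    /* The same scan body serves amax and amin; only the comparison flips. */
    double amax_or_amin_sketch(long n, const double *x, long incx)
    {
        double best = fabs(x[0]);
        for (long i = 1; i < n; i++) {
            double v = fabs(x[i * incx]);
            if (BETTER(best, v)) best = v;
        }
        return best;
    }
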
diff --git a/kernel/sw_64/iamax_simd.S b/kernel/sw_64/iamax_simd.S
new file mode 100644
index 0000000..c7c6c27
--- /dev/null
+++ b/kernel/sw_64/iamax_simd.S
@@ -0,0 +1,732 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define PREFETCHSIZE 96
+
+#define N $16
+#define X $17
+#define INCX $18
+#define XX $19
+#define I $1
+#define NN $22
+
+#ifndef USE_MIN
+#define CMPLT(a, b) fcmplt a, b
+#else
+#define CMPLT(a, b) fcmplt b, a
+#endif
+
+#ifndef USE_MIN
+#define VCMPLT(a, b) vfcmplt a, b
+#else
+#define VCMPLT(a, b) vfcmplt b, a
+#endif
+
+#define STACKSIZE 6 * 8
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, STACKSIZE, $26, 0
+
+#ifdef F_INTERFACE
+ ldl N, 0(N) # n
+ ldl INCX, 0(INCX) # incx
+#endif
+ ldi $sp, -STACKSIZE($sp)
+ mov X, XX
+ mov N, NN
+ .align 4
+
+ fstd $f2, 0($sp)
+ fclr $f16
+ cmplt $31, N, $2
+ unop
+
+ fstd $f3, 8($sp)
+ fclr $f17
+ cmplt $31, INCX, $3
+ unop
+
+ fstd $f4, 16($sp)
+ fclr $f18
+ SXADDQ INCX, $31, INCX
+ unop
+
+ fstd $f5, 24($sp)
+ fclr $f19
+ and $2, $3, $2
+ clr $0
+
+ fstd $f6, 32($sp)
+ fclr $f0
+ unop
+ beq $2, $End # if (n <= 0) or (incx <= 0) return
+ .align 4
+
+ cmpeq INCX, SIZE, $3
+ beq $3, $Sub
+ .align 4
+
+
+/**
+ test whether the address of X is vector-aligned
+**/
+
+ and X, (VEC_LEN*SIZE-1), $3
+ LD $f10, 0*SIZE(X)
+ fabs $f10, $f0 # init temp max/min result value
+ beq $3, $Align_Access
+ .align 4
+/**
+ process the unaligned leading elements of X
+**/
+
+/* If N is smaller than the unroll size, the unaligned head of X needs no special handling; jump straight to the remainder section. */
+ sra NN, 4, I
+ and NN, 15, $3
+ ble I, $Remain
+ nop
+
+ sra $3, BASE_SHIFT, $3
+ ldi $2, VEC_LEN
+ subl $2, $3, $3
+ nop
+$UnAlign_Y_Loop:
+ LD $f10, 0*SIZE(X)
+ addl X, SIZE, X
+ fabs $f10, $f29
+ CMPLT($f0, $f29), $f16
+
+ fseleq $f16, $f0, $f29, $f0
+ subl $3, 1, $3
+ subl NN, 1, NN
+ bgt $3, $UnAlign_Y_Loop
+ .align 4
+
+
+$Align_Access:
+/* search for the max or min, unrolled by 16 elements */
+ sra NN, 4, I
+ and NN, 15, $3
+ ble I, $Remain
+ nop
+
+ VLD $f10, 0*VEC_LEN*SIZE(X)
+ VLD $f11, 1*VEC_LEN*SIZE(X)
+ VLD $f12, 2*VEC_LEN*SIZE(X)
+ VLD $f13, 3*VEC_LEN*SIZE(X)
+
+ /*vfabs*/
+ vcpys $f31, $f10, $f22
+ vcpys $f31, $f11, $f23
+ vcpys $f31, $f12, $f24
+ vcpys $f31, $f13, $f25
+
+ vcpyf $f0, $f0
+ vcpys $f22, $f22, $f1 # copy $f22 -> $f1
+ vcpys $f22, $f22, $f2
+ vcpys $f22, $f22, $f3
+
+ subl I, 1, I
+ addl X, 16*SIZE, X
+ nop
+ ble I, $MainLoopEnd
+ .align 4
+$MainLoop:
+
+ fillcs PREFETCHSIZE * SIZE(X)
+ VCMPLT($f0, $f22), $f26
+ subl I, 1, I
+ VCMPLT($f1, $f23), $f27
+
+ VLD $f10, 0*VEC_LEN*SIZE(X)
+ VLD $f11, 1*VEC_LEN*SIZE(X)
+ VLD $f12, 2*VEC_LEN*SIZE(X)
+ VLD $f13, 3*VEC_LEN*SIZE(X)
+
+ VCMPLT($f2, $f24), $f28
+ addl X, 16 * SIZE, X
+ nop
+ VCMPLT($f3, $f25), $f29
+
+ vfseleq $f26, $f0, $f22, $f0
+ vfseleq $f27, $f1, $f23, $f1
+ vfseleq $f28, $f2, $f24, $f2
+ vfseleq $f29, $f3, $f25, $f3
+
+ vcpys $f31, $f10, $f22
+ vcpys $f31, $f11, $f23
+ vcpys $f31, $f12, $f24
+ vcpys $f31, $f13, $f25
+
+ bne I, $MainLoop
+ .align 4
+
+$MainLoopEnd:
+ VCMPLT($f0, $f22), $f26
+ VCMPLT($f1, $f23), $f27
+ VCMPLT($f2, $f24), $f28
+ VCMPLT($f3, $f25), $f29
+
+ vfseleq $f26, $f0, $f22, $f0
+ vfseleq $f27, $f1, $f23, $f1
+ vfseleq $f28, $f2, $f24, $f2
+ vfseleq $f29, $f3, $f25, $f3
+
+	/* find the max or min among $f0, $f1, $f2 and $f3 */
+ VCMPLT($f0, $f1), $f26
+ VCMPLT($f2, $f3), $f27
+ vfseleq $f26, $f0, $f1, $f0
+ vfseleq $f27, $f2, $f3, $f2
+
+ VCMPLT($f0, $f2), $f26
+ vfseleq $f26, $f0, $f2, $f0
+ vextf $f0, 1, $f22
+ vextf $f0, 2, $f23
+
+ vextf $f0, 3, $f24
+ CMPLT($f0, $f22), $f16
+ CMPLT($f23, $f24), $f17
+ fseleq $f16, $f0, $f22, $f0
+
+ fseleq $f17, $f23, $f24, $f23
+ CMPLT($f0, $f23), $f18
+ fseleq $f18, $f0, $f23, $f0
+ nop
+$Remain:
+ ble $3, $Continuous_FindIndex
+ .align 4
+$RemainLoop:
+ LD $f20, 0 * SIZE(X)
+ addl X, INCX, X
+
+ fabs $f20, $f29
+ CMPLT($f0, $f29), $f16
+ fseleq $f16, $f0, $f29, $f0
+
+ subl $3, 1, $3
+ bgt $3, $RemainLoop
+ .align 4
+	/* second pass: locate the index of the value found above */
+$Continuous_FindIndex:
+ sra N, 3, $1
+ ble $1, $L40
+ .align 4
+
+ LD $f10, 0 * SIZE(XX)
+ LD $f11, 1 * SIZE(XX)
+ LD $f12, 2 * SIZE(XX)
+ LD $f13, 3 * SIZE(XX)
+
+
+ LD $f14, 4 * SIZE(XX)
+ LD $f15, 5 * SIZE(XX)
+ LD $f16, 6 * SIZE(XX)
+ LD $f17, 7 * SIZE(XX)
+
+
+ fabs $f10, $f18
+ fabs $f11, $f19
+ fabs $f12, $f20
+ fabs $f13, $f21
+
+ addl XX, 8*SIZE, XX
+ ldi $1, -1($1)
+	ble $1, $Continuous_FindIndex_LoopEnd	# only one 8-element block left; finish it below
+ .align 4
+
+$Continuous_FindIndex_Loop:
+ LD $f10, 0 * SIZE(XX)
+ fabs $f14, $f22
+ LD $f11, 1 * SIZE(XX)
+ fcmpeq $f0, $f18, $f2
+
+ LD $f12, 2 * SIZE(XX)
+ fabs $f15, $f23
+ LD $f13, 3 * SIZE(XX)
+ fcmpeq $f0, $f19, $f3
+
+ LD $f14, 4 * SIZE(XX)
+ fabs $f16, $f24
+ ldi $1, -1($1) # i --
+ fcmpeq $f0, $f20, $f4
+
+ LD $f15, 5 * SIZE(XX)
+ fabs $f17, $f25
+ fcmpeq $f0, $f21, $f5
+ fillcs PREFETCHSIZE * SIZE(X)
+
+ LD $f16, 6 * SIZE(XX)
+ fcmpeq $f0, $f22, $f26
+ ldi $0, 1($0)
+ fbne $f2, $End
+
+ LD $f17, 7 * SIZE(XX)
+ fcmpeq $f0, $f23, $f27
+ ldi $0, 1($0)
+ fbne $f3, $End
+
+ addl XX, 8*SIZE, XX
+ fcmpeq $f0, $f24, $f28
+ ldi $0, 1($0)
+ fbne $f4, $End
+
+ fcmpeq $f0, $f25, $f29
+ ldi $0, 1($0)
+ nop
+ fbne $f5, $End
+
+ ldi $0, 1($0)
+ fabs $f10, $f18
+ nop
+ fbne $f26, $End
+
+ ldi $0, 1($0)
+ fabs $f11, $f19
+ nop
+ fbne $f27, $End
+
+ ldi $0, 1($0)
+ fabs $f12, $f20
+ nop
+ fbne $f28, $End
+
+ ldi $0, 1($0)
+ fabs $f13, $f21
+ fbne $f29, $End
+ bgt $1, $Continuous_FindIndex_Loop
+ .align 4
+
+$Continuous_FindIndex_LoopEnd:
+ fabs $f14, $f22
+ fcmpeq $f0, $f18, $f2
+ fabs $f15, $f23
+ fcmpeq $f0, $f19, $f3
+
+ fabs $f16, $f24
+ fcmpeq $f0, $f20, $f4
+ fabs $f17, $f25
+ fcmpeq $f0, $f21, $f5
+
+ fcmpeq $f0, $f22, $f26
+ ldi $0, 1($0)
+ unop
+ fbne $f2, $End
+
+ fcmpeq $f0, $f23, $f27
+ ldi $0, 1($0)
+ unop
+ fbne $f3, $End
+
+ fcmpeq $f0, $f24, $f28
+ ldi $0, 1($0)
+ unop
+ fbne $f4, $End
+
+ fcmpeq $f0, $f25, $f29
+ ldi $0, 1($0)
+ unop
+ fbne $f5, $End
+
+ ldi $0, 1($0)
+ fbne $f26, $End
+ ldi $0, 1($0)
+ fbne $f27, $End
+ ldi $0, 1($0)
+ fbne $f28, $End
+ ldi $0, 1($0)
+ fbne $f29, $End
+ .align 4
+
+ jmp $L40
+ .align 4
+$Sub:
+ sra N, 3, $1
+ LD $f20, 0 * SIZE(X)
+ fabs $f20, $f0
+ ble $1, $L15
+ .align 4
+
+ fabs $f20, $f1
+ unop
+ addl X, INCX, X
+ unop
+
+ LD $f21, 0 * SIZE(X)
+ fabs $f20, $f2
+ addl X, INCX, X
+ unop
+
+ LD $f22, 0 * SIZE(X)
+ fabs $f20, $f3
+ addl X, INCX, X
+ unop
+
+ LD $f23, 0 * SIZE(X)
+ fabs $f20, $f4
+ addl X, INCX, X
+ unop
+
+ LD $f24, 0 * SIZE(X)
+ addl X, INCX, X
+ fabs $f20, $f5
+ unop
+
+ LD $f25, 0 * SIZE(X)
+ fabs $f20, $f6
+ addl X, INCX, X
+ unop
+
+ LD $f26, 0 * SIZE(X)
+ fabs $f20, $f28
+ addl X, INCX, X
+ ldi $1, -1($1)
+
+ LD $f27, 0 * SIZE(X)
+ unop
+ addl X, INCX, X
+ ble $1, $L13
+ .align 4
+
+$L12:
+ fselne $f16, $f12, $f4, $f4
+ unop
+ fabs $f20, $f29
+ fillcs 56 * SIZE(X)
+
+ fselne $f17, $f13, $f5, $f5
+ LD $f20, 0 * SIZE(X)
+ fabs $f21, $f30
+ addl X, INCX, X
+
+ fselne $f18, $f14, $f6, $f6
+ LD $f21, 0 * SIZE(X)
+ fabs $f22, $f10
+ addl X, INCX, X
+
+ fselne $f19, $f15, $f28, $f28
+ LD $f22, 0 * SIZE(X)
+ fabs $f23, $f11
+ addl X, INCX, X
+
+ fabs $f24, $f12
+ LD $f23, 0 * SIZE(X)
+ CMPLT($f0, $f29), $f16
+ addl X, INCX, X
+
+ fabs $f25, $f13
+ LD $f24, 0 * SIZE(X)
+ CMPLT($f1, $f30), $f17
+ addl X, INCX, X
+
+ fabs $f26, $f14
+ LD $f25, 0 * SIZE(X)
+ CMPLT($f2, $f10), $f18
+ addl X, INCX, X
+
+ fabs $f27, $f15
+ LD $f26, 0 * SIZE(X)
+ CMPLT($f3, $f11), $f19
+ addl X, INCX, X
+
+ fselne $f16, $f29, $f0, $f0
+ LD $f27, 0 * SIZE(X)
+ CMPLT($f4, $f12), $f16
+ addl X, INCX, X
+
+ fselne $f17, $f30, $f1, $f1
+ unop
+ CMPLT($f5, $f13), $f17
+ ldi $1, -1($1) # i --
+
+ fselne $f18, $f10, $f2, $f2
+ unop
+ CMPLT($f6, $f14), $f18
+ unop
+
+ fselne $f19, $f11, $f3, $f3
+ unop
+ CMPLT($f28, $f15), $f19
+ bgt $1,$L12
+ .align 4
+
+$L13:
+ fselne $f16, $f12, $f4, $f4
+ fabs $f20, $f29
+ fselne $f17, $f13, $f5, $f5
+ fabs $f21, $f30
+
+ fselne $f18, $f14, $f6, $f6
+ fabs $f22, $f10
+ fselne $f19, $f15, $f28, $f28
+ fabs $f23, $f11
+
+ fabs $f24, $f12
+ CMPLT($f0, $f29), $f16
+ fabs $f25, $f13
+ CMPLT($f1, $f30), $f17
+
+ fabs $f26, $f14
+ CMPLT($f2, $f10), $f18
+ fabs $f27, $f15
+ CMPLT($f3, $f11), $f19
+
+ fselne $f16, $f29, $f0, $f0
+ CMPLT($f4, $f12), $f16
+ fselne $f17, $f30, $f1, $f1
+ CMPLT($f5, $f13), $f17
+
+ fselne $f18, $f10, $f2, $f2
+ CMPLT($f6, $f14), $f18
+ fselne $f19, $f11, $f3, $f3
+ CMPLT($f28, $f15), $f19
+
+ fselne $f16, $f12, $f4, $f4
+ CMPLT($f0, $f1), $f16
+ fselne $f17, $f13, $f5, $f5
+ CMPLT($f2, $f3), $f17
+
+ fselne $f18, $f14, $f6, $f6
+ CMPLT($f4, $f5), $f18
+ fselne $f19, $f15, $f28, $f28
+ CMPLT($f6, $f28), $f19
+
+ fselne $f16, $f1, $f0, $f0
+ fselne $f17, $f3, $f2, $f2
+ fselne $f18, $f5, $f4, $f4
+ fselne $f19, $f28, $f6, $f6
+
+ CMPLT($f0, $f2), $f16
+ CMPLT($f4, $f6), $f17
+
+ fselne $f16, $f2, $f0, $f0
+ fselne $f17, $f6, $f4, $f4
+
+ CMPLT($f0, $f4), $f16
+ fselne $f16, $f4, $f0, $f0
+ .align 4
+
+$L15:
+ and N, 7, $1
+ unop
+ unop
+ ble $1, $L20
+ .align 4
+
+$L16:
+ LD $f20, 0 * SIZE(X)
+ addl X, INCX, X
+
+ fabs $f20, $f29
+ CMPLT($f0, $f29), $f16
+ fselne $f16, $f29, $f0, $f0
+
+ ldi $1, -1($1) # i --
+ bgt $1, $L16
+ .align 4
+
+/*
+ find the index
+*/
+$L20:
+ sra N, 3, $1
+ ble $1, $L40
+ .align 4
+
+ LD $f10, 0 * SIZE(XX)
+ addl XX, INCX, XX
+ LD $f11, 0 * SIZE(XX)
+ addl XX, INCX, XX
+
+ LD $f12, 0 * SIZE(XX)
+ addl XX, INCX, XX
+ LD $f13, 0 * SIZE(XX)
+ addl XX, INCX, XX
+
+ LD $f14, 0 * SIZE(XX)
+ addl XX, INCX, XX
+ LD $f15, 0 * SIZE(XX)
+ addl XX, INCX, XX
+
+ LD $f16, 0 * SIZE(XX)
+ addl XX, INCX, XX
+ LD $f17, 0 * SIZE(XX)
+ addl XX, INCX, XX
+
+ fabs $f10, $f18
+ fabs $f11, $f19
+ fabs $f12, $f20
+ fabs $f13, $f21
+
+ ldi $1, -1($1)
+ ble $1, $L23
+ .align 4
+
+$L22:
+ LD $f10, 0 * SIZE(XX)
+ fabs $f14, $f22
+ addl XX, INCX, XX
+ fcmpeq $f0, $f18, $f2
+
+ LD $f11, 0 * SIZE(XX)
+ fabs $f15, $f23
+ addl XX, INCX, XX
+ fcmpeq $f0, $f19, $f3
+
+ LD $f12, 0 * SIZE(XX)
+ fabs $f16, $f24
+ addl XX, INCX, XX
+ fcmpeq $f0, $f20, $f4
+
+ LD $f13, 0 * SIZE(XX)
+ fabs $f17, $f25
+ addl XX, INCX, XX
+ fcmpeq $f0, $f21, $f5
+
+ LD $f14, 0 * SIZE(XX)
+ ldi $1, -1($1) # i --
+ fcmpeq $f0, $f22, $f26
+ addl XX, INCX, XX
+
+ ldi $0, 1($0)
+ fbne $f2, $End
+
+ LD $f15, 0 * SIZE(XX)
+ fcmpeq $f0, $f23, $f27
+ ldi $0, 1($0)
+ fbne $f3, $End
+
+ addl XX, INCX, XX
+ fcmpeq $f0, $f24, $f28
+ ldi $0, 1($0)
+ fbne $f4, $End
+
+ LD $f16, 0 * SIZE(XX)
+ fcmpeq $f0, $f25, $f29
+ ldi $0, 1($0)
+ fbne $f5, $End
+
+ addl XX, INCX, XX
+ ldi $0, 1($0)
+ fabs $f10, $f18
+ fbne $f26, $End
+
+ LD $f17, 0 * SIZE(XX)
+ ldi $0, 1($0)
+ fabs $f11, $f19
+ fbne $f27, $End
+
+ addl XX, INCX, XX
+ ldi $0, 1($0)
+ fabs $f12, $f20
+ fbne $f28, $End
+
+ ldi $0, 1($0)
+ fabs $f13, $f21
+ fbne $f29, $End
+ bgt $1, $L22
+ .align 4
+
+$L23:
+ fabs $f14, $f22
+ fcmpeq $f0, $f18, $f2
+ fabs $f15, $f23
+ fcmpeq $f0, $f19, $f3
+
+ fabs $f16, $f24
+ fcmpeq $f0, $f20, $f4
+ fabs $f17, $f25
+ fcmpeq $f0, $f21, $f5
+
+ fcmpeq $f0, $f22, $f26
+ ldi $0, 1($0)
+ unop
+ fbne $f2, $End
+
+ fcmpeq $f0, $f23, $f27
+ ldi $0, 1($0)
+ unop
+ fbne $f3, $End
+
+ fcmpeq $f0, $f24, $f28
+ ldi $0, 1($0)
+ unop
+ fbne $f4, $End
+
+ fcmpeq $f0, $f25, $f29
+ ldi $0, 1($0)
+ unop
+ fbne $f5, $End
+
+ ldi $0, 1($0)
+ fbne $f26, $End
+ ldi $0, 1($0)
+ fbne $f27, $End
+ ldi $0, 1($0)
+ fbne $f28, $End
+ ldi $0, 1($0)
+ fbne $f29, $End
+ .align 4
+
+$L40:
+ LD $f20, 0 * SIZE(XX)
+ addl XX, INCX, XX
+
+ fabs $f20, $f25
+ fcmpeq $f0, $f25, $f29
+
+ ldi $0, 1($0)
+ fbne $f29, $End
+ br $31, $L40
+ .align 4
+
+$End:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+
+ fldd $f6, 32($sp)
+ ldi $sp, STACKSIZE($sp)
+ ret
+
+ EPILOGUE
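
As the comments in iamax_simd.S above describe, the unit-stride path peels scalar elements until X is vector-aligned, reduces the rest 16 elements at a time with vector compare/select, folds the four vector lanes down to a single value, and then makes a second pass over the original data to report the 1-based position of that value. The C sketch below shows only that two-pass strategy in scalar form (no alignment peel, unrolling or prefetch); it is an illustration, not the kernel's exact control flow.

    #include <math.h>

    /* Pass 1: find the extreme (here: maximum) absolute value.
       Pass 2: find its first 1-based index, using exact equality just as
       the assembly's fcmpeq does. */
    long iamax_two_pass_sketch(long n, const double *x, long incx)
    {
        if (n <= 0 || incx <= 0) return 0;

        double best = fabs(x[0]);
        for (long i = 1; i < n; i++) {
            double v = fabs(x[i * incx]);
            if (v > best) best = v;
        }
        for (long i = 0; i < n; i++)
            if (fabs(x[i * incx]) == best) return i + 1;
        return 0;   /* not reached for valid input */
    }
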
diff --git a/kernel/sw_64/imax.S b/kernel/sw_64/imax.S
new file mode 100644
index 0000000..b0cf5c8
--- /dev/null
+++ b/kernel/sw_64/imax.S
@@ -0,0 +1,351 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define N $16
+#define X $17
+#define INCX $18
+#define XX $19
+
+#ifndef USE_MIN
+#define CMPLT(a, b) cmptlt a, b
+#else
+#define CMPLT(a, b) cmptlt b, a
+#endif
+
+#define STACKSIZE 8 * 8
+
+ PROLOGUE
+ PROFCODE
+
+ clr $0
+ mov X, XX
+ .align 4
+
+ cmplt $31, N, $2
+ cmplt $31, INCX, $3
+ SXADDQ INCX, $31, INCX
+ and $2, $3, $2
+
+ sra N, 3, $1
+ fclr $f0
+ unop
+ beq $2, $End # if (n <= 0) or (incx <= 0) return
+ .align 4
+
+ LD $f0, 0 * SIZE(X)
+ unop
+ unop
+ ble $1, $L15
+ .align 4
+
+ fmov $f0, $f1
+ addq X, INCX, X
+ fmov $f0, $f10
+ lda $1, -1($1)
+
+ LD $f21, 0 * SIZE(X)
+ fmov $f0, $f11
+ addq X, INCX, X
+ fmov $f0, $f12
+
+ LD $f22, 0 * SIZE(X)
+ fmov $f0, $f13
+ addq X, INCX, X
+ fmov $f0, $f14
+
+ LD $f23, 0 * SIZE(X)
+ fmov $f0, $f15
+ addq X, INCX, X
+ fmov $f0, $f20
+
+ LD $f24, 0 * SIZE(X)
+ addq X, INCX, X
+ LD $f25, 0 * SIZE(X)
+ addq X, INCX, X
+ LD $f26, 0 * SIZE(X)
+ addq X, INCX, X
+ LD $f27, 0 * SIZE(X)
+ addq X, INCX, X
+
+ CMPLT($f0, $f20), $f16
+ CMPLT($f1, $f21), $f17
+ CMPLT($f10, $f22), $f18
+ CMPLT($f11, $f23), $f19
+
+ ble $1, $L13
+ .align 4
+
+$L12:
+ fcmovne $f16, $f20, $f0
+ LD $f20, 0 * SIZE(X)
+ CMPLT($f12, $f24), $f16
+ addq X, INCX, X
+
+ fcmovne $f17, $f21, $f1
+ LD $f21, 0 * SIZE(X)
+ CMPLT($f13, $f25), $f17
+ addq X, INCX, X
+
+ fcmovne $f18, $f22, $f10
+ LD $f22, 0 * SIZE(X)
+ CMPLT($f14, $f26), $f18
+ addq X, INCX, X
+
+ fcmovne $f19, $f23, $f11
+ LD $f23, 0 * SIZE(X)
+ CMPLT($f15, $f27), $f19
+ addq X, INCX, X
+
+ fcmovne $f16, $f24, $f12
+ LD $f24, 0 * SIZE(X)
+ CMPLT($f0, $f20), $f16
+ addq X, INCX, X
+
+ fcmovne $f17, $f25, $f13
+ LD $f25, 0 * SIZE(X)
+ CMPLT($f1, $f21), $f17
+ addq X, INCX, X
+
+ fcmovne $f18, $f26, $f14
+ LD $f26, 0 * SIZE(X)
+ CMPLT($f10, $f22), $f18
+ addq X, INCX, X
+
+ fcmovne $f19, $f27, $f15
+ LD $f27, 0 * SIZE(X)
+ CMPLT($f11, $f23), $f19
+ lda $1, -1($1) # i --
+
+ addq X, INCX, X
+ unop
+ unop
+ bgt $1,$L12
+ .align 4
+
+$L13:
+ fcmovne $f16, $f20, $f0
+ CMPLT($f12, $f24), $f16
+
+ fcmovne $f17, $f21, $f1
+ CMPLT($f13, $f25), $f17
+
+ fcmovne $f18, $f22, $f10
+ CMPLT($f14, $f26), $f18
+
+ fcmovne $f19, $f23, $f11
+ CMPLT($f15, $f27), $f19
+
+ fcmovne $f16, $f24, $f12
+ CMPLT($f0, $f1), $f16
+ fcmovne $f17, $f25, $f13
+ CMPLT($f10, $f11), $f17
+
+ fcmovne $f18, $f26, $f14
+ CMPLT($f12, $f13), $f18
+ fcmovne $f19, $f27, $f15
+ CMPLT($f14, $f15), $f19
+
+ fcmovne $f16, $f1, $f0
+ fcmovne $f17, $f11, $f10
+ fcmovne $f18, $f13, $f12
+ fcmovne $f19, $f15, $f14
+
+ CMPLT($f0, $f10), $f16
+ CMPLT($f12, $f14), $f17
+
+ fcmovne $f16, $f10, $f0
+ fcmovne $f17, $f14, $f12
+
+ CMPLT($f0, $f12), $f16
+ fcmovne $f16, $f12, $f0
+ .align 4
+
+$L15:
+ and N, 7, $1
+ unop
+ unop
+ ble $1, $L20
+ .align 4
+
+$L16:
+ LD $f20, 0 * SIZE(X)
+ addq X, INCX, X
+
+ CMPLT($f0, $f20), $f16
+ fcmovne $f16, $f20, $f0
+ lda $1, -1($1) # i --
+ bgt $1, $L16
+ .align 4
+
+$L20:
+ sra N, 3, $1
+ ble $1, $L40
+ .align 4
+
+ LD $f10, 0 * SIZE(XX)
+ addq XX, INCX, XX
+ LD $f11, 0 * SIZE(XX)
+ addq XX, INCX, XX
+
+ LD $f12, 0 * SIZE(XX)
+ addq XX, INCX, XX
+ LD $f13, 0 * SIZE(XX)
+ addq XX, INCX, XX
+
+ LD $f14, 0 * SIZE(XX)
+ addq XX, INCX, XX
+ LD $f15, 0 * SIZE(XX)
+ addq XX, INCX, XX
+
+ LD $f16, 0 * SIZE(XX)
+ addq XX, INCX, XX
+ LD $f17, 0 * SIZE(XX)
+ addq XX, INCX, XX
+
+ cmpteq $f0, $f10, $f20
+ cmpteq $f0, $f11, $f21
+ cmpteq $f0, $f12, $f22
+ cmpteq $f0, $f13, $f23
+
+ lda $1, -1($1)
+ ble $1, $L23
+ .align 4
+
+$L22:
+ LD $f10, 0 * SIZE(XX)
+ cmpteq $f0, $f14, $f24
+ lda $0, 1($0)
+ addq XX, INCX, XX
+ fbne $f20, $End
+
+ LD $f11, 0 * SIZE(XX)
+ cmpteq $f0, $f15, $f25
+ lda $0, 1($0)
+ addq XX, INCX, XX
+ fbne $f21, $End
+
+ LD $f12, 0 * SIZE(XX)
+ cmpteq $f0, $f16, $f26
+ lda $0, 1($0)
+ addq XX, INCX, XX
+ fbne $f22, $End
+
+ LD $f13, 0 * SIZE(XX)
+ cmpteq $f0, $f17, $f27
+ lda $0, 1($0)
+ addq XX, INCX, XX
+ fbne $f23, $End
+
+ LD $f14, 0 * SIZE(XX)
+ cmpteq $f0, $f10, $f20
+ lda $0, 1($0)
+ addq XX, INCX, XX
+ fbne $f24, $End
+
+ LD $f15, 0 * SIZE(XX)
+ cmpteq $f0, $f11, $f21
+ lda $0, 1($0)
+ addq XX, INCX, XX
+ fbne $f25, $End
+
+ LD $f16, 0 * SIZE(XX)
+ lda $1, -1($1) # i --
+ cmpteq $f0, $f12, $f22
+ lda $0, 1($0)
+ addq XX, INCX, XX
+ fbne $f26, $End
+
+ LD $f17, 0 * SIZE(XX)
+ cmpteq $f0, $f13, $f23
+ lda $0, 1($0)
+ addq XX, INCX, XX
+ fbne $f27, $End
+
+ bgt $1, $L22
+ .align 4
+
+$L23:
+ lda $0, 1($0)
+ cmpteq $f0, $f14, $f24
+ unop
+ fbne $f20, $End
+
+ lda $0, 1($0)
+ cmpteq $f0, $f15, $f25
+ unop
+ fbne $f21, $End
+
+ lda $0, 1($0)
+ cmpteq $f0, $f16, $f26
+ unop
+ fbne $f22, $End
+
+ lda $0, 1($0)
+ cmpteq $f0, $f17, $f27
+ unop
+ fbne $f23, $End
+
+ lda $0, 1($0)
+ fbne $f24, $End
+ lda $0, 1($0)
+ fbne $f25, $End
+ lda $0, 1($0)
+ fbne $f26, $End
+ lda $0, 1($0)
+ fbne $f27, $End
+ .align 4
+
+$L40:
+ LD $f20, 0 * SIZE(XX)
+ addq XX, INCX, XX
+
+ cmpteq $f0, $f20, $f29
+
+ lda $0, 1($0)
+ fbne $f29, $End
+ br $31, $L40
+ .align 4
+
+$End:
+ ret
+
+ EPILOGUE
diff --git a/kernel/sw_64/imax.c b/kernel/sw_64/imax.c
new file mode 100644
index 0000000..5072dd1
--- /dev/null
+++ b/kernel/sw_64/imax.c
@@ -0,0 +1,69 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+/**************************************************************************************
+* 2013/09/14 Saar
+* BLASTEST float : NoTest
+* BLASTEST double : NoTest
+* CTEST : NoTest
+* TEST : NoTest
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+ BLASLONG i=0;
+ BLASLONG ix=0;
+ FLOAT maxf=0.0;
+ BLASLONG max=0;
+
+ if (n <= 0 || inc_x <= 0) return(max);
+
+ maxf=x[0];
+ ix += inc_x;
+ i++;
+
+ while(i < n)
+ {
+ if( x[ix] > maxf )
+ {
+ max = i;
+ maxf = x[ix];
+ }
+ ix += inc_x;
+ i++;
+ }
+ return(max+1);
+}
+
+
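
One note on the C fallback above: it keeps the first occurrence of the largest value (the comparison is strict), returns a 1-based index, and returns 0 when n <= 0 or inc_x <= 0. The snippet below merely exercises those properties; the symbol name imax_k is a stand-in for whatever CNAME expands to in a particular build and is not defined by this patch, and FLOAT is assumed to be double here.

    #include <stdio.h>

    extern long imax_k(long n, double *x, long inc_x);   /* hypothetical binding */

    int main(void)
    {
        double x[] = { 1.0, 7.0, 7.0, -9.0 };
        printf("%ld\n", imax_k(4, x, 1));   /* prints 2: first maximum, 1-based */
        printf("%ld\n", imax_k(0, x, 1));   /* prints 0: invalid length         */
        return 0;
    }
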
diff --git a/kernel/sw_64/imin.c b/kernel/sw_64/imin.c
new file mode 100644
index 0000000..ffc6522
--- /dev/null
+++ b/kernel/sw_64/imin.c
@@ -0,0 +1,67 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+/**************************************************************************************
+* 2013/08/19 Saar
+* BLASTEST float
+* BLASTEST double
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+ BLASLONG i=0;
+ BLASLONG ix=0;
+ FLOAT minf=0.0;
+ BLASLONG min=0;
+
+ if (n <= 0 || inc_x <= 0) return(min);
+
+ minf=x[0];
+ ix += inc_x;
+ i++;
+
+ while(i < n)
+ {
+ if( x[ix] < minf )
+ {
+ min = i;
+ minf = x[ix];
+ }
+ ix += inc_x;
+ i++;
+ }
+ return(min+1);
+}
+
+
diff --git a/kernel/sw_64/izamax.S b/kernel/sw_64/izamax.S
new file mode 100644
index 0000000..5ccc60e
--- /dev/null
+++ b/kernel/sw_64/izamax.S
@@ -0,0 +1,429 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define N $16
+#define X $17
+#define INCX $18
+#define XX $19
+
+#ifndef USE_MIN
+#define CMPLT(a, b) fcmplt a, b
+#else
+#define CMPLT(a, b) fcmplt b, a
+#endif
+
+#define STACKSIZE 8 * 8
+
+ PROLOGUE
+ PROFCODE
+
+ ldi $sp, -STACKSIZE($sp)
+
+ fstd $f2, 0($sp)
+ fclr $f16
+ cmplt $31, N, $2
+ unop
+
+ fstd $f3, 8($sp)
+ fclr $f17
+ cmplt $31, INCX, $3
+ unop
+
+ fstd $f4, 16($sp)
+ fclr $f18
+ SXADDQ INCX, $31, INCX
+ unop
+
+ fstd $f5, 24($sp)
+ fclr $f19
+ and $2, $3, $2
+ clr $0
+
+ fstd $f6, 32($sp)
+ mov X, XX
+
+ fstd $f7, 40($sp)
+ fstd $f8, 48($sp)
+ fstd $f9, 56($sp)
+
+ fclr $f0
+ beq $2, $End # if (n <= 0) or (incx <= 0) return
+ .align 4
+
+ LD $f20, 0 * SIZE(X)
+ LD $f21, 1 * SIZE(X)
+ sra N, 2, $1
+ addl INCX, INCX, INCX
+
+ fabs $f20, $f20
+ fabs $f21, $f21
+ faddd $f20, $f21, $f0
+ ble $1, $L15
+ .align 4
+
+ ldi $1, -1($1)
+ unop
+ addl X, INCX, X
+ unop
+
+ LD $f22, 0 * SIZE(X)
+ fmov $f0, $f1
+ LD $f23, 1 * SIZE(X)
+ addl X, INCX, X
+
+ LD $f24, 0 * SIZE(X)
+ fmov $f0, $f2
+ LD $f25, 1 * SIZE(X)
+ addl X, INCX, X
+
+ LD $f26, 0 * SIZE(X)
+ fmov $f0, $f3
+ LD $f27, 1 * SIZE(X)
+ addl X, INCX, X
+
+ fabs $f20, $f8
+ fabs $f21, $f9
+ fabs $f22, $f10
+ fabs $f23, $f11
+
+ fabs $f24, $f12
+ fabs $f25, $f13
+ fabs $f26, $f14
+ fabs $f27, $f15
+
+ ble $1, $L14
+ .align 4
+
+ LD $f20, 0 * SIZE(X)
+ LD $f21, 1 * SIZE(X)
+ ldi $1, -1($1)
+ addl X, INCX, X
+
+ LD $f22, 0 * SIZE(X)
+ LD $f23, 1 * SIZE(X)
+ unop
+ addl X, INCX, X
+
+ LD $f24, 0 * SIZE(X)
+ LD $f25, 1 * SIZE(X)
+ unop
+ addl X, INCX, X
+
+ LD $f26, 0 * SIZE(X)
+ LD $f27, 1 * SIZE(X)
+ addl X, INCX, X
+ ble $1, $L13
+ .align 4
+
+$L12:
+ faddd $f8, $f9, $f16
+ unop
+ fabs $f20, $f8
+ fillcs 64 * SIZE(X)
+
+ faddd $f10, $f11, $f17
+ unop
+ fabs $f21, $f9
+ LD $f20, 0 * SIZE(X)
+
+ faddd $f12, $f13, $f18
+ LD $f21, 1 * SIZE(X)
+ fabs $f22, $f10
+ addl X, INCX, X
+
+ faddd $f14, $f15, $f19
+ LD $f22, 0 * SIZE(X)
+ fabs $f23, $f11
+ unop
+
+ CMPLT($f0, $f16), $f4
+ LD $f23, 1 * SIZE(X)
+ fabs $f24, $f12
+ addl X, INCX, X
+
+ CMPLT($f1, $f17), $f5
+ LD $f24, 0 * SIZE(X)
+ fabs $f25, $f13
+ unop
+
+ CMPLT($f2, $f18), $f6
+ LD $f25, 1 * SIZE(X)
+ fabs $f26, $f14
+ addl X, INCX, X
+
+ CMPLT($f3, $f19), $f7
+ LD $f26, 0 * SIZE(X)
+ fabs $f27, $f15
+ unop
+
+ fselne $f4, $f16, $f0, $f0
+ LD $f27, 1 * SIZE(X)
+ addl X, INCX, X
+ ldi $1, -1($1) # i --
+
+ fselne $f5, $f17, $f1, $f1
+ fselne $f6, $f18, $f2, $f2
+ fselne $f7, $f19, $f3, $f3
+ bgt $1,$L12
+ .align 4
+
+$L13:
+ faddd $f8, $f9, $f16
+ fabs $f20, $f8
+
+ faddd $f10, $f11, $f17
+ fabs $f21, $f9
+
+ faddd $f12, $f13, $f18
+ fabs $f22, $f10
+
+ faddd $f14, $f15, $f19
+ fabs $f23, $f11
+
+ CMPLT($f0, $f16), $f4
+ fabs $f24, $f12
+
+ CMPLT($f1, $f17), $f5
+ fabs $f25, $f13
+
+ CMPLT($f2, $f18), $f6
+ fabs $f26, $f14
+ CMPLT($f3, $f19), $f7
+ fabs $f27, $f15
+
+ fselne $f4, $f16, $f0, $f0
+ fselne $f5, $f17, $f1, $f1
+ fselne $f6, $f18, $f2, $f2
+ fselne $f7, $f19, $f3, $f3
+ .align 4
+
+$L14:
+ faddd $f8, $f9, $f16
+ faddd $f10, $f11, $f17
+ faddd $f12, $f13, $f18
+ faddd $f14, $f15, $f19
+
+ CMPLT($f0, $f16), $f4
+ CMPLT($f1, $f17), $f5
+ CMPLT($f2, $f18), $f6
+ CMPLT($f3, $f19), $f7
+
+ fselne $f4, $f16, $f0, $f0
+ fselne $f5, $f17, $f1, $f1
+ fselne $f6, $f18, $f2, $f2
+ fselne $f7, $f19, $f3, $f3
+
+ CMPLT($f0, $f1), $f16
+ CMPLT($f2, $f3), $f17
+
+ fselne $f16, $f1, $f0, $f0
+ fselne $f17, $f3, $f2, $f2
+
+ CMPLT($f0, $f2), $f16
+ fselne $f16, $f2, $f0, $f0
+ .align 4
+
+$L15:
+ and N, 3, $1
+ unop
+ unop
+ ble $1, $L20
+ .align 4
+
+$L16:
+ LD $f20, 0 * SIZE(X)
+ LD $f21, 1 * SIZE(X)
+ unop
+ addl X, INCX, X
+
+ fabs $f20, $f29
+ fabs $f21, $f30
+ faddd $f29, $f30, $f24
+ fmov $f24,$f29
+
+ CMPLT($f0, $f29), $f16
+ fselne $f16, $f29, $f0, $f0
+
+ ldi $1, -1($1) # i --
+ bgt $1, $L16
+ .align 4
+
+$L20:
+ sra N, 2, $1
+ ble $1, $L40
+ .align 4
+
+ LD $f10, 0 * SIZE(XX)
+ LD $f11, 1 * SIZE(XX)
+ addl XX, INCX, XX
+
+ LD $f12, 0 * SIZE(XX)
+ LD $f13, 1 * SIZE(XX)
+ addl XX, INCX, XX
+
+ LD $f14, 0 * SIZE(XX)
+ LD $f15, 1 * SIZE(XX)
+ addl XX, INCX, XX
+
+ LD $f16, 0 * SIZE(XX)
+ LD $f17, 1 * SIZE(XX)
+ addl XX, INCX, XX
+
+ fabs $f10, $f18
+ fabs $f11, $f19
+ fabs $f12, $f20
+ fabs $f13, $f21
+
+ ldi $1, -1($1)
+ ble $1, $L23
+ .align 4
+
+$L22:
+ LD $f10, 0 * SIZE(XX)
+ fabs $f14, $f22
+ LD $f11, 1 * SIZE(XX)
+ addl XX, INCX, XX
+
+ LD $f12, 0 * SIZE(XX)
+ fabs $f15, $f23
+ LD $f13, 1 * SIZE(XX)
+ addl XX, INCX, XX
+
+ LD $f14, 0 * SIZE(XX)
+ fabs $f16, $f24
+ LD $f15, 1 * SIZE(XX)
+ addl XX, INCX, XX
+
+ LD $f16, 0 * SIZE(XX)
+ fabs $f17, $f25
+ LD $f17, 1 * SIZE(XX)
+ addl XX, INCX, XX
+
+ faddd $f18, $f19, $f4
+ faddd $f20, $f21, $f5
+ faddd $f22, $f23, $f6
+ faddd $f24, $f25, $f7
+
+ fcmpeq $f0, $f4, $f26
+ fcmpeq $f0, $f5, $f27
+ fcmpeq $f0, $f6, $f28
+ fcmpeq $f0, $f7, $f29
+
+ fabs $f10, $f18
+ ldi $0, 1($0)
+ ldi $1, -1($1) # i --
+ fbne $f26, $End
+
+ fabs $f11, $f19
+ ldi $0, 1($0)
+ unop
+ fbne $f27, $End
+
+ fabs $f12, $f20
+ ldi $0, 1($0)
+ unop
+ fbne $f28, $End
+
+ fabs $f13, $f21
+ ldi $0, 1($0)
+ fbne $f29, $End
+ bgt $1, $L22
+ .align 4
+
+$L23:
+ fabs $f14, $f22
+ fabs $f15, $f23
+ fabs $f16, $f24
+ fabs $f17, $f25
+
+ faddd $f18, $f19, $f4
+ faddd $f20, $f21, $f5
+ faddd $f22, $f23, $f6
+ faddd $f24, $f25, $f7
+
+ fcmpeq $f0, $f4, $f26
+ fcmpeq $f0, $f5, $f27
+ fcmpeq $f0, $f6, $f28
+ fcmpeq $f0, $f7, $f29
+
+ ldi $0, 1($0)
+ fbne $f26, $End
+ ldi $0, 1($0)
+ fbne $f27, $End
+ ldi $0, 1($0)
+ fbne $f28, $End
+ ldi $0, 1($0)
+ fbne $f29, $End
+ .align 4
+
+$L40:
+ LD $f10, 0 * SIZE(XX)
+ LD $f11, 1 * SIZE(XX)
+
+ addl XX, INCX, XX
+
+ fabs $f10, $f18
+ fabs $f11, $f19
+
+ faddd $f18, $f19, $f2
+ fmov $f2,$f18
+ fcmpeq $f0, $f18, $f2
+
+ ldi $0, 1($0)
+ fbne $f2, $End
+ br $31, $L40
+ .align 4
+
+$End:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+ ldi $sp, STACKSIZE($sp)
+ ret
+
+ EPILOGUE
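
In the complex kernel above the quantity being maximised is |Re| + |Im| (a faddd of two fabs results), not the Euclidean modulus, which matches the usual BLAS izamax convention. An illustrative C rendering of that selection rule, again scalar and single-pass rather than the kernel's two-pass layout:

    #include <math.h>

    /* Rank complex entries stored as (re, im) pairs by |re| + |im| and
       return the 1-based index of the first entry attaining the maximum. */
    long izamax_sketch(long n, const double *x, long incx)
    {
        if (n <= 0 || incx <= 0) return 0;

        long best_i = 0;
        double best = fabs(x[0]) + fabs(x[1]);
        for (long i = 1; i < n; i++) {
            const double *p = x + 2 * i * incx;
            double v = fabs(p[0]) + fabs(p[1]);
            if (v > best) { best = v; best_i = i; }
        }
        return best_i + 1;
    }
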
diff --git a/kernel/sw_64/izamax.S.bak b/kernel/sw_64/izamax.S.bak
new file mode 100644
index 0000000..34e4c88
--- /dev/null
+++ b/kernel/sw_64/izamax.S.bak
@@ -0,0 +1,427 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define N $16
+#define X $17
+#define INCX $18
+#define XX $19
+
+#ifndef USE_MIN
+#define CMPLT(a, b) fcmplt a, b
+#else
+#define CMPLT(a, b) fcmplt b, a
+#endif
+
+#define STACKSIZE 8 * 8
+
+ PROLOGUE
+ PROFCODE
+
+ ldi $sp, -STACKSIZE($sp)
+
+ fstd $f2, 0($sp)
+ fclr $f16
+ cmplt $31, N, $2
+ unop
+
+ fstd $f3, 8($sp)
+ fclr $f17
+ cmplt $31, INCX, $3
+ unop
+
+ fstd $f4, 16($sp)
+ fclr $f18
+ SXADDQ INCX, $31, INCX
+ unop
+
+ fstd $f5, 24($sp)
+ fclr $f19
+ and $2, $3, $2
+ clr $0
+
+ fstd $f6, 32($sp)
+ mov X, XX
+
+ fstd $f7, 40($sp)
+ fstd $f8, 48($sp)
+ fstd $f9, 56($sp)
+
+ fclr $f0
+ beq $2, $End # if (n <= 0) or (incx <= 0) return
+ .align 4
+
+ LD $f20, 0 * SIZE(X)
+ LD $f21, 1 * SIZE(X)
+ sra N, 2, $1
+ addl INCX, INCX, INCX
+
+ fabs $f20, $f20
+ fabs $f21, $f21
+ faddd $f20, $f21, $f0
+ ble $1, $L15
+ .align 4
+
+ ldi $1, -1($1)
+ unop
+ addl X, INCX, X
+ unop
+
+ LD $f22, 0 * SIZE(X)
+ fmov $f0, $f1
+ LD $f23, 1 * SIZE(X)
+ addl X, INCX, X
+
+ LD $f24, 0 * SIZE(X)
+ fmov $f0, $f2
+ LD $f25, 1 * SIZE(X)
+ addl X, INCX, X
+
+ LD $f26, 0 * SIZE(X)
+ fmov $f0, $f3
+ LD $f27, 1 * SIZE(X)
+ addl X, INCX, X
+
+ fabs $f20, $f8
+ fabs $f21, $f9
+ fabs $f22, $f10
+ fabs $f23, $f11
+
+ fabs $f24, $f12
+ fabs $f25, $f13
+ fabs $f26, $f14
+ fabs $f27, $f15
+
+ ble $1, $L14
+ .align 4
+
+ LD $f20, 0 * SIZE(X)
+ LD $f21, 1 * SIZE(X)
+ ldi $1, -1($1)
+ addl X, INCX, X
+
+ LD $f22, 0 * SIZE(X)
+ LD $f23, 1 * SIZE(X)
+ unop
+ addl X, INCX, X
+
+ LD $f24, 0 * SIZE(X)
+ LD $f25, 1 * SIZE(X)
+ unop
+ addl X, INCX, X
+
+ LD $f26, 0 * SIZE(X)
+ LD $f27, 1 * SIZE(X)
+ addl X, INCX, X
+ ble $1, $L13
+ .align 4
+
+$L12:
+ faddd $f8, $f9, $f16
+ unop
+ fabs $f20, $f8
+ fillcs 64 * SIZE(X)
+
+ faddd $f10, $f11, $f17
+ unop
+ fabs $f21, $f9
+ LD $f20, 0 * SIZE(X)
+
+ faddd $f12, $f13, $f18
+ LD $f21, 1 * SIZE(X)
+ fabs $f22, $f10
+ addl X, INCX, X
+
+ faddd $f14, $f15, $f19
+ LD $f22, 0 * SIZE(X)
+ fabs $f23, $f11
+ unop
+
+ CMPLT($f0, $f16), $f4
+ LD $f23, 1 * SIZE(X)
+ fabs $f24, $f12
+ addl X, INCX, X
+
+ CMPLT($f1, $f17), $f5
+ LD $f24, 0 * SIZE(X)
+ fabs $f25, $f13
+ unop
+
+ CMPLT($f2, $f18), $f6
+ LD $f25, 1 * SIZE(X)
+ fabs $f26, $f14
+ addl X, INCX, X
+
+ CMPLT($f3, $f19), $f7
+ LD $f26, 0 * SIZE(X)
+ fabs $f27, $f15
+ unop
+
+	fselne $f4, $f16, $f0, $f0
+ LD $f27, 1 * SIZE(X)
+ addl X, INCX, X
+ ldi $1, -1($1) # i --
+
+	fselne $f5, $f17, $f1, $f1
+	fselne $f6, $f18, $f2, $f2
+	fselne $f7, $f19, $f3, $f3
+ bgt $1,$L12
+ .align 4
+
+$L13:
+ faddd $f8, $f9, $f16
+ fabs $f20, $f8
+
+ faddd $f10, $f11, $f17
+ fabs $f21, $f9
+
+ faddd $f12, $f13, $f18
+ fabs $f22, $f10
+
+ faddd $f14, $f15, $f19
+ fabs $f23, $f11
+
+ CMPLT($f0, $f16), $f4
+ fabs $f24, $f12
+
+ CMPLT($f1, $f17), $f5
+ fabs $f25, $f13
+
+ CMPLT($f2, $f18), $f6
+ fabs $f26, $f14
+ CMPLT($f3, $f19), $f7
+ fabs $f27, $f15
+
+	fselne $f4, $f16, $f0, $f0
+	fselne $f5, $f17, $f1, $f1
+	fselne $f6, $f18, $f2, $f2
+	fselne $f7, $f19, $f3, $f3
+ .align 4
+
+$L14:
+ faddd $f8, $f9, $f16
+ faddd $f10, $f11, $f17
+ faddd $f12, $f13, $f18
+ faddd $f14, $f15, $f19
+
+ CMPLT($f0, $f16), $f4
+ CMPLT($f1, $f17), $f5
+ CMPLT($f2, $f18), $f6
+ CMPLT($f3, $f19), $f7
+
+	fselne $f4, $f16, $f0, $f0
+	fselne $f5, $f17, $f1, $f1
+	fselne $f6, $f18, $f2, $f2
+	fselne $f7, $f19, $f3, $f3
+
+ CMPLT($f0, $f1), $f16
+ CMPLT($f2, $f3), $f17
+
+	fselne $f16, $f1, $f0, $f0
+	fselne $f17, $f3, $f2, $f2
+
+ CMPLT($f0, $f2), $f16
+	fselne $f16, $f2, $f0, $f0
+ .align 4
+
+$L15:
+ and N, 3, $1
+ unop
+ unop
+ ble $1, $L20
+ .align 4
+
+$L16:
+ LD $f20, 0 * SIZE(X)
+ LD $f21, 1 * SIZE(X)
+ unop
+ addl X, INCX, X
+
+ fabs $f20, $f29
+ fabs $f21, $f30
+ faddd $f29, $f30, $f29
+
+ CMPLT($f0, $f29), $f16
+	fselne $f16, $f29, $f0, $f0
+
+ ldi $1, -1($1) # i --
+ bgt $1, $L16
+ .align 4
+
+$L20:
+ sra N, 2, $1
+ ble $1, $L40
+ .align 4
+
+ LD $f10, 0 * SIZE(XX)
+ LD $f11, 1 * SIZE(XX)
+ addl XX, INCX, XX
+
+ LD $f12, 0 * SIZE(XX)
+ LD $f13, 1 * SIZE(XX)
+ addl XX, INCX, XX
+
+ LD $f14, 0 * SIZE(XX)
+ LD $f15, 1 * SIZE(XX)
+ addl XX, INCX, XX
+
+ LD $f16, 0 * SIZE(XX)
+ LD $f17, 1 * SIZE(XX)
+ addl XX, INCX, XX
+
+ fabs $f10, $f18
+ fabs $f11, $f19
+ fabs $f12, $f20
+ fabs $f13, $f21
+
+ ldi $1, -1($1)
+ ble $1, $L23
+ .align 4
+
+$L22:
+ LD $f10, 0 * SIZE(XX)
+ fabs $f14, $f22
+ LD $f11, 1 * SIZE(XX)
+ addl XX, INCX, XX
+
+ LD $f12, 0 * SIZE(XX)
+ fabs $f15, $f23
+ LD $f13, 1 * SIZE(XX)
+ addl XX, INCX, XX
+
+ LD $f14, 0 * SIZE(XX)
+ fabs $f16, $f24
+ LD $f15, 1 * SIZE(XX)
+ addl XX, INCX, XX
+
+ LD $f16, 0 * SIZE(XX)
+ fabs $f17, $f25
+ LD $f17, 1 * SIZE(XX)
+ addl XX, INCX, XX
+
+ faddd $f18, $f19, $f4
+ faddd $f20, $f21, $f5
+ faddd $f22, $f23, $f6
+ faddd $f24, $f25, $f7
+
+ fcmpeq $f0, $f4, $f26
+ fcmpeq $f0, $f5, $f27
+ fcmpeq $f0, $f6, $f28
+ fcmpeq $f0, $f7, $f29
+
+ fabs $f10, $f18
+ ldi $0, 1($0)
+ ldi $1, -1($1) # i --
+ fbne $f26, $End
+
+ fabs $f11, $f19
+ ldi $0, 1($0)
+ unop
+ fbne $f27, $End
+
+ fabs $f12, $f20
+ ldi $0, 1($0)
+ unop
+ fbne $f28, $End
+
+ fabs $f13, $f21
+ ldi $0, 1($0)
+ fbne $f29, $End
+ bgt $1, $L22
+ .align 4
+
+$L23:
+ fabs $f14, $f22
+ fabs $f15, $f23
+ fabs $f16, $f24
+ fabs $f17, $f25
+
+ faddd $f18, $f19, $f4
+ faddd $f20, $f21, $f5
+ faddd $f22, $f23, $f6
+ faddd $f24, $f25, $f7
+
+ fcmpeq $f0, $f4, $f26
+ fcmpeq $f0, $f5, $f27
+ fcmpeq $f0, $f6, $f28
+ fcmpeq $f0, $f7, $f29
+
+ ldi $0, 1($0)
+ fbne $f26, $End
+ ldi $0, 1($0)
+ fbne $f27, $End
+ ldi $0, 1($0)
+ fbne $f28, $End
+ ldi $0, 1($0)
+ fbne $f29, $End
+ .align 4
+
+$L40:
+ LD $f10, 0 * SIZE(XX)
+ LD $f11, 1 * SIZE(XX)
+
+ addl XX, INCX, XX
+
+ fabs $f10, $f18
+ fabs $f11, $f19
+
+ faddd $f18, $f19, $f18
+ fcmpeq $f0, $f18, $f2
+
+ ldi $0, 1($0)
+ fbne $f2, $End
+ br $31, $L40
+ .align 4
+
+$End:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+ ldi $sp, STACKSIZE($sp)
+ ret
+
+ EPILOGUE
diff --git a/kernel/sw_64/izamax_simd.S b/kernel/sw_64/izamax_simd.S
new file mode 100644
index 0000000..8b00f60
--- /dev/null
+++ b/kernel/sw_64/izamax_simd.S
@@ -0,0 +1,609 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define PREFETCHSIZE 96
+
+#define N $16
+#define X $17
+#define INCX $18
+#define XX $19
+
+#define I $2
+
+#ifndef USE_MIN
+#define CMPLT(a, b) fcmplt a, b
+#define VCMPLT(a, b) vfcmplt a, b
+#else
+#define CMPLT(a, b) fcmplt b, a
+#define VCMPLT(a, b) vfcmplt b, a
+#endif
+
+#define STACKSIZE 8 * 8
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, STACKSIZE, $26, 0
+
+ ldi $sp, -STACKSIZE($sp)
+
+ fstd $f2, 0($sp)
+ fclr $f16
+ cmplt $31, N, $2
+ unop
+
+ fstd $f3, 8($sp)
+ fclr $f17
+ cmplt $31, INCX, $3
+ unop
+
+ fstd $f4, 16($sp)
+ fclr $f18
+ SXADDQ INCX, $31, INCX
+ unop
+
+ fstd $f5, 24($sp)
+ fclr $f19
+ and $2, $3, $2
+ clr $0
+
+ fstd $f6, 32($sp)
+ mov X, XX
+
+ fstd $f7, 40($sp)
+ fstd $f8, 48($sp)
+ fstd $f9, 56($sp)
+
+ fclr $f0
+ cmpeq INCX, SIZE, $3
+	and X, (VEC_LEN*SIZE-1), $4 # test the address of X (alignment)
+ beq $2, $End # if (n <= 0) or (incx <= 0) return
+ .align 4
+
+ bic $3, $4, $3
+ nop
+ nop
+ beq $3, $Sub
+ .align 4
+
+$Align_Access:
+/*
+ Unroll 8*2=16 reals
+*/
+#ifdef USE_MIN
+ LD $f20, 0 * SIZE(X)
+ LD $f21, 1 * SIZE(X)
+ fabs $f20, $f20
+ fabs $f21, $f21
+ ADD $f20, $f21, $f0 # init temp min result value
+#endif
+ sra N, 3, I
+ and N, 7, $3
+ addl INCX, INCX, INCX
+ ble I, $Remain
+ .align 4
+/*
+ Init max or min value
+*/
+ LD $f20, 0 * SIZE(X)
+ LD $f21, 1 * SIZE(X)
+ fabs $f20, $f20
+ fabs $f21, $f21
+
+ ADD $f20, $f21, $f4
+ nop
+ vcpyf $f4, $f0
+ vcpyf $f4, $f1
+
+
+ VLD $f22, 0*VEC_LEN*SIZE(X)
+ VLD $f23, 1*VEC_LEN*SIZE(X)
+ VLD $f24, 2*VEC_LEN*SIZE(X)
+ VLD $f25, 3*VEC_LEN*SIZE(X)
+
+ /*vfabs*/
+ vcpys $f31, $f22, $f10
+ subl I, 1, I
+ vcpys $f31, $f23, $f11
+ addl X, 16*SIZE, X
+
+ vcpys $f31, $f24, $f12
+ nop
+ vcpys $f31, $f25, $f13
+ ble I, $MainLoopEnd
+ .align 4
+
+$MainLoop:
+ vextf $f10, 1, $f4
+ VLD $f22, 0*VEC_LEN*SIZE(X)
+ vextf $f10, 3, $f5
+ VLD $f23, 1*VEC_LEN*SIZE(X)
+
+ vextf $f11, 0, $f6
+ VLD $f24, 2*VEC_LEN*SIZE(X)
+ vextf $f11, 2, $f7
+ VLD $f25, 3*VEC_LEN*SIZE(X)
+
+ vextf $f12, 1, $f14
+ vextf $f12, 3, $f15
+ vextf $f13, 0, $f16
+ vextf $f13, 2, $f17
+
+ vinsf $f4, $f11, 0, $f11
+ vinsf $f6, $f10, 1, $f10
+ vinsf $f14, $f13, 0, $f13
+ vinsf $f16, $f12, 1, $f12
+
+ vinsf $f5, $f11, 2, $f11
+ vinsf $f7, $f10, 3, $f10
+ vinsf $f15, $f13, 2, $f13
+ vinsf $f17, $f12, 3, $f12
+
+ VADD $f10, $f11, $f2
+ addl X, 16*SIZE, X
+ VADD $f12, $f13, $f3
+ subl I, 1, I
+
+ vcpys $f31, $f22, $f10
+ vcpys $f31, $f23, $f11
+ VCMPLT($f0, $f2), $f18
+ VCMPLT($f1, $f3), $f19
+
+ vcpys $f31, $f24, $f12
+ fillcs PREFETCHSIZE * SIZE(X)
+ vcpys $f31, $f25, $f13
+ nop
+
+ vfseleq $f18, $f0, $f2, $f0
+ vfseleq $f19, $f1, $f3, $f1
+ nop
+ bgt I, $MainLoop
+ .align 4
+
+$MainLoopEnd:
+/* split the complex vectors into a real vector ($f10, $f12) and an imaginary vector ($f11, $f13) */
+ vextf $f10, 1, $f4
+ vextf $f10, 3, $f5
+ vextf $f11, 0, $f6
+ vextf $f11, 2, $f7
+
+ vextf $f12, 1, $f14
+ vextf $f12, 3, $f15
+ vextf $f13, 0, $f16
+ vextf $f13, 2, $f17
+
+ vinsf $f4, $f11, 0, $f11
+ vinsf $f6, $f10, 1, $f10
+ vinsf $f14, $f13, 0, $f13
+ vinsf $f16, $f12, 1, $f12
+
+ vinsf $f5, $f11, 2, $f11
+ vinsf $f7, $f10, 3, $f10
+ vinsf $f15, $f13, 2, $f13
+ vinsf $f17, $f12, 3, $f12
+
+ VADD $f10, $f11, $f2
+ VADD $f12, $f13, $f3
+ VCMPLT($f0, $f2), $f18
+ VCMPLT($f1, $f3), $f19
+
+ vfseleq $f18, $f0, $f2, $f0
+ vfseleq $f19, $f1, $f3, $f1
+/* find the max or min between $f0 and $f1 */
+ VCMPLT($f0, $f1), $f18
+ vfseleq $f18, $f0, $f1, $f0
+
+
+ vextf $f0, 1, $f22
+ vextf $f0, 2, $f23
+ vextf $f0, 3, $f24
+ CMPLT($f0, $f22), $f16
+
+ CMPLT($f23, $f24), $f17
+ fseleq $f16, $f0, $f22, $f0
+ fseleq $f17, $f23, $f24, $f23
+ CMPLT($f0, $f23), $f18
+
+ fseleq $f18, $f0, $f23, $f0
+ nop
+ .align 4
+$Remain:
+ ble $3, $Continuous_FindIndex
+ .align 4
+$RemainLoop:
+ LD $f20, 0 * SIZE(X)
+ LD $f21, 1 * SIZE(X)
+ unop
+ addl X, 2*SIZE, X
+
+ fabs $f20, $f29
+ fabs $f21, $f30
+ ADD $f29, $f30, $f29
+
+ CMPLT($f0, $f29), $f16
+ fselne $f16,$f29,$f0, $f0
+
+ subl $3, 1, $3
+ bgt $3, $RemainLoop
+ .align 4
+
+ /*find index*/
+$Continuous_FindIndex:
+
+ jmp $L20
+
+$Sub:
+ LD $f20, 0 * SIZE(X)
+ LD $f21, 1 * SIZE(X)
+ sra N, 2, $1
+ addl INCX, INCX, INCX
+
+ fabs $f20, $f20
+ fabs $f21, $f21
+ ADD $f20, $f21, $f0
+ ble $1, $L15
+ .align 4
+
+ ldi $1, -1($1)
+ unop
+ addl X, INCX, X
+ unop
+
+ LD $f22, 0 * SIZE(X)
+ fmov $f0, $f1
+ LD $f23, 1 * SIZE(X)
+ addl X, INCX, X
+
+ LD $f24, 0 * SIZE(X)
+ fmov $f0, $f2
+ LD $f25, 1 * SIZE(X)
+ addl X, INCX, X
+
+ LD $f26, 0 * SIZE(X)
+ fmov $f0, $f3
+ LD $f27, 1 * SIZE(X)
+ addl X, INCX, X
+
+ fabs $f20, $f8
+ fabs $f21, $f9
+ fabs $f22, $f10
+ fabs $f23, $f11
+
+ fabs $f24, $f12
+ fabs $f25, $f13
+ fabs $f26, $f14
+ fabs $f27, $f15
+
+ ble $1, $L14
+ .align 4
+
+ LD $f20, 0 * SIZE(X)
+ LD $f21, 1 * SIZE(X)
+ ldi $1, -1($1)
+ addl X, INCX, X
+
+ LD $f22, 0 * SIZE(X)
+ LD $f23, 1 * SIZE(X)
+ unop
+ addl X, INCX, X
+
+ LD $f24, 0 * SIZE(X)
+ LD $f25, 1 * SIZE(X)
+ unop
+ addl X, INCX, X
+
+ LD $f26, 0 * SIZE(X)
+ LD $f27, 1 * SIZE(X)
+ addl X, INCX, X
+ ble $1, $L13
+ .align 4
+
+$L12:
+ ADD $f8, $f9, $f16
+ fillcs PREFETCHSIZE * SIZE(X)
+ fabs $f20, $f8
+ fillcs 64 * SIZE(X)
+
+ ADD $f10, $f11, $f17
+ unop
+ fabs $f21, $f9
+ LD $f20, 0 * SIZE(X)
+
+ ADD $f12, $f13, $f18
+ LD $f21, 1 * SIZE(X)
+ fabs $f22, $f10
+ addl X, INCX, X
+
+ ADD $f14, $f15, $f19
+ LD $f22, 0 * SIZE(X)
+ fabs $f23, $f11
+ unop
+
+ CMPLT($f0, $f16), $f4
+ LD $f23, 1 * SIZE(X)
+ fabs $f24, $f12
+ addl X, INCX, X
+
+ CMPLT($f1, $f17), $f5
+ LD $f24, 0 * SIZE(X)
+ fabs $f25, $f13
+ unop
+
+ CMPLT($f2, $f18), $f6
+ LD $f25, 1 * SIZE(X)
+ fabs $f26, $f14
+ addl X, INCX, X
+
+ CMPLT($f3, $f19), $f7
+ LD $f26, 0 * SIZE(X)
+ fabs $f27, $f15
+ unop
+
+ fselne $f4,$f16,$f0, $f0
+ LD $f27, 1 * SIZE(X)
+ addl X, INCX, X
+ ldi $1, -1($1) # i --
+
+ fselne $f5,$f17,$f1, $f1
+ fselne $f6,$f18,$f2, $f2
+ fselne $f7,$f19,$f3, $f3
+ bgt $1,$L12
+ .align 4
+
+$L13:
+ ADD $f8, $f9, $f16
+ fabs $f20, $f8
+
+ ADD $f10, $f11, $f17
+ fabs $f21, $f9
+
+ ADD $f12, $f13, $f18
+ fabs $f22, $f10
+
+ ADD $f14, $f15, $f19
+ fabs $f23, $f11
+
+ CMPLT($f0, $f16), $f4
+ fabs $f24, $f12
+
+ CMPLT($f1, $f17), $f5
+ fabs $f25, $f13
+
+ CMPLT($f2, $f18), $f6
+ fabs $f26, $f14
+ CMPLT($f3, $f19), $f7
+ fabs $f27, $f15
+
+ fselne $f4,$f16,$f0, $f0
+ fselne $f5,$f17,$f1, $f1
+ fselne $f6,$f18,$f2, $f2
+ fselne $f7,$f19,$f3, $f3
+ .align 4
+
+$L14:
+ ADD $f8, $f9, $f16
+ ADD $f10, $f11, $f17
+ ADD $f12, $f13, $f18
+ ADD $f14, $f15, $f19
+
+ CMPLT($f0, $f16), $f4
+ CMPLT($f1, $f17), $f5
+ CMPLT($f2, $f18), $f6
+ CMPLT($f3, $f19), $f7
+
+ fselne $f4,$f16,$f0, $f0
+ fselne $f5,$f17,$f1, $f1
+ fselne $f6,$f18,$f2, $f2
+ fselne $f7,$f19,$f3, $f3
+
+ CMPLT($f0, $f1), $f16
+ CMPLT($f2, $f3), $f17
+
+ fselne $f16,$f1,$f0, $f0
+ fselne $f17,$f3,$f2, $f2
+
+ CMPLT($f0, $f2), $f16
+ fselne $f16,$f2,$f0, $f0
+ .align 4
+
+$L15:
+ and N, 3, $1
+ unop
+ unop
+ ble $1, $L20
+ .align 4
+
+$L16:
+ LD $f20, 0 * SIZE(X)
+ LD $f21, 1 * SIZE(X)
+ unop
+ addl X, INCX, X
+
+ fabs $f20, $f29
+ fabs $f21, $f30
+ ADD $f29, $f30, $f29
+
+ CMPLT($f0, $f29), $f16
+ fselne $f16,$f29,$f0, $f0
+
+ ldi $1, -1($1) # i --
+ bgt $1, $L16
+ .align 4
+
+$L20:
+ sra N, 2, $1
+ ble $1, $L40
+ .align 4
+
+ LD $f10, 0 * SIZE(XX)
+ LD $f11, 1 * SIZE(XX)
+ addl XX, INCX, XX
+
+ LD $f12, 0 * SIZE(XX)
+ LD $f13, 1 * SIZE(XX)
+ addl XX, INCX, XX
+
+ LD $f14, 0 * SIZE(XX)
+ LD $f15, 1 * SIZE(XX)
+ addl XX, INCX, XX
+
+ LD $f16, 0 * SIZE(XX)
+ LD $f17, 1 * SIZE(XX)
+ addl XX, INCX, XX
+
+ fabs $f10, $f18
+ fabs $f11, $f19
+ fabs $f12, $f20
+ fabs $f13, $f21
+
+ ldi $1, -1($1)
+ ble $1, $L23
+ .align 4
+
+$L22:
+ LD $f10, 0 * SIZE(XX)
+ fabs $f14, $f22
+ LD $f11, 1 * SIZE(XX)
+ addl XX, INCX, XX
+
+ LD $f12, 0 * SIZE(XX)
+ fabs $f15, $f23
+ LD $f13, 1 * SIZE(XX)
+ addl XX, INCX, XX
+
+ LD $f14, 0 * SIZE(XX)
+ fabs $f16, $f24
+ LD $f15, 1 * SIZE(XX)
+ addl XX, INCX, XX
+
+ LD $f16, 0 * SIZE(XX)
+ fabs $f17, $f25
+ LD $f17, 1 * SIZE(XX)
+ addl XX, INCX, XX
+
+ ADD $f18, $f19, $f4
+ ADD $f20, $f21, $f5
+ ADD $f22, $f23, $f6
+ ADD $f24, $f25, $f7
+
+ fcmpeq $f0, $f4, $f26
+ fcmpeq $f0, $f5, $f27
+ fcmpeq $f0, $f6, $f28
+ fcmpeq $f0, $f7, $f29
+
+ fabs $f10, $f18
+ ldi $0, 1($0)
+ ldi $1, -1($1) # i --
+ fbne $f26, $End
+
+ fabs $f11, $f19
+ ldi $0, 1($0)
+ unop
+ fbne $f27, $End
+
+ fabs $f12, $f20
+ ldi $0, 1($0)
+ fillcs PREFETCHSIZE * SIZE(X)
+ fbne $f28, $End
+
+ fabs $f13, $f21
+ ldi $0, 1($0)
+ fbne $f29, $End
+ bgt $1, $L22
+ .align 4
+
+$L23:
+ fabs $f14, $f22
+ fabs $f15, $f23
+ fabs $f16, $f24
+ fabs $f17, $f25
+
+ ADD $f18, $f19, $f4
+ ADD $f20, $f21, $f5
+ ADD $f22, $f23, $f6
+ ADD $f24, $f25, $f7
+
+ fcmpeq $f0, $f4, $f26
+ fcmpeq $f0, $f5, $f27
+ fcmpeq $f0, $f6, $f28
+ fcmpeq $f0, $f7, $f29
+
+ ldi $0, 1($0)
+ fbne $f26, $End
+ ldi $0, 1($0)
+ fbne $f27, $End
+ ldi $0, 1($0)
+ fbne $f28, $End
+ ldi $0, 1($0)
+ fbne $f29, $End
+ .align 4
+
+$L40:
+ LD $f10, 0 * SIZE(XX)
+ LD $f11, 1 * SIZE(XX)
+
+ addl XX, INCX, XX
+
+ fabs $f10, $f18
+ fabs $f11, $f19
+
+ ADD $f18, $f19, $f18
+ fcmpeq $f0, $f18, $f2
+
+ ldi $0, 1($0)
+ fbne $f2, $End
+ br $31, $L40
+ .align 4
+
+$End:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+ ldi $sp, STACKSIZE($sp)
+ ret
+
+ EPILOGUE
diff --git a/kernel/sw_64/lsame.S b/kernel/sw_64/lsame.S
new file mode 100644
index 0000000..c2c0863
--- /dev/null
+++ b/kernel/sw_64/lsame.S
@@ -0,0 +1,77 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#include "version.h"
+
+ .set noat
+ .set noreorder
+.text
+ .align 5
+ .globl lsame_
+ .ent lsame_
+lsame_:
+ .frame $sp,0,$26,0
+#ifdef PROFILE
+ ldgp $gp, 0($27)
+ lda $28, _mcount
+ jsr $28, ($28), _mcount
+ .prologue 1
+#else
+ .prologue 0
+#endif
+
+ ldbu $5, 0($16)
+ ldbu $6, 0($17)
+// extb $2, $5
+// extbl $3, $6
+
+ subl $5, 96, $1
+ subl $6, 96, $2
+ subl $5, 32, $3
+ subl $6, 32, $4
+
+
+ selgt $1, $3, $5, $5
+ selgt $2, $4, $6, $6
+ cmpeq $5, $6, $0
+ .align 4
+
+$End:
+ ret
+ .end lsame_
+ .ident VERSION
diff --git a/kernel/sw_64/max.S b/kernel/sw_64/max.S
new file mode 100644
index 0000000..07925d1
--- /dev/null
+++ b/kernel/sw_64/max.S
@@ -0,0 +1,227 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define N $16
+#define X $17
+#define INCX $18
+
+#ifndef USE_MIN
+#define CMPLT(a, b) fcmplt a, b
+#else
+#define CMPLT(a, b) fcmplt b, a
+#endif
+
+#define STACKSIZE 8 * 8
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, STACKSIZE, $26, 0
+
+#ifdef F_INTERFACE
+ ldl N, 0(N) # n
+ ldl INCX, 0(INCX) # incx
+#endif
+ ldi $sp, -STACKSIZE($sp)
+ nop
+ .align 4
+
+ cmplt $31, N, $2
+ cmplt $31, INCX, $3
+ SXADDQ INCX, $31, INCX
+ and $2, $3, $0
+
+ sra N, 3, $1
+ fclr $f0
+ unop
+ beq $0, $End # if (n <= 0) or (incx <= 0) return
+ .align 4
+
+ LD $f0, 0 * SIZE(X)
+ unop
+ unop
+ ble $1, $L15
+ .align 4
+
+ fmov $f0, $f1
+ addl X, INCX, X
+ fmov $f0, $f10
+ ldi $1, -1($1)
+
+ LD $f21, 0 * SIZE(X)
+ fmov $f0, $f11
+ addl X, INCX, X
+ fmov $f0, $f12
+
+ LD $f22, 0 * SIZE(X)
+ fmov $f0, $f13
+ addl X, INCX, X
+ fmov $f0, $f14
+
+ LD $f23, 0 * SIZE(X)
+ fmov $f0, $f15
+ addl X, INCX, X
+ fmov $f0, $f20
+
+ LD $f24, 0 * SIZE(X)
+ addl X, INCX, X
+ LD $f25, 0 * SIZE(X)
+ addl X, INCX, X
+ LD $f26, 0 * SIZE(X)
+ addl X, INCX, X
+ LD $f27, 0 * SIZE(X)
+ addl X, INCX, X
+
+ CMPLT($f0, $f20), $f16
+ CMPLT($f1, $f21), $f17
+ CMPLT($f10, $f22), $f18
+ CMPLT($f11, $f23), $f19
+
+ ble $1, $L13
+ .align 4
+
+$L12:
+ fselne $f16, $f20, $f0, $f0
+ LD $f20, 0 * SIZE(X)
+ CMPLT($f12, $f24), $f16
+ addl X, INCX, X
+
+ fselne $f17, $f21, $f1, $f1
+ LD $f21, 0 * SIZE(X)
+ CMPLT($f13, $f25), $f17
+ addl X, INCX, X
+
+ fselne $f18, $f22, $f10, $f10
+ LD $f22, 0 * SIZE(X)
+ CMPLT($f14, $f26), $f18
+ addl X, INCX, X
+
+ fselne $f19, $f23, $f11, $f11
+ LD $f23, 0 * SIZE(X)
+ CMPLT($f15, $f27), $f19
+ addl X, INCX, X
+
+ fselne $f16, $f24, $f12, $f12
+ LD $f24, 0 * SIZE(X)
+ CMPLT($f0, $f20), $f16
+ addl X, INCX, X
+
+ fselne $f17, $f25, $f13, $f13
+ LD $f25, 0 * SIZE(X)
+ CMPLT($f1, $f21), $f17
+ addl X, INCX, X
+
+ fselne $f18, $f26, $f14, $f14
+ LD $f26, 0 * SIZE(X)
+ CMPLT($f10, $f22), $f18
+ addl X, INCX, X
+
+ fselne $f19, $f27, $f15, $f15
+ LD $f27, 0 * SIZE(X)
+ CMPLT($f11, $f23), $f19
+ ldi $1, -1($1) # i --
+
+ addl X, INCX, X
+ unop
+ unop
+ bgt $1,$L12
+ .align 4
+
+$L13:
+ fselne $f16, $f20, $f0, $f0
+ CMPLT($f12, $f24), $f16
+
+ fselne $f17, $f21, $f1, $f1
+ CMPLT($f13, $f25), $f17
+
+ fselne $f18, $f22, $f10, $f10
+ CMPLT($f14, $f26), $f18
+
+ fselne $f19, $f23, $f11, $f11
+ CMPLT($f15, $f27), $f19
+
+ fselne $f16, $f24, $f12, $f12
+ CMPLT($f0, $f1), $f16
+ fselne $f17, $f25, $f13, $f13
+ CMPLT($f10, $f11), $f17
+
+ fselne $f18, $f26, $f14, $f14
+ CMPLT($f12, $f13), $f18
+ fselne $f19, $f27, $f15, $f15
+ CMPLT($f14, $f15), $f19
+
+ fselne $f16, $f1, $f0, $f0
+ fselne $f17, $f11, $f10, $f10
+ fselne $f18, $f13, $f12, $f12
+ fselne $f19, $f15, $f14, $f14
+
+ CMPLT($f0, $f10), $f16
+ CMPLT($f12, $f14), $f17
+
+ fselne $f16, $f10, $f0, $f0
+ fselne $f17, $f14, $f12, $f12
+
+ CMPLT($f0, $f12), $f16
+ fselne $f16, $f12, $f0, $f0
+ .align 4
+
+$L15:
+ and N, 7, $1
+ unop
+ unop
+ ble $1, $End
+ .align 4
+
+$L16:
+ LD $f20, 0 * SIZE(X)
+ addl X, INCX, X
+
+ CMPLT($f0, $f20), $f16
+ fselne $f16, $f20, $f0, $f0
+ ldi $1, -1($1) # i --
+ bgt $1, $L16
+ .align 4
+
+$End:
+ ldi $sp, STACKSIZE($sp)
+ ret
+
+ EPILOGUE
diff --git a/kernel/sw_64/nrm2_simd.S b/kernel/sw_64/nrm2_simd.S
new file mode 100644
index 0000000..0888454
--- /dev/null
+++ b/kernel/sw_64/nrm2_simd.S
@@ -0,0 +1,493 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+#include "version.h"
+
+#define PREFETCHSIZE 80
+
+#define N $16
+#define X $17
+#define INCX $18
+#define XX $19
+
+#define I $0
+
+#define a0 $f0
+#define a1 $f1
+#define a2 $f10
+#define a3 $f11
+#define t0 $f12
+#define t1 $f13
+#define t2 $f14
+#define t3 $f15
+
+#define x0 $f16
+#define x1 $f17
+#define x2 $f18
+#define x3 $f19
+#define x4 $f20
+#define x5 $f21
+#define x6 $f22
+#define x7 $f23
+
+ PROLOGUE
+
+
+ PROFCODE
+
+
+ fclr a0
+ SXADDQ INCX, 0, INCX
+ fclr a1
+ ble N, $L999
+
+ fclr a2
+ cmpeq INCX, SIZE, $0
+ fclr a3
+	beq $0, $L20 # non-unit stride access
+
+/* test the address of X */
+ and X, (VEC_LEN*SIZE-1), $3
+ fclr t0
+ nop
+ bne $3, $UnAlign_ACCESS
+/* Aligned access: use SIMD instructions. */
+ sra N, 4, I
+ ble I, $Remain
+
+ VLD a0, 0*VEC_LEN*SIZE(X)
+	vcpys $f31, $f31, t0 # clear accumulator vector t0
+ VLD a1, 1*VEC_LEN*SIZE(X)
+ vcpys $f31, $f31, t1
+
+ VLD a2, 2*VEC_LEN*SIZE(X)
+ vcpys $f31, $f31, t2
+ VLD a3, 3*VEC_LEN*SIZE(X)
+ vcpys $f31, $f31, t3
+
+ addl X, 16 * SIZE, X
+ subl I, 1, I
+ nop
+ ble I, $MainLoopEnd
+$MainLoop:
+ fillcs PREFETCHSIZE * SIZE(X)
+ VMAD a0, a0, t0, t0
+ subl I, 1, I
+ VMAD a1, a1, t1, t1
+
+ addl X, 16 * SIZE, X
+ VMAD a2, a2, t2, t2
+ nop
+ VMAD a3, a3, t3, t3
+
+ VLD a0, -4*VEC_LEN*SIZE(X)
+ VLD a1, -3*VEC_LEN*SIZE(X)
+ VLD a2, -2*VEC_LEN*SIZE(X)
+ VLD a3, -1*VEC_LEN*SIZE(X)
+
+ bgt I, $MainLoop
+ .align 4
+$MainLoopEnd:
+ VMAD a0, a0, t0, t0
+ VMAD a1, a1, t1, t1
+ VMAD a2, a2, t2, t2
+ VMAD a3, a3, t3, t3
+
+ VADD t0, t1, a0
+ VADD t2, t3, a1
+ nop
+ VADD a0, a1, t0
+
+ vextf t0, 1, t1
+ vextf t0, 2, t2
+ vextf t0, 3, t3
+ nop
+
+ ADD t0, t1, a2
+ ADD t2, t3, a3
+ nop
+ ADD a2, a3, t0
+
+ .align 4
+$Remain:
+ and N, 15, I
+ ble I, $End
+ .align 4
+$RemainLoop:
+ LD a0, 0 * SIZE(X)
+ addl X, SIZE, X
+ MAD a0, a0, t0, t0
+ subl I, 1, I
+
+ bgt I, $RemainLoop
+ .align 4
+$End:
+ SQRT t0, a0
+ ret
+ .align 4
+
+/* Unaligned access: do not use SIMD */
+
+$UnAlign_ACCESS:
+
+ fclr t0
+ sra N, 4, I
+ fclr t1
+ ble I, $L15
+
+ fclr t2
+ LD x0, 0 * SIZE(X)
+ fclr t3
+ LD x1, 1 * SIZE(X)
+
+ LD x2, 2 * SIZE(X)
+ LD x3, 3 * SIZE(X)
+ LD x4, 4 * SIZE(X)
+ LD x5, 5 * SIZE(X)
+ LD x6, 6 * SIZE(X)
+ LD x7, 7 * SIZE(X)
+
+ ldi I, -1(I)
+ ble I, $L12
+ .align 4
+
+$L11:
+ ADD a0, t0, a0
+ fillcs (PREFETCHSIZE) * SIZE(X)
+ MUL x0, x0, t0
+ LD x0, 8 * SIZE(X)
+
+ ADD a1, t1, a1
+ mov X, XX
+ MUL x1, x1, t1
+ LD x1, 9 * SIZE(X)
+
+ ADD a2, t2, a2
+ unop
+ MUL x2, x2, t2
+ LD x2, 10 * SIZE(X)
+
+ ADD a3, t3, a3
+ unop
+ MUL x3, x3, t3
+ LD x3, 11 * SIZE(X)
+
+ ADD a0, t0, a0
+ unop
+ MUL x4, x4, t0
+ LD x4, 12 * SIZE(X)
+
+ ADD a1, t1, a1
+ unop
+ MUL x5, x5, t1
+ LD x5, 13 * SIZE(X)
+
+ ADD a2, t2, a2
+ unop
+ MUL x6, x6, t2
+ LD x6, 14 * SIZE(X)
+
+ ADD a3, t3, a3
+ unop
+ MUL x7, x7, t3
+ LD x7, 15 * SIZE(X)
+
+ ADD a0, t0, a0
+ unop
+ MUL x0, x0, t0
+ LD x0, 16 * SIZE(X)
+
+ ADD a1, t1, a1
+ ldi X, 16 * SIZE(X)
+ MUL x1, x1, t1
+ LD x1, 17 * SIZE(XX)
+
+ ADD a2, t2, a2
+ unop
+ MUL x2, x2, t2
+ LD x2, 18 * SIZE(XX)
+
+ ADD a3, t3, a3
+ unop
+ MUL x3, x3, t3
+ LD x3, 19 * SIZE(XX)
+
+ ADD a0, t0, a0
+ unop
+ MUL x4, x4, t0
+ LD x4, 20 * SIZE(XX)
+
+ ADD a1, t1, a1
+ ldi I, -1(I)
+ MUL x5, x5, t1
+ LD x5, 21 * SIZE(XX)
+
+ ADD a2, t2, a2
+ unop
+ MUL x6, x6, t2
+ LD x6, 22 * SIZE(XX)
+
+ ADD a3, t3, a3
+ MUL x7, x7, t3
+ LD x7, 23 * SIZE(XX)
+ bgt I, $L11
+ .align 4
+
+$L12:
+ ADD a0, t0, a0
+ mov X, XX
+ MUL x0, x0, t0
+ LD x0, 8 * SIZE(X)
+
+ ADD a1, t1, a1
+ unop
+ MUL x1, x1, t1
+ LD x1, 9 * SIZE(X)
+
+ ADD a2, t2, a2
+ unop
+ MUL x2, x2, t2
+ LD x2, 10 * SIZE(X)
+
+ ADD a3, t3, a3
+ unop
+ MUL x3, x3, t3
+ LD x3, 11 * SIZE(X)
+
+ ADD a0, t0, a0
+ unop
+ MUL x4, x4, t0
+ LD x4, 12 * SIZE(XX)
+
+ ADD a1, t1, a1
+ unop
+ MUL x5, x5, t1
+ LD x5, 13 * SIZE(XX)
+
+ ADD a2, t2, a2
+ unop
+ MUL x6, x6, t2
+ LD x6, 14 * SIZE(XX)
+
+ ADD a3, t3, a3
+ ldi X, 16 * SIZE(X)
+ MUL x7, x7, t3
+ LD x7, 15 * SIZE(XX)
+
+ ADD a0, t0, a0
+ MUL x0, x0, t0
+ ADD a1, t1, a1
+ MUL x1, x1, t1
+
+ ADD a2, t2, a2
+ MUL x2, x2, t2
+ ADD a3, t3, a3
+ MUL x3, x3, t3
+
+ ADD a0, t0, a0
+ MUL x4, x4, t0
+ ADD a1, t1, a1
+ MUL x5, x5, t1
+
+ ADD a2, t2, a2
+ MUL x6, x6, t2
+ ADD a3, t3, a3
+ MUL x7, x7, t3
+
+ ADD a1, t1, a1
+ ADD a2, t2, a2
+ ADD a3, t3, a3
+ .align 4
+
+$L15:
+ and N, 15, I
+ ble I, $L998
+ .align 4
+
+$L16:
+ LD x0, 0 * SIZE(X)
+ ldi X, 1 * SIZE(X)
+
+ ADD a0, t0, a0
+ MUL x0, x0, t0
+
+ ldi I, -1(I)
+ bgt I, $L16
+ bsr $31, $L998
+ .align 4
+
+$L20:
+ fclr t0
+ sra N, 3, I
+ fclr t1
+ ble I, $L25
+
+ fclr t2
+ fclr t3
+
+ LD x0, 0 * SIZE(X)
+ addl X, INCX, X
+ LD x1, 0 * SIZE(X)
+ addl X, INCX, X
+ LD x2, 0 * SIZE(X)
+ addl X, INCX, X
+ LD x3, 0 * SIZE(X)
+ addl X, INCX, X
+
+ LD x4, 0 * SIZE(X)
+ addl X, INCX, X
+ LD x5, 0 * SIZE(X)
+ addl X, INCX, X
+ LD x6, 0 * SIZE(X)
+ addl X, INCX, X
+
+ ldi I, -1(I)
+ ble I, $L22
+ .align 4
+
+$L21:
+ ADD a0, t0, a0
+ LD x7, 0 * SIZE(X)
+ MUL x0, x0, t0
+ addl X, INCX, X
+
+ ADD a1, t1, a1
+ LD x0, 0 * SIZE(X)
+ MUL x1, x1, t1
+ addl X, INCX, X
+
+ ADD a2, t2, a2
+ LD x1, 0 * SIZE(X)
+ MUL x2, x2, t2
+ addl X, INCX, X
+
+ ADD a3, t3, a3
+ LD x2, 0 * SIZE(X)
+ MUL x3, x3, t3
+ addl X, INCX, X
+
+ ADD a0, t0, a0
+ LD x3, 0 * SIZE(X)
+ MUL x4, x4, t0
+ addl X, INCX, X
+
+ ADD a1, t1, a1
+ LD x4, 0 * SIZE(X)
+ MUL x5, x5, t1
+ addl X, INCX, X
+
+ ADD a2, t2, a2
+ LD x5, 0 * SIZE(X)
+ MUL x6, x6, t2
+ addl X, INCX, X
+
+ ADD a3, t3, a3
+ LD x6, 0 * SIZE(X)
+ MUL x7, x7, t3
+ addl X, INCX, X
+
+ ldi I, -1(I)
+ bgt I, $L21
+ .align 4
+
+$L22:
+ ADD a0, t0, a0
+ LD x7, 0 * SIZE(X)
+ MUL x0, x0, t0
+ addl X, INCX, X
+
+ ADD a1, t1, a1
+ unop
+ MUL x1, x1, t1
+ unop
+
+ ADD a2, t2, a2
+ MUL x2, x2, t2
+ ADD a3, t3, a3
+ MUL x3, x3, t3
+
+ ADD a0, t0, a0
+ MUL x4, x4, t0
+ ADD a1, t1, a1
+ MUL x5, x5, t1
+
+ ADD a2, t2, a2
+ MUL x6, x6, t2
+ ADD a3, t3, a3
+ MUL x7, x7, t3
+
+ ADD a1, t1, a1
+ ADD a2, t2, a2
+ ADD a3, t3, a3
+ .align 4
+
+$L25:
+ and N, 7, I
+ ble I, $L998
+ .align 4
+
+$L26:
+ LD x0, 0 * SIZE(X)
+ addl X, INCX, X
+
+ ADD a0, t0, a0
+ MUL x0, x0, t0
+
+ ldi I, -1(I)
+ bgt I, $L26
+ .align 4
+
+
+$L998:
+ ADD a0, t0, a0
+
+ ADD a0, a1, a0
+ ADD a2, a3, a2
+
+
+ ADD a0, a2, a0
+ SQRT a0, a0
+
+ .align 4
+
+$L999:
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/rot.S b/kernel/sw_64/rot.S
new file mode 100644
index 0000000..3c8624e
--- /dev/null
+++ b/kernel/sw_64/rot.S
@@ -0,0 +1,680 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define N $16
+#define X $17
+#define INCX $18
+#define Y $19
+#define INCY $20
+#define I $21
+#define XX $23
+#define YY $24
+
+#define C $f10
+#define S $f11
+
+#define PREFETCH_SIZE 80
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, 16, $26, 0
+
+#ifndef PROFILE
+ .prologue 0
+#else
+ .prologue 1
+#endif
+ ldi $sp, -16($sp)
+ fstd $f20, 8($sp)
+
+ fmov $f21, C
+ LD S, 16($sp)
+ cmpeq INCX, 1, $23
+ cmpeq INCY, 1, $24
+ ble N, $L998
+
+
+ and $23, $24, $23
+ beq $23, $L50
+
+ sra N, 3, I
+ ble I, $L15
+
+ LD $f12, 0*SIZE(X)
+ LD $f13, 0*SIZE(Y)
+ LD $f14, 1*SIZE(X)
+ LD $f15, 1*SIZE(Y)
+
+ LD $f16, 2*SIZE(X)
+ LD $f17, 2*SIZE(Y)
+ LD $f18, 3*SIZE(X)
+ LD $f19, 3*SIZE(Y)
+
+ MUL C, $f12, $f21
+ unop
+ MUL S, $f13, $f22
+ MUL C, $f13, $f23
+
+ LD $f13, 4*SIZE(Y)
+ MUL S, $f12, $f24
+ LD $f12, 4*SIZE(X)
+ MUL C, $f14, $f25
+
+ ldi I, -1(I)
+ MUL S, $f15, $f26
+ ADD $f21, $f22, $f20
+ fmov $f20,$f22
+ MUL C, $f15, $f27
+
+ LD $f15, 5*SIZE(Y)
+ MUL S, $f14, $f28
+ SUB $f23, $f24, $f20
+ fmov $f20,$f24
+ ble I, $L13
+ .align 4
+
+$L12:
+ MUL C, $f16, $f21
+ flds $f31, (PREFETCH_SIZE) * SIZE(X)
+ unop
+ LD $f14, 5*SIZE(X)
+
+ ST $f22, 0*SIZE(X)
+ MUL S, $f17, $f22
+ unop
+ ADD $f25, $f26, $f20
+ fmov $f20,$f26
+
+ MUL C, $f17, $f23
+ flds $f31, (PREFETCH_SIZE) * SIZE(Y)
+ unop
+ LD $f17, 6*SIZE(Y)
+
+ ST $f24, 0*SIZE(Y)
+ MUL S, $f16, $f24
+ unop
+ SUB $f27, $f28, $f20
+ fmov $f20,$f28
+
+ MUL C, $f18, $f25
+ LD $f16, 6*SIZE(X)
+ unop
+ unop
+
+ ST $f26, 1*SIZE(X)
+ MUL S, $f19, $f26
+ unop
+ ADD $f21, $f22, $f20
+ fmov $f20,$f22
+
+ MUL C, $f19, $f27
+ unop
+ unop
+ LD $f19, 7*SIZE(Y)
+
+ ST $f28, 1*SIZE(Y)
+ MUL S, $f18, $f28
+ unop
+ SUB $f23, $f24, $f20
+ fmov $f20,$f24
+
+ MUL C, $f12, $f21
+ LD $f18, 7*SIZE(X)
+ unop
+ unop
+
+ ST $f22, 2*SIZE(X)
+ unop
+ MUL S, $f13, $f22
+ ADD $f25, $f26, $f20
+ fmov $f20,$f26
+
+ MUL C, $f13, $f23
+ LD $f13, 8*SIZE(Y)
+ unop
+ unop
+
+ ST $f24, 2*SIZE(Y)
+ MUL S, $f12, $f24
+ unop
+ SUB $f27, $f28, $f20
+ fmov $f20,$f28
+
+ MUL C, $f14, $f25
+ LD $f12, 8*SIZE(X)
+ unop
+ unop
+
+ ST $f26, 3*SIZE(X)
+ MUL S, $f15, $f26
+ unop
+ ADD $f21, $f22, $f20
+ fmov $f20,$f22
+
+ MUL C, $f15, $f27
+ LD $f15, 9*SIZE(Y)
+ unop
+ unop
+
+ ST $f28, 3*SIZE(Y)
+ MUL S, $f14, $f28
+ unop
+ SUB $f23, $f24, $f20
+ fmov $f20,$f24
+
+ MUL C, $f16, $f21
+ LD $f14, 9*SIZE(X)
+ unop
+ unop
+
+ ST $f22, 4*SIZE(X)
+ MUL S, $f17, $f22
+ unop
+ ADD $f25, $f26, $f20
+ fmov $f20,$f26
+
+ MUL C, $f17, $f23
+ LD $f17, 10*SIZE(Y)
+ unop
+ unop
+
+ ST $f24, 4*SIZE(Y)
+ MUL S, $f16, $f24
+ unop
+ SUB $f27, $f28, $f20
+ fmov $f20,$f28
+
+ MUL C, $f18, $f25
+ LD $f16, 10*SIZE(X)
+ unop
+ unop
+
+ ST $f26, 5*SIZE(X)
+ MUL S, $f19, $f26
+ unop
+ ADD $f21, $f22, $f20
+ fmov $f20,$f22
+
+ MUL C, $f19, $f27
+ LD $f19, 11*SIZE(Y)
+ unop
+ unop
+
+ ST $f28, 5*SIZE(Y)
+ MUL S, $f18, $f28
+ ldi I, -1(I)
+ SUB $f23, $f24, $f20
+ fmov $f20,$f24
+
+ MUL C, $f12, $f21
+ LD $f18, 11*SIZE(X)
+ unop
+ unop
+
+ ST $f22, 6*SIZE(X)
+ MUL S, $f13, $f22
+ unop
+ ADD $f25, $f26, $f20
+ fmov $f20,$f26
+
+ MUL C, $f13, $f23
+ LD $f13, 12*SIZE(Y)
+ ldi X, 8*SIZE(X)
+ unop
+
+ ST $f24, 6*SIZE(Y)
+ MUL S, $f12, $f24
+ unop
+ SUB $f27, $f28, $f20
+ fmov $f20,$f28
+
+ MUL C, $f14, $f25
+ LD $f12, 4*SIZE(X)
+ ldi Y, 8*SIZE(Y)
+ unop
+
+ ST $f26, -1*SIZE(X)
+ MUL S, $f15, $f26
+ unop
+ ADD $f21, $f22, $f20
+ fmov $f20,$f22
+
+ MUL C, $f15, $f27
+ LD $f15, 5*SIZE(Y)
+ unop
+ unop
+
+ ST $f28, -1*SIZE(Y)
+ MUL S, $f14, $f28
+ SUB $f23, $f24, $f20
+ fmov $f20,$f24
+ bgt I, $L12
+ .align 4
+
+$L13:
+ MUL C, $f16, $f21
+ LD $f14, 5*SIZE(X)
+ unop
+ unop
+
+ ST $f22, 0*SIZE(X)
+ MUL S, $f17, $f22
+ unop
+ ADD $f25, $f26, $f20
+ fmov $f20,$f26
+
+ MUL C, $f17, $f23
+ unop
+ unop
+ LD $f17, 6*SIZE(Y)
+
+ ST $f24, 0*SIZE(Y)
+ MUL S, $f16, $f24
+ LD $f16, 6*SIZE(X)
+ SUB $f27, $f28, $f20
+ fmov $f20,$f28
+
+ MUL C, $f18, $f25
+ unop
+ unop
+ unop
+
+ ST $f26, 1*SIZE(X)
+ MUL S, $f19, $f26
+ unop
+ ADD $f21, $f22, $f20
+ fmov $f20,$f22
+
+ MUL C, $f19, $f27
+ unop
+ unop
+ LD $f19, 7*SIZE(Y)
+
+ ST $f28, 1*SIZE(Y)
+ MUL S, $f18, $f28
+ SUB $f23, $f24, $f18
+ fmov $f18,$f24
+ LD $f18, 7*SIZE(X)
+
+ MUL C, $f12, $f21
+ unop
+ unop
+ unop
+
+ ST $f22, 2*SIZE(X)
+ unop
+ MUL S, $f13, $f22
+ ADD $f25, $f26, $f20
+ fmov $f20,$f26
+
+ MUL C, $f13, $f23
+ unop
+ unop
+ unop
+
+ ST $f24, 2*SIZE(Y)
+ MUL S, $f12, $f24
+ unop
+ SUB $f27, $f28, $f20
+ fmov $f20,$f28
+
+ MUL C, $f14, $f25
+ unop
+ unop
+ unop
+
+ ST $f26, 3*SIZE(X)
+ MUL S, $f15, $f26
+ unop
+ ADD $f21, $f22, $f20
+ fmov $f20,$f22
+
+ MUL C, $f15, $f27
+ unop
+ unop
+ unop
+
+ ST $f28, 3*SIZE(Y)
+ MUL S, $f14, $f28
+ unop
+ SUB $f23, $f24, $f20
+ fmov $f20,$f24
+
+ MUL C, $f16, $f21
+ unop
+ unop
+ unop
+
+ ST $f22, 4*SIZE(X)
+ MUL S, $f17, $f22
+ unop
+ ADD $f25, $f26, $f20
+ fmov $f20,$f26
+
+ MUL C, $f17, $f23
+ unop
+ unop
+ unop
+
+ ST $f24, 4*SIZE(Y)
+ MUL S, $f16, $f24
+ unop
+ SUB $f27, $f28, $f20
+ fmov $f20,$f28
+
+ MUL C, $f18, $f25
+ unop
+ unop
+ unop
+
+ ST $f26, 5*SIZE(X)
+ MUL S, $f19, $f26
+ unop
+ ADD $f21, $f22, $f20
+ fmov $f20,$f22
+
+ MUL C, $f19, $f27
+ unop
+ unop
+ unop
+
+ ST $f28, 5*SIZE(Y)
+ MUL S, $f18, $f28
+ unop
+ SUB $f23, $f24, $f20
+ fmov $f20,$f24
+
+ ST $f22, 6*SIZE(X)
+ ADD $f25, $f26, $f20
+ fmov $f20,$f26
+ ST $f24, 6*SIZE(Y)
+ SUB $f27, $f28, $f20
+ fmov $f20,$f28
+
+ ST $f26, 7*SIZE(X)
+ ldi X, 8*SIZE(X)
+ ST $f28, 7*SIZE(Y)
+ ldi Y, 8*SIZE(Y)
+ .align 4
+
+
+$L15:
+ and N, 7, I
+ ble I, $L998
+ .align 4
+
+$L16:
+ LD $f12, 0*SIZE(X)
+ LD $f13, 0*SIZE(Y)
+
+ MUL C, $f12, $f21
+ MUL S, $f13, $f22
+ MUL C, $f13, $f23
+ MUL S, $f12, $f24
+
+ ADD $f21, $f22, $f25
+ SUB $f23, $f24, $f26
+ ldi I, -1(I)
+
+ ST $f25, 0*SIZE(X)
+ ldi X, 1 * SIZE(X)
+ ST $f26, 0*SIZE(Y)
+ ldi Y, 1 * SIZE(Y)
+
+ bgt I, $L16
+ .align 4
+
+$L998:
+ clr $0
+ fldd $f20, 8($sp)
+ ldi $sp, 16($sp)
+ ret
+ .align 4
+
+$L50:
+ mov X, XX
+ mov Y, YY
+
+ sra N, 3, I
+ ble I, $L55
+ .align 4
+
+$L51:
+ LD $f12, 0*SIZE(X)
+ SXADDQ INCX, X, X
+ LD $f13, 0*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ LD $f14, 0*SIZE(X)
+ SXADDQ INCX, X, X
+ LD $f15, 0*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ LD $f16, 0*SIZE(X)
+ SXADDQ INCX, X, X
+ LD $f17, 0*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ LD $f18, 0*SIZE(X)
+ SXADDQ INCX, X, X
+ LD $f19, 0*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ MUL C, $f12, $f21
+ MUL S, $f13, $f22
+ MUL C, $f13, $f23
+ MUL S, $f12, $f24
+
+ ADD $f21, $f22, $f20
+ fmov $f20,$f22
+ SUB $f23, $f24, $f20
+ fmov $f20,$f24
+
+ ST $f22, 0*SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST $f24, 0*SIZE(YY)
+ SXADDQ INCY, YY, YY
+
+ MUL C, $f14, $f25
+ MUL S, $f15, $f26
+ MUL C, $f15, $f27
+ MUL S, $f14, $f28
+
+ ADD $f25, $f26, $f20
+ fmov $f20,$f26
+ SUB $f27, $f28, $f20
+ fmov $f20,$f28
+
+ ST $f26, 0*SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST $f28, 0*SIZE(YY)
+ SXADDQ INCY, YY, YY
+
+ MUL C, $f16, $f21
+ MUL S, $f17, $f22
+ MUL C, $f17, $f23
+ MUL S, $f16, $f24
+
+ ADD $f21, $f22, $f20
+ fmov $f20,$f22
+ SUB $f23, $f24, $f20
+ fmov $f20,$f24
+
+ ST $f22, 0*SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST $f24, 0*SIZE(YY)
+ SXADDQ INCY, YY, YY
+
+ MUL C, $f18, $f25
+ MUL S, $f19, $f26
+ MUL C, $f19, $f27
+ MUL S, $f18, $f28
+
+ ADD $f25, $f26, $f20
+ fmov $f20,$f26
+ SUB $f27, $f28, $f20
+ fmov $f20,$f28
+
+ ST $f26, 0*SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST $f28, 0*SIZE(YY)
+ SXADDQ INCY, YY, YY
+
+
+ LD $f12, 0*SIZE(X)
+ SXADDQ INCX, X, X
+ LD $f13, 0*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ LD $f14, 0*SIZE(X)
+ SXADDQ INCX, X, X
+ LD $f15, 0*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ LD $f16, 0*SIZE(X)
+ SXADDQ INCX, X, X
+ LD $f17, 0*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ LD $f18, 0*SIZE(X)
+ SXADDQ INCX, X, X
+ LD $f19, 0*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ MUL C, $f12, $f21
+ MUL S, $f13, $f22
+ MUL C, $f13, $f23
+ MUL S, $f12, $f24
+
+ ADD $f21, $f22, $f20
+ fmov $f20,$f22
+ SUB $f23, $f24, $f20
+ fmov $f20,$f24
+
+ ST $f22, 0*SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST $f24, 0*SIZE(YY)
+ SXADDQ INCY, YY, YY
+
+ MUL C, $f14, $f25
+ MUL S, $f15, $f26
+ MUL C, $f15, $f27
+ MUL S, $f14, $f28
+
+ ADD $f25, $f26, $f20
+ fmov $f20,$f26
+ SUB $f27, $f28, $f20
+ fmov $f20,$f28
+
+ ST $f26, 0*SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST $f28, 0*SIZE(YY)
+ SXADDQ INCY, YY, YY
+
+ MUL C, $f16, $f21
+ MUL S, $f17, $f22
+ MUL C, $f17, $f23
+ MUL S, $f16, $f24
+
+ ADD $f21, $f22, $f20
+ fmov $f20,$f22
+ SUB $f23, $f24, $f20
+ fmov $f20,$f24
+
+ ST $f22, 0*SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST $f24, 0*SIZE(YY)
+ SXADDQ INCY, YY, YY
+
+ MUL C, $f18, $f25
+ MUL S, $f19, $f26
+ MUL C, $f19, $f27
+ MUL S, $f18, $f28
+
+ ADD $f25, $f26, $f20
+ fmov $f20,$f26
+ SUB $f27, $f28, $f20
+ fmov $f20,$f28
+
+ ST $f26, 0*SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST $f28, 0*SIZE(YY)
+ SXADDQ INCY, YY, YY
+
+ ldi I, -1(I)
+ bgt I, $L51
+ .align 4
+
+$L55:
+ and N, 7, I
+ ble I, $L999
+ .align 4
+
+$L56:
+ LD $f12, 0*SIZE(X)
+ LD $f13, 0*SIZE(Y)
+
+ MUL C, $f12, $f21
+ MUL S, $f13, $f22
+ MUL C, $f13, $f23
+ MUL S, $f12, $f24
+
+ ADD $f21, $f22, $f25
+ SUB $f23, $f24, $f26
+ ldi I, -1(I)
+
+ ST $f25, 0*SIZE(X)
+ SXADDQ INCX, X, X
+ ST $f26, 0*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ bgt I, $L56
+ .align 4
+
+$L999:
+ fldd $f20, 8($sp)
+ ldi $sp, 16($sp)
+
+ clr $0
+# fldd $f20, 8($sp)
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/rot.S.bak b/kernel/sw_64/rot.S.bak
new file mode 100644
index 0000000..62e9ff9
--- /dev/null
+++ b/kernel/sw_64/rot.S.bak
@@ -0,0 +1,624 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define N $16
+#define X $17
+#define INCX $18
+#define Y $19
+#define INCY $20
+#define I $21
+#define XX $23
+#define YY $24
+
+#define C $f10
+#define S $f11
+
+#define PREFETCH_SIZE 80
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, 0, $26, 0
+
+#ifndef PROFILE
+ .prologue 0
+#else
+ .prologue 1
+#endif
+
+ fmov $f21, C
+ LD S, 0($sp)
+
+ cmpeq INCX, 1, $23
+ cmpeq INCY, 1, $24
+ ble N, $L998
+
+ and $23, $24, $23
+ beq $23, $L50
+
+ sra N, 3, I
+ ble I, $L15
+
+ LD $f12, 0*SIZE(X)
+ LD $f13, 0*SIZE(Y)
+ LD $f14, 1*SIZE(X)
+ LD $f15, 1*SIZE(Y)
+
+ LD $f16, 2*SIZE(X)
+ LD $f17, 2*SIZE(Y)
+ LD $f18, 3*SIZE(X)
+ LD $f19, 3*SIZE(Y)
+
+ MUL C, $f12, $f21
+ unop
+ MUL S, $f13, $f22
+ MUL C, $f13, $f23
+
+ LD $f13, 4*SIZE(Y)
+ MUL S, $f12, $f24
+ LD $f12, 4*SIZE(X)
+ MUL C, $f14, $f25
+
+ ldi I, -1(I)
+ MUL S, $f15, $f26
+ ADD $f21, $f22, $f22
+ MUL C, $f15, $f27
+
+ LD $f15, 5*SIZE(Y)
+ MUL S, $f14, $f28
+ SUB $f23, $f24, $f24
+ ble I, $L13
+ .align 4
+
+$L12:
+ MUL C, $f16, $f21
+ fillcs (PREFETCH_SIZE) * SIZE(X)
+ unop
+ LD $f14, 5*SIZE(X)
+
+ ST $f22, 0*SIZE(X)
+ MUL S, $f17, $f22
+ unop
+ ADD $f25, $f26, $f26
+
+ MUL C, $f17, $f23
+ fillcs (PREFETCH_SIZE) * SIZE(Y)
+ unop
+ LD $f17, 6*SIZE(Y)
+
+ ST $f24, 0*SIZE(Y)
+ MUL S, $f16, $f24
+ unop
+ SUB $f27, $f28, $f28
+
+ MUL C, $f18, $f25
+ LD $f16, 6*SIZE(X)
+ unop
+ unop
+
+ ST $f26, 1*SIZE(X)
+ MUL S, $f19, $f26
+ unop
+ ADD $f21, $f22, $f22
+
+ MUL C, $f19, $f27
+ unop
+ unop
+ LD $f19, 7*SIZE(Y)
+
+ ST $f28, 1*SIZE(Y)
+ MUL S, $f18, $f28
+ unop
+ SUB $f23, $f24, $f24
+
+ MUL C, $f12, $f21
+ LD $f18, 7*SIZE(X)
+ unop
+ unop
+
+ ST $f22, 2*SIZE(X)
+ unop
+ MUL S, $f13, $f22
+ ADD $f25, $f26, $f26
+
+ MUL C, $f13, $f23
+ LD $f13, 8*SIZE(Y)
+ unop
+ unop
+
+ ST $f24, 2*SIZE(Y)
+ MUL S, $f12, $f24
+ unop
+ SUB $f27, $f28, $f28
+
+ MUL C, $f14, $f25
+ LD $f12, 8*SIZE(X)
+ unop
+ unop
+
+ ST $f26, 3*SIZE(X)
+ MUL S, $f15, $f26
+ unop
+ ADD $f21, $f22, $f22
+
+ MUL C, $f15, $f27
+ LD $f15, 9*SIZE(Y)
+ unop
+ unop
+
+ ST $f28, 3*SIZE(Y)
+ MUL S, $f14, $f28
+ unop
+ SUB $f23, $f24, $f24
+
+ MUL C, $f16, $f21
+ LD $f14, 9*SIZE(X)
+ unop
+ unop
+
+ ST $f22, 4*SIZE(X)
+ MUL S, $f17, $f22
+ unop
+ ADD $f25, $f26, $f26
+
+ MUL C, $f17, $f23
+ LD $f17, 10*SIZE(Y)
+ unop
+ unop
+
+ ST $f24, 4*SIZE(Y)
+ MUL S, $f16, $f24
+ unop
+ SUB $f27, $f28, $f28
+
+ MUL C, $f18, $f25
+ LD $f16, 10*SIZE(X)
+ unop
+ unop
+
+ ST $f26, 5*SIZE(X)
+ MUL S, $f19, $f26
+ unop
+ ADD $f21, $f22, $f22
+
+ MUL C, $f19, $f27
+ LD $f19, 11*SIZE(Y)
+ unop
+ unop
+
+ ST $f28, 5*SIZE(Y)
+ MUL S, $f18, $f28
+ ldi I, -1(I)
+ SUB $f23, $f24, $f24
+
+ MUL C, $f12, $f21
+ LD $f18, 11*SIZE(X)
+ unop
+ unop
+
+ ST $f22, 6*SIZE(X)
+ MUL S, $f13, $f22
+ unop
+ ADD $f25, $f26, $f26
+
+ MUL C, $f13, $f23
+ LD $f13, 12*SIZE(Y)
+ ldi X, 8*SIZE(X)
+ unop
+
+ ST $f24, 6*SIZE(Y)
+ MUL S, $f12, $f24
+ unop
+ SUB $f27, $f28, $f28
+
+ MUL C, $f14, $f25
+ LD $f12, 4*SIZE(X)
+ ldi Y, 8*SIZE(Y)
+ unop
+
+ ST $f26, -1*SIZE(X)
+ MUL S, $f15, $f26
+ unop
+ ADD $f21, $f22, $f22
+
+ MUL C, $f15, $f27
+ LD $f15, 5*SIZE(Y)
+ unop
+ unop
+
+ ST $f28, -1*SIZE(Y)
+ MUL S, $f14, $f28
+ SUB $f23, $f24, $f24
+ bgt I, $L12
+ .align 4
+
+$L13:
+ MUL C, $f16, $f21
+ LD $f14, 5*SIZE(X)
+ unop
+ unop
+
+ ST $f22, 0*SIZE(X)
+ MUL S, $f17, $f22
+ unop
+ ADD $f25, $f26, $f26
+
+ MUL C, $f17, $f23
+ unop
+ unop
+ LD $f17, 6*SIZE(Y)
+
+ ST $f24, 0*SIZE(Y)
+ MUL S, $f16, $f24
+ LD $f16, 6*SIZE(X)
+ SUB $f27, $f28, $f28
+
+ MUL C, $f18, $f25
+ unop
+ unop
+ unop
+
+ ST $f26, 1*SIZE(X)
+ MUL S, $f19, $f26
+ unop
+ ADD $f21, $f22, $f22
+
+ MUL C, $f19, $f27
+ unop
+ unop
+ LD $f19, 7*SIZE(Y)
+
+ ST $f28, 1*SIZE(Y)
+ MUL S, $f18, $f28
+ LD $f18, 7*SIZE(X)
+ SUB $f23, $f24, $f24
+
+ MUL C, $f12, $f21
+ unop
+ unop
+ unop
+
+ ST $f22, 2*SIZE(X)
+ unop
+ MUL S, $f13, $f22
+ ADD $f25, $f26, $f26
+
+ MUL C, $f13, $f23
+ unop
+ unop
+ unop
+
+ ST $f24, 2*SIZE(Y)
+ MUL S, $f12, $f24
+ unop
+ SUB $f27, $f28, $f28
+
+ MUL C, $f14, $f25
+ unop
+ unop
+ unop
+
+ ST $f26, 3*SIZE(X)
+ MUL S, $f15, $f26
+ unop
+ ADD $f21, $f22, $f22
+
+ MUL C, $f15, $f27
+ unop
+ unop
+ unop
+
+ ST $f28, 3*SIZE(Y)
+ MUL S, $f14, $f28
+ unop
+ SUB $f23, $f24, $f24
+
+ MUL C, $f16, $f21
+ unop
+ unop
+ unop
+
+ ST $f22, 4*SIZE(X)
+ MUL S, $f17, $f22
+ unop
+ ADD $f25, $f26, $f26
+
+ MUL C, $f17, $f23
+ unop
+ unop
+ unop
+
+ ST $f24, 4*SIZE(Y)
+ MUL S, $f16, $f24
+ unop
+ SUB $f27, $f28, $f28
+
+ MUL C, $f18, $f25
+ unop
+ unop
+ unop
+
+ ST $f26, 5*SIZE(X)
+ MUL S, $f19, $f26
+ unop
+ ADD $f21, $f22, $f22
+
+ MUL C, $f19, $f27
+ unop
+ unop
+ unop
+
+ ST $f28, 5*SIZE(Y)
+ MUL S, $f18, $f28
+ unop
+ SUB $f23, $f24, $f24
+
+ ST $f22, 6*SIZE(X)
+ ADD $f25, $f26, $f26
+ ST $f24, 6*SIZE(Y)
+ SUB $f27, $f28, $f28
+
+ ST $f26, 7*SIZE(X)
+ ldi X, 8*SIZE(X)
+ ST $f28, 7*SIZE(Y)
+ ldi Y, 8*SIZE(Y)
+ .align 4
+
+
+$L15:
+ and N, 7, I
+ ble I, $L998
+ .align 4
+
+$L16:
+ LD $f12, 0*SIZE(X)
+ LD $f13, 0*SIZE(Y)
+
+ MUL C, $f12, $f21
+ MUL S, $f13, $f22
+ MUL C, $f13, $f23
+ MUL S, $f12, $f24
+
+ ADD $f21, $f22, $f25
+ SUB $f23, $f24, $f26
+ ldi I, -1(I)
+
+ ST $f25, 0*SIZE(X)
+ ldi X, 1 * SIZE(X)
+ ST $f26, 0*SIZE(Y)
+ ldi Y, 1 * SIZE(Y)
+
+ bgt I, $L16
+ .align 4
+
+$L998:
+ clr $0
+ ret
+ .align 4
+
+$L50:
+ mov X, XX
+ mov Y, YY
+
+ sra N, 3, I
+ ble I, $L55
+ .align 4
+
+$L51:
+ LD $f12, 0*SIZE(X)
+ SXADDQ INCX, X, X
+ LD $f13, 0*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ LD $f14, 0*SIZE(X)
+ SXADDQ INCX, X, X
+ LD $f15, 0*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ LD $f16, 0*SIZE(X)
+ SXADDQ INCX, X, X
+ LD $f17, 0*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ LD $f18, 0*SIZE(X)
+ SXADDQ INCX, X, X
+ LD $f19, 0*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ MUL C, $f12, $f21
+ MUL S, $f13, $f22
+ MUL C, $f13, $f23
+ MUL S, $f12, $f24
+
+ ADD $f21, $f22, $f22
+ SUB $f23, $f24, $f24
+
+ ST $f22, 0*SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST $f24, 0*SIZE(YY)
+ SXADDQ INCY, YY, YY
+
+ MUL C, $f14, $f25
+ MUL S, $f15, $f26
+ MUL C, $f15, $f27
+ MUL S, $f14, $f28
+
+ ADD $f25, $f26, $f26
+ SUB $f27, $f28, $f28
+
+ ST $f26, 0*SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST $f28, 0*SIZE(YY)
+ SXADDQ INCY, YY, YY
+
+ MUL C, $f16, $f21
+ MUL S, $f17, $f22
+ MUL C, $f17, $f23
+ MUL S, $f16, $f24
+
+ ADD $f21, $f22, $f22
+ SUB $f23, $f24, $f24
+
+ ST $f22, 0*SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST $f24, 0*SIZE(YY)
+ SXADDQ INCY, YY, YY
+
+ MUL C, $f18, $f25
+ MUL S, $f19, $f26
+ MUL C, $f19, $f27
+ MUL S, $f18, $f28
+
+ ADD $f25, $f26, $f26
+ SUB $f27, $f28, $f28
+
+ ST $f26, 0*SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST $f28, 0*SIZE(YY)
+ SXADDQ INCY, YY, YY
+
+
+ LD $f12, 0*SIZE(X)
+ SXADDQ INCX, X, X
+ LD $f13, 0*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ LD $f14, 0*SIZE(X)
+ SXADDQ INCX, X, X
+ LD $f15, 0*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ LD $f16, 0*SIZE(X)
+ SXADDQ INCX, X, X
+ LD $f17, 0*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ LD $f18, 0*SIZE(X)
+ SXADDQ INCX, X, X
+ LD $f19, 0*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ MUL C, $f12, $f21
+ MUL S, $f13, $f22
+ MUL C, $f13, $f23
+ MUL S, $f12, $f24
+
+ ADD $f21, $f22, $f22
+ SUB $f23, $f24, $f24
+
+ ST $f22, 0*SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST $f24, 0*SIZE(YY)
+ SXADDQ INCY, YY, YY
+
+ MUL C, $f14, $f25
+ MUL S, $f15, $f26
+ MUL C, $f15, $f27
+ MUL S, $f14, $f28
+
+ ADD $f25, $f26, $f26
+ SUB $f27, $f28, $f28
+
+ ST $f26, 0*SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST $f28, 0*SIZE(YY)
+ SXADDQ INCY, YY, YY
+
+ MUL C, $f16, $f21
+ MUL S, $f17, $f22
+ MUL C, $f17, $f23
+ MUL S, $f16, $f24
+
+ ADD $f21, $f22, $f22
+ SUB $f23, $f24, $f24
+
+ ST $f22, 0*SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST $f24, 0*SIZE(YY)
+ SXADDQ INCY, YY, YY
+
+ MUL C, $f18, $f25
+ MUL S, $f19, $f26
+ MUL C, $f19, $f27
+ MUL S, $f18, $f28
+
+ ADD $f25, $f26, $f26
+ SUB $f27, $f28, $f28
+
+ ST $f26, 0*SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST $f28, 0*SIZE(YY)
+ SXADDQ INCY, YY, YY
+
+ ldi I, -1(I)
+ bgt I, $L51
+ .align 4
+
+$L55:
+ and N, 7, I
+ ble I, $L999
+ .align 4
+
+$L56:
+ LD $f12, 0*SIZE(X)
+ LD $f13, 0*SIZE(Y)
+
+ MUL C, $f12, $f21
+ MUL S, $f13, $f22
+ MUL C, $f13, $f23
+ MUL S, $f12, $f24
+
+ ADD $f21, $f22, $f25
+ SUB $f23, $f24, $f26
+ ldi I, -1(I)
+
+ ST $f25, 0*SIZE(X)
+ SXADDQ INCX, X, X
+ ST $f26, 0*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ bgt I, $L56
+ .align 4
+
+$L999:
+ clr $0
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/rot_simd.S b/kernel/sw_64/rot_simd.S
new file mode 100644
index 0000000..99f3e05
--- /dev/null
+++ b/kernel/sw_64/rot_simd.S
@@ -0,0 +1,783 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define N $16
+#define X $17
+#define INCX $18
+#define Y $19
+#define INCY $20
+#define I $21
+#define XX $23
+#define YY $24
+
+#define C $f10
+#define S $f11
+
+#define x0 $f12
+#define x1 $f14
+#define x2 $f16
+#define x3 $f18
+
+#define y0 $f13
+#define y1 $f15
+#define y2 $f17
+#define y3 $f19
+
+#define t0 $f20
+#define t1 $f21
+#define t2 $f22
+#define t3 $f23
+#define t4 $f24
+#define t5 $f25
+#define t6 $f26
+#define t7 $f27
+
+#define PREFETCHSIZE 80
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, 0, $26, 0
+
+#ifndef PROFILE
+ .prologue 0
+#else
+ .prologue 1
+#endif
+
+ fmov $f21, C
+ LD S, 0($sp)
+
+ cmpeq INCX, 1, $23
+ cmpeq INCY, 1, $24
+ ble N, $L998
+
+ and $23, $24, $23
+	beq $23, $L50 # incx != 1 or incy != 1
+
+/* test the addresses of X and Y */
+ and X, (VEC_LEN*SIZE-1), $3
+ and Y, (VEC_LEN*SIZE-1), $4
+ or $3, $4, $4
+ bne $4, $UnAlign_ACCESS
+
+/* Aligned access */
+ sra N, 4, I
+ ble I, $Remain
+
+ vcpyf C, C
+ vcpyf S, S
+
+ VLD x0, 0*VEC_LEN*SIZE(X)
+ VLD x1, 1*VEC_LEN*SIZE(X)
+ VLD x2, 2*VEC_LEN*SIZE(X)
+ VLD x3, 3*VEC_LEN*SIZE(X)
+
+ VLD y0, 0*VEC_LEN*SIZE(Y)
+ VLD y1, 1*VEC_LEN*SIZE(Y)
+ VLD y2, 2*VEC_LEN*SIZE(Y)
+ VLD y3, 3*VEC_LEN*SIZE(Y)
+
+ addl X, 16 * SIZE, X
+ addl Y, 16 * SIZE, Y
+ subl I, 1, I
+ ble I, $MainLoopEnd
+ .align 4
+$MainLoop:
+ VMUL C, x0, t0
+ fillcs (PREFETCHSIZE) * SIZE(X)
+ VMUL C, x1, t1
+ fillcs (PREFETCHSIZE) * SIZE(Y)
+
+ VMUL C, x2, t2
+ subl I, 1, I
+ VMUL C, x3, t3
+ nop
+
+ VMUL S, x0, t4
+ VLD x0, 0*VEC_LEN*SIZE(X)
+ VMUL S, x1, t5
+ VLD x1, 1*VEC_LEN*SIZE(X)
+
+ VMUL S, x2, t6
+ VLD x2, 2*VEC_LEN*SIZE(X)
+ VMUL S, x3, t7
+ VLD x3, 3*VEC_LEN*SIZE(X)
+
+ VMAD S, y0, t0, t0
+ VMAD S, y1, t1, t1
+ VMAD S, y2, t2, t2
+ VMAD S, y3, t3, t3
+
+ VMSUB C, y0, t4, t4
+ VLD y0, 0*VEC_LEN*SIZE(Y)
+ VMSUB C, y1, t5, t5
+ VLD y1, 1*VEC_LEN*SIZE(Y)
+
+ VMSUB C, y2, t6, t6
+ VLD y2, 2*VEC_LEN*SIZE(Y)
+ VMSUB C, y3, t7, t7
+ VLD y3, 3*VEC_LEN*SIZE(Y)
+
+ VST t0, -4*VEC_LEN*SIZE(X)
+ VST t1, -3*VEC_LEN*SIZE(X)
+ VST t2, -2*VEC_LEN*SIZE(X)
+ VST t3, -1*VEC_LEN*SIZE(X)
+
+ VST t4, -4*VEC_LEN*SIZE(Y)
+ VST t5, -3*VEC_LEN*SIZE(Y)
+ VST t6, -2*VEC_LEN*SIZE(Y)
+ VST t7, -1*VEC_LEN*SIZE(Y)
+
+ addl X, 16 * SIZE, X
+ addl Y, 16 * SIZE, Y
+ nop
+ bgt I, $MainLoop
+ .align 4
+$MainLoopEnd:
+ VMUL C, x0, t0
+ VMUL C, x1, t1
+ VMUL C, x2, t2
+ VMUL C, x3, t3
+
+ VMUL S, x0, t4
+ VMUL S, x1, t5
+ VMUL S, x2, t6
+ VMUL S, x3, t7
+
+ VMAD S, y0, t0, t0
+ VMAD S, y1, t1, t1
+ VMAD S, y2, t2, t2
+ VMAD S, y3, t3, t3
+
+ VMSUB C, y0, t4, t4
+ VMSUB C, y1, t5, t5
+ VMSUB C, y2, t6, t6
+ VMSUB C, y3, t7, t7
+
+ VST t0, -4*VEC_LEN*SIZE(X)
+ VST t1, -3*VEC_LEN*SIZE(X)
+ VST t2, -2*VEC_LEN*SIZE(X)
+ VST t3, -1*VEC_LEN*SIZE(X)
+
+ VST t4, -4*VEC_LEN*SIZE(Y)
+ VST t5, -3*VEC_LEN*SIZE(Y)
+ VST t6, -2*VEC_LEN*SIZE(Y)
+ VST t7, -1*VEC_LEN*SIZE(Y)
+
+ .align 4
+$Remain:
+ and N, 15, I
+ ble I, $End
+$RemainLoop:
+ LD $f12, 0*SIZE(X)
+ LD $f13, 0*SIZE(Y)
+
+ MUL C, $f12, $f21
+ MUL S, $f12, $f24
+ MAD S, $f13, $f21, $f25
+ MSUB C, $f13, $f24, $f26
+
+
+ ldi I, -1(I)
+ ST $f25, 0*SIZE(X)
+ ldi X, 1 * SIZE(X)
+ ST $f26, 0*SIZE(Y)
+
+ ldi Y, 1 * SIZE(Y)
+ bgt I, $RemainLoop
+
+ .align 4
+$End:
+ clr $0
+ ret
+ .align 4
+
+$UnAlign_ACCESS:
+
+ sra N, 3, I
+ ble I, $L15
+
+ LD $f12, 0*SIZE(X)
+ LD $f13, 0*SIZE(Y)
+ LD $f14, 1*SIZE(X)
+ LD $f15, 1*SIZE(Y)
+
+ LD $f16, 2*SIZE(X)
+ LD $f17, 2*SIZE(Y)
+ LD $f18, 3*SIZE(X)
+ LD $f19, 3*SIZE(Y)
+
+ MUL C, $f12, $f21
+ unop
+ MUL S, $f13, $f22
+ MUL C, $f13, $f23
+
+ LD $f13, 4*SIZE(Y)
+ MUL S, $f12, $f24
+ LD $f12, 4*SIZE(X)
+ MUL C, $f14, $f25
+
+ ldi I, -1(I)
+ MUL S, $f15, $f26
+ ADD $f21, $f22, $f22
+ MUL C, $f15, $f27
+
+ LD $f15, 5*SIZE(Y)
+ MUL S, $f14, $f28
+ SUB $f23, $f24, $f24
+ ble I, $L13
+ .align 4
+
+$L12:
+ MUL C, $f16, $f21
+ fillcs (PREFETCHSIZE) * SIZE(X)
+ unop
+ LD $f14, 5*SIZE(X)
+
+ ST $f22, 0*SIZE(X)
+ MUL S, $f17, $f22
+ unop
+ ADD $f25, $f26, $f26
+
+ MUL C, $f17, $f23
+ fillcs (PREFETCHSIZE) * SIZE(Y)
+ unop
+ LD $f17, 6*SIZE(Y)
+
+ ST $f24, 0*SIZE(Y)
+ MUL S, $f16, $f24
+ unop
+ SUB $f27, $f28, $f28
+
+ MUL C, $f18, $f25
+ LD $f16, 6*SIZE(X)
+ unop
+ unop
+
+ ST $f26, 1*SIZE(X)
+ MUL S, $f19, $f26
+ unop
+ ADD $f21, $f22, $f22
+
+ MUL C, $f19, $f27
+ unop
+ unop
+ LD $f19, 7*SIZE(Y)
+
+ ST $f28, 1*SIZE(Y)
+ MUL S, $f18, $f28
+ unop
+ SUB $f23, $f24, $f24
+
+ MUL C, $f12, $f21
+ LD $f18, 7*SIZE(X)
+ unop
+ unop
+
+ ST $f22, 2*SIZE(X)
+ unop
+ MUL S, $f13, $f22
+ ADD $f25, $f26, $f26
+
+ MUL C, $f13, $f23
+ LD $f13, 8*SIZE(Y)
+ unop
+ unop
+
+ ST $f24, 2*SIZE(Y)
+ MUL S, $f12, $f24
+ unop
+ SUB $f27, $f28, $f28
+
+ MUL C, $f14, $f25
+ LD $f12, 8*SIZE(X)
+ unop
+ unop
+
+ ST $f26, 3*SIZE(X)
+ MUL S, $f15, $f26
+ unop
+ ADD $f21, $f22, $f22
+
+ MUL C, $f15, $f27
+ LD $f15, 9*SIZE(Y)
+ unop
+ unop
+
+ ST $f28, 3*SIZE(Y)
+ MUL S, $f14, $f28
+ unop
+ SUB $f23, $f24, $f24
+
+ MUL C, $f16, $f21
+ LD $f14, 9*SIZE(X)
+ unop
+ unop
+
+ ST $f22, 4*SIZE(X)
+ MUL S, $f17, $f22
+ unop
+ ADD $f25, $f26, $f26
+
+ MUL C, $f17, $f23
+ LD $f17, 10*SIZE(Y)
+ unop
+ unop
+
+ ST $f24, 4*SIZE(Y)
+ MUL S, $f16, $f24
+ unop
+ SUB $f27, $f28, $f28
+
+ MUL C, $f18, $f25
+ LD $f16, 10*SIZE(X)
+ unop
+ unop
+
+ ST $f26, 5*SIZE(X)
+ MUL S, $f19, $f26
+ unop
+ ADD $f21, $f22, $f22
+
+ MUL C, $f19, $f27
+ LD $f19, 11*SIZE(Y)
+ unop
+ unop
+
+ ST $f28, 5*SIZE(Y)
+ MUL S, $f18, $f28
+ ldi I, -1(I)
+ SUB $f23, $f24, $f24
+
+ MUL C, $f12, $f21
+ LD $f18, 11*SIZE(X)
+ unop
+ unop
+
+ ST $f22, 6*SIZE(X)
+ MUL S, $f13, $f22
+ unop
+ ADD $f25, $f26, $f26
+
+ MUL C, $f13, $f23
+ LD $f13, 12*SIZE(Y)
+ ldi X, 8*SIZE(X)
+ unop
+
+ ST $f24, 6*SIZE(Y)
+ MUL S, $f12, $f24
+ unop
+ SUB $f27, $f28, $f28
+
+ MUL C, $f14, $f25
+ LD $f12, 4*SIZE(X)
+ ldi Y, 8*SIZE(Y)
+ unop
+
+ ST $f26, -1*SIZE(X)
+ MUL S, $f15, $f26
+ unop
+ ADD $f21, $f22, $f22
+
+ MUL C, $f15, $f27
+ LD $f15, 5*SIZE(Y)
+ unop
+ unop
+
+ ST $f28, -1*SIZE(Y)
+ MUL S, $f14, $f28
+ SUB $f23, $f24, $f24
+ bgt I, $L12
+ .align 4
+
+$L13:
+ MUL C, $f16, $f21
+ LD $f14, 5*SIZE(X)
+ unop
+ unop
+
+ ST $f22, 0*SIZE(X)
+ MUL S, $f17, $f22
+ unop
+ ADD $f25, $f26, $f26
+
+ MUL C, $f17, $f23
+ unop
+ unop
+ LD $f17, 6*SIZE(Y)
+
+ ST $f24, 0*SIZE(Y)
+ MUL S, $f16, $f24
+ LD $f16, 6*SIZE(X)
+ SUB $f27, $f28, $f28
+
+ MUL C, $f18, $f25
+ unop
+ unop
+ unop
+
+ ST $f26, 1*SIZE(X)
+ MUL S, $f19, $f26
+ unop
+ ADD $f21, $f22, $f22
+
+ MUL C, $f19, $f27
+ unop
+ unop
+ LD $f19, 7*SIZE(Y)
+
+ ST $f28, 1*SIZE(Y)
+ MUL S, $f18, $f28
+ LD $f18, 7*SIZE(X)
+ SUB $f23, $f24, $f24
+
+ MUL C, $f12, $f21
+ unop
+ unop
+ unop
+
+ ST $f22, 2*SIZE(X)
+ unop
+ MUL S, $f13, $f22
+ ADD $f25, $f26, $f26
+
+ MUL C, $f13, $f23
+ unop
+ unop
+ unop
+
+ ST $f24, 2*SIZE(Y)
+ MUL S, $f12, $f24
+ unop
+ SUB $f27, $f28, $f28
+
+ MUL C, $f14, $f25
+ unop
+ unop
+ unop
+
+ ST $f26, 3*SIZE(X)
+ MUL S, $f15, $f26
+ unop
+ ADD $f21, $f22, $f22
+
+ MUL C, $f15, $f27
+ unop
+ unop
+ unop
+
+ ST $f28, 3*SIZE(Y)
+ MUL S, $f14, $f28
+ unop
+ SUB $f23, $f24, $f24
+
+ MUL C, $f16, $f21
+ unop
+ unop
+ unop
+
+ ST $f22, 4*SIZE(X)
+ MUL S, $f17, $f22
+ unop
+ ADD $f25, $f26, $f26
+
+ MUL C, $f17, $f23
+ unop
+ unop
+ unop
+
+ ST $f24, 4*SIZE(Y)
+ MUL S, $f16, $f24
+ unop
+ SUB $f27, $f28, $f28
+
+ MUL C, $f18, $f25
+ unop
+ unop
+ unop
+
+ ST $f26, 5*SIZE(X)
+ MUL S, $f19, $f26
+ unop
+ ADD $f21, $f22, $f22
+
+ MUL C, $f19, $f27
+ unop
+ unop
+ unop
+
+ ST $f28, 5*SIZE(Y)
+ MUL S, $f18, $f28
+ unop
+ SUB $f23, $f24, $f24
+
+ ST $f22, 6*SIZE(X)
+ ADD $f25, $f26, $f26
+ ST $f24, 6*SIZE(Y)
+ SUB $f27, $f28, $f28
+
+ ST $f26, 7*SIZE(X)
+ ldi X, 8*SIZE(X)
+ ST $f28, 7*SIZE(Y)
+ ldi Y, 8*SIZE(Y)
+ .align 4
+
+
+$L15:
+ and N, 7, I
+ ble I, $L998
+ .align 4
+
+$L16:
+ LD $f12, 0*SIZE(X)
+ LD $f13, 0*SIZE(Y)
+
+ MUL C, $f12, $f21
+ MUL S, $f13, $f22
+ MUL C, $f13, $f23
+ MUL S, $f12, $f24
+
+ ADD $f21, $f22, $f25
+ SUB $f23, $f24, $f26
+ ldi I, -1(I)
+
+ ST $f25, 0*SIZE(X)
+ ldi X, 1 * SIZE(X)
+ ST $f26, 0*SIZE(Y)
+ ldi Y, 1 * SIZE(Y)
+
+ bgt I, $L16
+ .align 4
+
+$L998:
+ clr $0
+ ret
+ .align 4
+
+$L50:
+ mov X, XX
+ mov Y, YY
+
+ sra N, 3, I
+ ble I, $L55
+ .align 4
+
+$L51:
+ LD $f12, 0*SIZE(X)
+ SXADDQ INCX, X, X
+ LD $f13, 0*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ LD $f14, 0*SIZE(X)
+ SXADDQ INCX, X, X
+ LD $f15, 0*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ LD $f16, 0*SIZE(X)
+ SXADDQ INCX, X, X
+ LD $f17, 0*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ LD $f18, 0*SIZE(X)
+ SXADDQ INCX, X, X
+ LD $f19, 0*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ MUL C, $f12, $f21
+ MUL S, $f13, $f22
+ MUL C, $f13, $f23
+ MUL S, $f12, $f24
+
+ ADD $f21, $f22, $f22
+ SUB $f23, $f24, $f24
+
+ ST $f22, 0*SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST $f24, 0*SIZE(YY)
+ SXADDQ INCY, YY, YY
+
+ MUL C, $f14, $f25
+ MUL S, $f15, $f26
+ MUL C, $f15, $f27
+ MUL S, $f14, $f28
+
+ ADD $f25, $f26, $f26
+ SUB $f27, $f28, $f28
+
+ ST $f26, 0*SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST $f28, 0*SIZE(YY)
+ SXADDQ INCY, YY, YY
+
+ MUL C, $f16, $f21
+ MUL S, $f17, $f22
+ MUL C, $f17, $f23
+ MUL S, $f16, $f24
+
+ ADD $f21, $f22, $f22
+ SUB $f23, $f24, $f24
+
+ ST $f22, 0*SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST $f24, 0*SIZE(YY)
+ SXADDQ INCY, YY, YY
+
+ MUL C, $f18, $f25
+ MUL S, $f19, $f26
+ MUL C, $f19, $f27
+ MUL S, $f18, $f28
+
+ ADD $f25, $f26, $f26
+ SUB $f27, $f28, $f28
+
+ ST $f26, 0*SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST $f28, 0*SIZE(YY)
+ SXADDQ INCY, YY, YY
+
+
+ LD $f12, 0*SIZE(X)
+ SXADDQ INCX, X, X
+ LD $f13, 0*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ LD $f14, 0*SIZE(X)
+ SXADDQ INCX, X, X
+ LD $f15, 0*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ LD $f16, 0*SIZE(X)
+ SXADDQ INCX, X, X
+ LD $f17, 0*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ LD $f18, 0*SIZE(X)
+ SXADDQ INCX, X, X
+ LD $f19, 0*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ MUL C, $f12, $f21
+ MUL S, $f13, $f22
+ MUL C, $f13, $f23
+ MUL S, $f12, $f24
+
+ ADD $f21, $f22, $f22
+ SUB $f23, $f24, $f24
+
+ ST $f22, 0*SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST $f24, 0*SIZE(YY)
+ SXADDQ INCY, YY, YY
+
+ MUL C, $f14, $f25
+ MUL S, $f15, $f26
+ MUL C, $f15, $f27
+ MUL S, $f14, $f28
+
+ ADD $f25, $f26, $f26
+ SUB $f27, $f28, $f28
+
+ ST $f26, 0*SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST $f28, 0*SIZE(YY)
+ SXADDQ INCY, YY, YY
+
+ MUL C, $f16, $f21
+ MUL S, $f17, $f22
+ MUL C, $f17, $f23
+ MUL S, $f16, $f24
+
+ ADD $f21, $f22, $f22
+ SUB $f23, $f24, $f24
+
+ ST $f22, 0*SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST $f24, 0*SIZE(YY)
+ SXADDQ INCY, YY, YY
+
+ MUL C, $f18, $f25
+ MUL S, $f19, $f26
+ MUL C, $f19, $f27
+ MUL S, $f18, $f28
+
+ ADD $f25, $f26, $f26
+ SUB $f27, $f28, $f28
+
+ ST $f26, 0*SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST $f28, 0*SIZE(YY)
+ SXADDQ INCY, YY, YY
+
+ ldi I, -1(I)
+ bgt I, $L51
+ .align 4
+
+$L55:
+ and N, 7, I
+ ble I, $L999
+ .align 4
+
+$L56:
+ LD $f12, 0*SIZE(X)
+ LD $f13, 0*SIZE(Y)
+
+ MUL C, $f12, $f21
+ MUL S, $f13, $f22
+ MUL C, $f13, $f23
+ MUL S, $f12, $f24
+
+ ADD $f21, $f22, $f25
+ SUB $f23, $f24, $f26
+ ldi I, -1(I)
+
+ ST $f25, 0*SIZE(X)
+ SXADDQ INCX, X, X
+ ST $f26, 0*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ bgt I, $L56
+ .align 4
+
+$L999:
+ clr $0
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/scal-sw.S.bak b/kernel/sw_64/scal-sw.S.bak
new file mode 100644
index 0000000..f8da324
--- /dev/null
+++ b/kernel/sw_64/scal-sw.S.bak
@@ -0,0 +1,480 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define PREFETCHSIZE 88
+
+#define N $16
+#define X $20
+#define INCX $21
+
+#define XX $18
+#define I $19
+
+#define ALPHA $f19
+
+#define s0 $f0
+#define s1 $f1
+#define s2 $f10
+#define s3 $f11
+
+#define a0 $f12
+#define a1 $f13
+#define a2 $f14
+#define a3 $f15
+#define a4 $f16
+#define a5 $f17
+#define a6 $f18
+#define a7 $f21
+
+#define t0 $f22
+#define t1 $f23
+#define t2 $f24
+#define t3 $f25
+
+ PROLOGUE
+ PROFCODE
+
+ mov X, XX
+ ble N, $L999
+
+ cmpeq INCX, 1, $0
+ beq $0, $L20
+
+#ifndef DOUBLE
+ sra N, 4, I
+ ble I, $L15
+
+ LD a0, 0 * SIZE(X)
+ LD a1, 1 * SIZE(X)
+ LD a2, 2 * SIZE(X)
+ LD a3, 3 * SIZE(X)
+
+ LD a4, 4 * SIZE(X)
+ MUL a0, ALPHA, t0
+ LD a5, 5 * SIZE(X)
+ MUL a1, ALPHA, t1
+ LD a6, 6 * SIZE(X)
+ MUL a2, ALPHA, t2
+ LD a7, 7 * SIZE(X)
+ MUL a3, ALPHA, t3
+
+ ST t0, 0 * SIZE(X)
+ MUL a4, ALPHA, t0
+ ST t1, 1 * SIZE(X)
+ MUL a5, ALPHA, t1
+
+ ST t2, 2 * SIZE(X)
+ MUL a6, ALPHA, t2
+ ST t3, 3 * SIZE(X)
+ MUL a7, ALPHA, t3
+
+ LD a0, 8 * SIZE(X)
+ LD a1, 9 * SIZE(X)
+ LD a2, 10 * SIZE(X)
+ LD a3, 11 * SIZE(X)
+
+ ST t0, 4 * SIZE(X)
+ MUL a0, ALPHA, t0
+ ST t1, 5 * SIZE(X)
+ MUL a1, ALPHA, t1
+
+ ST t2, 6 * SIZE(X)
+ MUL a2, ALPHA, t2
+ ST t3, 7 * SIZE(X)
+ MUL a3, ALPHA, t3
+
+ LD a4, 12 * SIZE(X)
+ LD a5, 13 * SIZE(X)
+ LD a6, 14 * SIZE(X)
+ LD a7, 15 * SIZE(X)
+
+ ldi I, -1(I)
+ ble I, $L13
+ .align 4
+
+$L12:
+ ST t0, 8 * SIZE(X)
+ MUL a4, ALPHA, t0
+ ST t1, 9 * SIZE(X)
+ MUL a5, ALPHA, t1
+
+ ST t2, 10 * SIZE(X)
+ MUL a6, ALPHA, t2
+ ST t3, 11 * SIZE(X)
+ MUL a7, ALPHA, t3
+
+ LD a0, 16 * SIZE(X)
+ LD a1, 17 * SIZE(X)
+ LD a2, 18 * SIZE(X)
+ LD a3, 19 * SIZE(X)
+
+ ST t0, 12 * SIZE(X)
+ MUL a0, ALPHA, t0
+ ST t1, 13 * SIZE(X)
+ MUL a1, ALPHA, t1
+
+ ST t2, 14 * SIZE(X)
+ MUL a2, ALPHA, t2
+ ST t3, 15 * SIZE(X)
+ MUL a3, ALPHA, t3
+
+ LD a4, 20 * SIZE(X)
+ LD a5, 21 * SIZE(X)
+ LD a6, 22 * SIZE(X)
+ LD a7, 23 * SIZE(X)
+
+ ST t0, 16 * SIZE(X)
+ MUL a4, ALPHA, t0
+ ST t1, 17 * SIZE(X)
+ MUL a5, ALPHA, t1
+
+ ST t2, 18 * SIZE(X)
+ MUL a6, ALPHA, t2
+ ST t3, 19 * SIZE(X)
+ MUL a7, ALPHA, t3
+
+ LD a0, 24 * SIZE(X)
+ LD a1, 25 * SIZE(X)
+ LD a2, 26 * SIZE(X)
+ LD a3, 27 * SIZE(X)
+
+ ST t0, 20 * SIZE(X)
+ MUL a0, ALPHA, t0
+ ST t1, 21 * SIZE(X)
+ MUL a1, ALPHA, t1
+
+ ST t2, 22 * SIZE(X)
+ MUL a2, ALPHA, t2
+ ST t3, 23 * SIZE(X)
+ MUL a3, ALPHA, t3
+
+ LD a4, 28 * SIZE(X)
+ LD a5, 29 * SIZE(X)
+ LD a6, 30 * SIZE(X)
+ LD a7, 31 * SIZE(X)
+
+ fillcs PREFETCHSIZE * SIZE(X)
+ ldi I, -1(I)
+ addl X, 16 * SIZE, X
+ bne I, $L12
+ .align 4
+
+$L13:
+ ST t0, 8 * SIZE(X)
+ MUL a4, ALPHA, t0
+ ST t1, 9 * SIZE(X)
+ MUL a5, ALPHA, t1
+
+ ST t2, 10 * SIZE(X)
+ MUL a6, ALPHA, t2
+ ST t3, 11 * SIZE(X)
+ MUL a7, ALPHA, t3
+
+ ST t0, 12 * SIZE(X)
+ ST t1, 13 * SIZE(X)
+ ST t2, 14 * SIZE(X)
+ ST t3, 15 * SIZE(X)
+ addl X, 16 * SIZE, X
+ .align 4
+
+$L15:
+ and N, 15, I
+
+#else
+
+ sra N, 3, I
+ ble I, $L15
+
+ LD a0, 0 * SIZE(X)
+ LD a1, 1 * SIZE(X)
+ LD a2, 2 * SIZE(X)
+ LD a3, 3 * SIZE(X)
+
+ LD a4, 4 * SIZE(X)
+ MUL a0, ALPHA, t0
+ LD a5, 5 * SIZE(X)
+ MUL a1, ALPHA, t1
+
+ LD a6, 6 * SIZE(X)
+ MUL a2, ALPHA, t2
+ LD a7, 7 * SIZE(X)
+ MUL a3, ALPHA, t3
+
+ ldi I, -1(I)
+ ble I, $L13
+ .align 4
+
+$L12:
+ ST t0, 0 * SIZE(X)
+ MUL a4, ALPHA, t0
+ ST t1, 1 * SIZE(X)
+ MUL a5, ALPHA, t1
+
+ ST t2, 2 * SIZE(X)
+ MUL a6, ALPHA, t2
+ ST t3, 3 * SIZE(X)
+ MUL a7, ALPHA, t3
+
+ LD a0, 8 * SIZE(X)
+ ldi I, -1(I)
+ LD a1, 9 * SIZE(X)
+ addl X, 8 * SIZE, X
+
+ LD a2, 2 * SIZE(X)
+ LD a3, 3 * SIZE(X)
+
+ ST t0, -4 * SIZE(X)
+ MUL a0, ALPHA, t0
+ ST t1, -3 * SIZE(X)
+ MUL a1, ALPHA, t1
+
+ ST t2, -2 * SIZE(X)
+ MUL a2, ALPHA, t2
+ ST t3, -1 * SIZE(X)
+ MUL a3, ALPHA, t3
+
+ LD a4, 4 * SIZE(X)
+ LD a5, 5 * SIZE(X)
+
+ LD a6, 6 * SIZE(X)
+ LD a7, 7 * SIZE(X)
+ fillcs PREFETCHSIZE * SIZE(X)
+ bne I, $L12
+ .align 4
+
+$L13:
+ ST t0, 0 * SIZE(X)
+ MUL a4, ALPHA, t0
+ ST t1, 1 * SIZE(X)
+ MUL a5, ALPHA, t1
+
+ ST t2, 2 * SIZE(X)
+ MUL a6, ALPHA, t2
+ ST t3, 3 * SIZE(X)
+ MUL a7, ALPHA, t3
+
+ ST t0, 4 * SIZE(X)
+ ST t1, 5 * SIZE(X)
+ ST t2, 6 * SIZE(X)
+ ST t3, 7 * SIZE(X)
+ addl X, 8 * SIZE, X
+ .align 4
+
+$L15:
+ and N, 7, I
+
+#endif
+
+ unop
+ unop
+ ble I, $L999
+ .align 4
+
+$L17:
+ LD a0, 0 * SIZE(X)
+
+ MUL a0, ALPHA, t0
+
+ ST t0, 0 * SIZE(X)
+
+ addl X, SIZE, X
+
+ ldi I, -1(I)
+ bne I, $L17
+ ret
+ .align 4
+
+$L20:
+ sra N, 3, I
+ ble I, $L25
+
+ LD a0, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD a1, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD a2, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD a3, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ LD a4, 0 * SIZE(X)
+ MUL a0, ALPHA, t0
+ ldi I, -1(I)
+ SXADDQ INCX, X, X
+
+ LD a5, 0 * SIZE(X)
+ MUL a1, ALPHA, t1
+ SXADDQ INCX, X, X
+ unop
+
+ LD a6, 0 * SIZE(X)
+ MUL a2, ALPHA, t2
+ SXADDQ INCX, X, X
+ unop
+
+ LD a7, 0 * SIZE(X)
+ MUL a3, ALPHA, t3
+ SXADDQ INCX, X, X
+ ble I, $L23
+ .align 4
+
+$L22:
+ ST t0, 0 * SIZE(XX)
+ MUL a4, ALPHA, t0
+ fillcs PREFETCHSIZE * SIZE(X)
+ SXADDQ INCX, XX, XX
+
+ LD a0, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ ldi I, -1(I)
+ unop
+
+ ST t1, 0 * SIZE(XX)
+ MUL a5, ALPHA, t1
+ SXADDQ INCX, XX, XX
+ unop
+
+ LD a1, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ ST t2, 0 * SIZE(XX)
+ MUL a6, ALPHA, t2
+ SXADDQ INCX, XX, XX
+ unop
+
+ LD a2, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ ST t3, 0 * SIZE(XX)
+ MUL a7, ALPHA, t3
+ SXADDQ INCX, XX, XX
+ unop
+
+ LD a3, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ ST t0, 0 * SIZE(XX)
+ MUL a0, ALPHA, t0
+ SXADDQ INCX, XX, XX
+ unop
+
+ LD a4, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ ST t1, 0 * SIZE(XX)
+ MUL a1, ALPHA, t1
+ SXADDQ INCX, XX, XX
+ unop
+
+ LD a5, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ ST t2, 0 * SIZE(XX)
+ MUL a2, ALPHA, t2
+ SXADDQ INCX, XX, XX
+ unop
+
+ LD a6, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ ST t3, 0 * SIZE(XX)
+ MUL a3, ALPHA, t3
+ SXADDQ INCX, XX, XX
+ unop
+
+ LD a7, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ unop
+ bne I, $L22
+ .align 4
+
+$L23:
+ ST t0, 0 * SIZE(XX)
+ MUL a4, ALPHA, t0
+ SXADDQ INCX, XX, XX
+
+ ST t1, 0 * SIZE(XX)
+ MUL a5, ALPHA, t1
+ SXADDQ INCX, XX, XX
+
+ ST t2, 0 * SIZE(XX)
+ MUL a6, ALPHA, t2
+ SXADDQ INCX, XX, XX
+
+ ST t3, 0 * SIZE(XX)
+ MUL a7, ALPHA, t3
+ SXADDQ INCX, XX, XX
+
+ ST t0, 0 * SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST t1, 0 * SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST t2, 0 * SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST t3, 0 * SIZE(XX)
+ SXADDQ INCX, XX, XX
+ .align 4
+
+$L25:
+ and N, 7, I
+ unop
+ unop
+ ble I, $L999
+ .align 4
+
+$L27:
+ LD a0, 0 * SIZE(X)
+
+ MUL a0, ALPHA, t0
+
+ ST t0, 0 * SIZE(XX)
+
+ SXADDQ INCX, X, X
+ SXADDQ INCX, XX, XX
+
+ ldi I, -1(I)
+ bne I, $L27
+ .align 4
+
+$L999:
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/scal.S b/kernel/sw_64/scal.S
new file mode 100644
index 0000000..87b89c9
--- /dev/null
+++ b/kernel/sw_64/scal.S
@@ -0,0 +1,480 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define PREFETCHSIZE 88
+
+#define N $16
+#define X $20
+#define INCX $21
+
+#define XX $18
+#define I $19
+
+#define ALPHA $f19
+
+#define s0 $f0
+#define s1 $f1
+#define s2 $f10
+#define s3 $f11
+
+#define a0 $f12
+#define a1 $f13
+#define a2 $f14
+#define a3 $f15
+#define a4 $f16
+#define a5 $f17
+#define a6 $f18
+#define a7 $f21
+
+#define t0 $f22
+#define t1 $f23
+#define t2 $f24
+#define t3 $f25
+
+ PROLOGUE
+ PROFCODE
+
+ mov X, XX
+ ble N, $L999
+
+ cmpeq INCX, 1, $0
+ beq $0, $L20
+
+#ifndef DOUBLE
+ sra N, 4, I
+ ble I, $L15
+
+ LD a0, 0 * SIZE(X)
+ LD a1, 1 * SIZE(X)
+ LD a2, 2 * SIZE(X)
+ LD a3, 3 * SIZE(X)
+
+ LD a4, 4 * SIZE(X)
+ MUL a0, ALPHA, t0
+ LD a5, 5 * SIZE(X)
+ MUL a1, ALPHA, t1
+ LD a6, 6 * SIZE(X)
+ MUL a2, ALPHA, t2
+ LD a7, 7 * SIZE(X)
+ MUL a3, ALPHA, t3
+
+ ST t0, 0 * SIZE(X)
+ MUL a4, ALPHA, t0
+ ST t1, 1 * SIZE(X)
+ MUL a5, ALPHA, t1
+
+ ST t2, 2 * SIZE(X)
+ MUL a6, ALPHA, t2
+ ST t3, 3 * SIZE(X)
+ MUL a7, ALPHA, t3
+
+ LD a0, 8 * SIZE(X)
+ LD a1, 9 * SIZE(X)
+ LD a2, 10 * SIZE(X)
+ LD a3, 11 * SIZE(X)
+
+ ST t0, 4 * SIZE(X)
+ MUL a0, ALPHA, t0
+ ST t1, 5 * SIZE(X)
+ MUL a1, ALPHA, t1
+
+ ST t2, 6 * SIZE(X)
+ MUL a2, ALPHA, t2
+ ST t3, 7 * SIZE(X)
+ MUL a3, ALPHA, t3
+
+ LD a4, 12 * SIZE(X)
+ LD a5, 13 * SIZE(X)
+ LD a6, 14 * SIZE(X)
+ LD a7, 15 * SIZE(X)
+
+ ldi I, -1(I)
+ ble I, $L13
+ .align 4
+
+$L12:
+ ST t0, 8 * SIZE(X)
+ MUL a4, ALPHA, t0
+ ST t1, 9 * SIZE(X)
+ MUL a5, ALPHA, t1
+
+ ST t2, 10 * SIZE(X)
+ MUL a6, ALPHA, t2
+ ST t3, 11 * SIZE(X)
+ MUL a7, ALPHA, t3
+
+ LD a0, 16 * SIZE(X)
+ LD a1, 17 * SIZE(X)
+ LD a2, 18 * SIZE(X)
+ LD a3, 19 * SIZE(X)
+
+ ST t0, 12 * SIZE(X)
+ MUL a0, ALPHA, t0
+ ST t1, 13 * SIZE(X)
+ MUL a1, ALPHA, t1
+
+ ST t2, 14 * SIZE(X)
+ MUL a2, ALPHA, t2
+ ST t3, 15 * SIZE(X)
+ MUL a3, ALPHA, t3
+
+ LD a4, 20 * SIZE(X)
+ LD a5, 21 * SIZE(X)
+ LD a6, 22 * SIZE(X)
+ LD a7, 23 * SIZE(X)
+
+ ST t0, 16 * SIZE(X)
+ MUL a4, ALPHA, t0
+ ST t1, 17 * SIZE(X)
+ MUL a5, ALPHA, t1
+
+ ST t2, 18 * SIZE(X)
+ MUL a6, ALPHA, t2
+ ST t3, 19 * SIZE(X)
+ MUL a7, ALPHA, t3
+
+ LD a0, 24 * SIZE(X)
+ LD a1, 25 * SIZE(X)
+ LD a2, 26 * SIZE(X)
+ LD a3, 27 * SIZE(X)
+
+ ST t0, 20 * SIZE(X)
+ MUL a0, ALPHA, t0
+ ST t1, 21 * SIZE(X)
+ MUL a1, ALPHA, t1
+
+ ST t2, 22 * SIZE(X)
+ MUL a2, ALPHA, t2
+ ST t3, 23 * SIZE(X)
+ MUL a3, ALPHA, t3
+
+ LD a4, 28 * SIZE(X)
+ LD a5, 29 * SIZE(X)
+ LD a6, 30 * SIZE(X)
+ LD a7, 31 * SIZE(X)
+
+ fillcs PREFETCHSIZE * SIZE(X)
+ ldi I, -1(I)
+ addl X, 16 * SIZE, X
+ bne I, $L12
+ .align 4
+
+$L13:
+ ST t0, 8 * SIZE(X)
+ MUL a4, ALPHA, t0
+ ST t1, 9 * SIZE(X)
+ MUL a5, ALPHA, t1
+
+ ST t2, 10 * SIZE(X)
+ MUL a6, ALPHA, t2
+ ST t3, 11 * SIZE(X)
+ MUL a7, ALPHA, t3
+
+ ST t0, 12 * SIZE(X)
+ ST t1, 13 * SIZE(X)
+ ST t2, 14 * SIZE(X)
+ ST t3, 15 * SIZE(X)
+ addl X, 16 * SIZE, X
+ .align 4
+
+$L15:
+ and N, 15, I
+
+#else
+
+ sra N, 3, I
+ ble I, $L15
+
+ LD a0, 0 * SIZE(X)
+ LD a1, 1 * SIZE(X)
+ LD a2, 2 * SIZE(X)
+ LD a3, 3 * SIZE(X)
+
+ LD a4, 4 * SIZE(X)
+ MUL a0, ALPHA, t0
+ LD a5, 5 * SIZE(X)
+ MUL a1, ALPHA, t1
+
+ LD a6, 6 * SIZE(X)
+ MUL a2, ALPHA, t2
+ LD a7, 7 * SIZE(X)
+ MUL a3, ALPHA, t3
+
+ ldi I, -1(I)
+ ble I, $L13
+ .align 4
+
+$L12:
+ ST t0, 0 * SIZE(X)
+ MUL a4, ALPHA, t0
+ ST t1, 1 * SIZE(X)
+ MUL a5, ALPHA, t1
+
+ ST t2, 2 * SIZE(X)
+ MUL a6, ALPHA, t2
+ ST t3, 3 * SIZE(X)
+ MUL a7, ALPHA, t3
+
+ LD a0, 8 * SIZE(X)
+ ldi I, -1(I)
+ LD a1, 9 * SIZE(X)
+ addl X, 8 * SIZE, X
+
+ LD a2, 2 * SIZE(X)
+ LD a3, 3 * SIZE(X)
+
+ ST t0, -4 * SIZE(X)
+ MUL a0, ALPHA, t0
+ ST t1, -3 * SIZE(X)
+ MUL a1, ALPHA, t1
+
+ ST t2, -2 * SIZE(X)
+ MUL a2, ALPHA, t2
+ ST t3, -1 * SIZE(X)
+ MUL a3, ALPHA, t3
+
+ LD a4, 4 * SIZE(X)
+ LD a5, 5 * SIZE(X)
+
+ LD a6, 6 * SIZE(X)
+ LD a7, 7 * SIZE(X)
+ fillcs PREFETCHSIZE * SIZE(X)
+ bne I, $L12
+ .align 4
+
+$L13:
+ ST t0, 0 * SIZE(X)
+ MUL a4, ALPHA, t0
+ ST t1, 1 * SIZE(X)
+ MUL a5, ALPHA, t1
+
+ ST t2, 2 * SIZE(X)
+ MUL a6, ALPHA, t2
+ ST t3, 3 * SIZE(X)
+ MUL a7, ALPHA, t3
+
+ ST t0, 4 * SIZE(X)
+ ST t1, 5 * SIZE(X)
+ ST t2, 6 * SIZE(X)
+ ST t3, 7 * SIZE(X)
+ addl X, 8 * SIZE, X
+ .align 4
+
+$L15:
+ and N, 7, I
+
+#endif
+
+ unop
+ unop
+ ble I, $L999
+ .align 4
+
+$L17:
+ LD a0, 0 * SIZE(X)
+
+ MUL a0, ALPHA, t0
+
+ ST t0, 0 * SIZE(X)
+
+ addl X, SIZE, X
+
+ ldi I, -1(I)
+ bne I, $L17
+ ret
+ .align 4
+
+$L20:
+ sra N, 3, I
+ ble I, $L25
+
+ LD a0, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD a1, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD a2, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD a3, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ LD a4, 0 * SIZE(X)
+ MUL a0, ALPHA, t0
+ ldi I, -1(I)
+ SXADDQ INCX, X, X
+
+ LD a5, 0 * SIZE(X)
+ MUL a1, ALPHA, t1
+ SXADDQ INCX, X, X
+ unop
+
+ LD a6, 0 * SIZE(X)
+ MUL a2, ALPHA, t2
+ SXADDQ INCX, X, X
+ unop
+
+ LD a7, 0 * SIZE(X)
+ MUL a3, ALPHA, t3
+ SXADDQ INCX, X, X
+ ble I, $L23
+ .align 4
+
+$L22:
+ ST t0, 0 * SIZE(XX)
+ MUL a4, ALPHA, t0
+ fillcs PREFETCHSIZE * SIZE(X)
+ SXADDQ INCX, XX, XX
+
+ LD a0, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ ldi I, -1(I)
+ unop
+
+ ST t1, 0 * SIZE(XX)
+ MUL a5, ALPHA, t1
+ SXADDQ INCX, XX, XX
+ unop
+
+ LD a1, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ ST t2, 0 * SIZE(XX)
+ MUL a6, ALPHA, t2
+ SXADDQ INCX, XX, XX
+ unop
+
+ LD a2, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ ST t3, 0 * SIZE(XX)
+ MUL a7, ALPHA, t3
+ SXADDQ INCX, XX, XX
+ unop
+
+ LD a3, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ ST t0, 0 * SIZE(XX)
+ MUL a0, ALPHA, t0
+ SXADDQ INCX, XX, XX
+ unop
+
+ LD a4, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ ST t1, 0 * SIZE(XX)
+ MUL a1, ALPHA, t1
+ SXADDQ INCX, XX, XX
+ unop
+
+ LD a5, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ ST t2, 0 * SIZE(XX)
+ MUL a2, ALPHA, t2
+ SXADDQ INCX, XX, XX
+ unop
+
+ LD a6, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ ST t3, 0 * SIZE(XX)
+ MUL a3, ALPHA, t3
+ SXADDQ INCX, XX, XX
+ unop
+
+ LD a7, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ unop
+ bne I, $L22
+ .align 4
+
+$L23:
+ ST t0, 0 * SIZE(XX)
+ MUL a4, ALPHA, t0
+ SXADDQ INCX, XX, XX
+
+ ST t1, 0 * SIZE(XX)
+ MUL a5, ALPHA, t1
+ SXADDQ INCX, XX, XX
+
+ ST t2, 0 * SIZE(XX)
+ MUL a6, ALPHA, t2
+ SXADDQ INCX, XX, XX
+
+ ST t3, 0 * SIZE(XX)
+ MUL a7, ALPHA, t3
+ SXADDQ INCX, XX, XX
+
+ ST t0, 0 * SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST t1, 0 * SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST t2, 0 * SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST t3, 0 * SIZE(XX)
+ SXADDQ INCX, XX, XX
+ .align 4
+
+$L25:
+ and N, 7, I
+ unop
+ unop
+ ble I, $L999
+ .align 4
+
+$L27:
+ LD a0, 0 * SIZE(X)
+
+ MUL a0, ALPHA, t0
+
+ ST t0, 0 * SIZE(XX)
+
+ SXADDQ INCX, X, X
+ SXADDQ INCX, XX, XX
+
+ ldi I, -1(I)
+ bne I, $L27
+ .align 4
+
+$L999:
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/scal_simd.S b/kernel/sw_64/scal_simd.S
new file mode 100644
index 0000000..7462e99
--- /dev/null
+++ b/kernel/sw_64/scal_simd.S
@@ -0,0 +1,344 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define PREFETCHSIZE 144
+
+#define N $16
+#define X $20
+#define INCX $21
+
+#define XX $18
+#define I $19
+
+#define ALPHA $f19
+
+#define s0 $f0
+#define s1 $f1
+#define s2 $f10
+#define s3 $f11
+
+#define a0 $f12
+#define a1 $f13
+#define a2 $f14
+#define a3 $f15
+#define a4 $f16
+#define a5 $f17
+#define a6 $f18
+#define a7 $f21
+
+#define t0 $f22
+#define t1 $f23
+#define t2 $f24
+#define t3 $f25
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, 0, $26, 0
+
+ mov X, XX
+ ble N, $L999
+
+ cmpeq INCX, 1, $0
+ beq $0, $L20
+
+/**
+	test whether the address of X is aligned to the vector width
+**/
+	and	X, (VEC_LEN*SIZE-1), $4	# $4 = byte offset of X within a vector block
+	beq	$4, $Align_X_Access	# already aligned: skip the scalar peel loop
+
+ .align 5
+/**
+	process the unaligned leading elements of X
+**/
+	sra	N, 4, I
+	ble	I, $Remain	/* if N is smaller than the unroll size, there is no need to handle the unaligned prefix of X; jump straight to the remainder loop */
+
+	sra	$4, BASE_SHIFT, $4	# byte offset -> element offset
+	ldi	$3, VEC_LEN
+	subl	$3, $4, $4	# number of leading elements to peel off
+	subl	N, $4, N
+
+$UnAlign_X_Loop:
+ LD a0, 0*SIZE(X)
+ MUL a0, ALPHA, t0
+ ST t0, 0*SIZE(X)
+ addl X, SIZE, X
+
+
+
+ subl $4, 1, $4
+ bgt $4, $UnAlign_X_Loop
+ .align 5
+
+$Align_X_Access:
+
+/*
+	Main loop: unrolled by 16 elements (four vector registers per iteration)
+*/
+	sra	N, 4, I
+	vcpyf	ALPHA, ALPHA	# replicate the scalar ALPHA across the vector register so VMUL scales every element
+ ble I, $Remain
+
+ VLD a0, 0*VEC_LEN*SIZE(X)
+ VLD a1, 1*VEC_LEN*SIZE(X)
+ VLD a2, 2*VEC_LEN*SIZE(X)
+ VLD a3, 3*VEC_LEN*SIZE(X)
+
+ ldi I, -1(I)
+ ble I, $MainLoop_End
+ .align 5
+$MainLoop:
+ VMUL a0, ALPHA, t0
+ VLD a0, 4*VEC_LEN*SIZE(X)
+ VMUL a1, ALPHA, t1
+ VLD a1, 5*VEC_LEN*SIZE(X)
+
+ VMUL a2, ALPHA, t2
+ VLD a2, 6*VEC_LEN*SIZE(X)
+ VMUL a3, ALPHA, t3
+ VLD a3, 7*VEC_LEN*SIZE(X)
+
+ VST t0, 0*VEC_LEN*SIZE(X)
+ VST t1, 1*VEC_LEN*SIZE(X)
+ VST t2, 2*VEC_LEN*SIZE(X)
+ VST t3, 3*VEC_LEN*SIZE(X)
+
+ fillcs PREFETCHSIZE * SIZE(X)
+ ldi I, -1(I)
+ addl X, 16 * SIZE, X
+ bne I, $MainLoop
+ .align 5
+
+$MainLoop_End:
+ VMUL a0, ALPHA, t0
+ VST t0, 0*VEC_LEN*SIZE(X)
+ VMUL a1, ALPHA, t1
+ VST t1, 1*VEC_LEN*SIZE(X)
+
+ VMUL a2, ALPHA, t2
+ VST t2, 2*VEC_LEN*SIZE(X)
+ VMUL a3, ALPHA, t3
+ VST t3, 3*VEC_LEN*SIZE(X)
+
+ addl X, 16 * SIZE, X
+ .align 5
+
+$Remain:
+ and N, 15, I
+ unop
+ unop
+ ble I, $L999
+ .align 5
+
+$L17:
+ LD a0, 0 * SIZE(X)
+
+ MUL a0, ALPHA, t0
+
+ ST t0, 0 * SIZE(X)
+
+ addl X, SIZE, X
+
+ ldi I, -1(I)
+ bne I, $L17
+ ret
+ .align 5
+
+$L20:
+ sra N, 3, I
+ ble I, $L25
+
+ LD a0, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD a1, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD a2, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD a3, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ LD a4, 0 * SIZE(X)
+ MUL a0, ALPHA, t0
+ ldi I, -1(I)
+ SXADDQ INCX, X, X
+
+ LD a5, 0 * SIZE(X)
+ MUL a1, ALPHA, t1
+ SXADDQ INCX, X, X
+ unop
+
+ LD a6, 0 * SIZE(X)
+ MUL a2, ALPHA, t2
+ SXADDQ INCX, X, X
+ unop
+
+ LD a7, 0 * SIZE(X)
+ MUL a3, ALPHA, t3
+ SXADDQ INCX, X, X
+ ble I, $L23
+ .align 5
+
+$L22:
+ ST t0, 0 * SIZE(XX)
+ MUL a4, ALPHA, t0
+/*
+ fillcs PREFETCHSIZE * SIZE(X)
+*/
+ fillcs PREFETCHSIZE * SIZE(X)
+ SXADDQ INCX, XX, XX
+
+ LD a0, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ ldi I, -1(I)
+ unop
+
+ ST t1, 0 * SIZE(XX)
+ MUL a5, ALPHA, t1
+ SXADDQ INCX, XX, XX
+ unop
+
+ LD a1, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ ST t2, 0 * SIZE(XX)
+ MUL a6, ALPHA, t2
+ SXADDQ INCX, XX, XX
+ unop
+
+ LD a2, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ ST t3, 0 * SIZE(XX)
+ MUL a7, ALPHA, t3
+ SXADDQ INCX, XX, XX
+ unop
+
+ LD a3, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ ST t0, 0 * SIZE(XX)
+ MUL a0, ALPHA, t0
+ SXADDQ INCX, XX, XX
+ unop
+
+ LD a4, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ ST t1, 0 * SIZE(XX)
+ MUL a1, ALPHA, t1
+ SXADDQ INCX, XX, XX
+ unop
+
+ LD a5, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ ST t2, 0 * SIZE(XX)
+ MUL a2, ALPHA, t2
+ SXADDQ INCX, XX, XX
+ unop
+
+ LD a6, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ ST t3, 0 * SIZE(XX)
+ MUL a3, ALPHA, t3
+ SXADDQ INCX, XX, XX
+ unop
+
+ LD a7, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ unop
+ bne I, $L22
+ .align 5
+
+$L23:
+ ST t0, 0 * SIZE(XX)
+ MUL a4, ALPHA, t0
+ SXADDQ INCX, XX, XX
+
+ ST t1, 0 * SIZE(XX)
+ MUL a5, ALPHA, t1
+ SXADDQ INCX, XX, XX
+
+ ST t2, 0 * SIZE(XX)
+ MUL a6, ALPHA, t2
+ SXADDQ INCX, XX, XX
+
+ ST t3, 0 * SIZE(XX)
+ MUL a7, ALPHA, t3
+ SXADDQ INCX, XX, XX
+
+ ST t0, 0 * SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST t1, 0 * SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST t2, 0 * SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST t3, 0 * SIZE(XX)
+ SXADDQ INCX, XX, XX
+ .align 5
+
+$L25:
+ and N, 7, I
+ unop
+ unop
+ ble I, $L999
+ .align 5
+
+$L27:
+ LD a0, 0 * SIZE(X)
+
+ MUL a0, ALPHA, t0
+
+ ST t0, 0 * SIZE(XX)
+
+ SXADDQ INCX, X, X
+ SXADDQ INCX, XX, XX
+
+ ldi I, -1(I)
+ bne I, $L27
+ .align 5
+
+$L999:
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/snrm2.S b/kernel/sw_64/snrm2.S
new file mode 100644
index 0000000..ff1ec57
--- /dev/null
+++ b/kernel/sw_64/snrm2.S
@@ -0,0 +1,491 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+#include "version.h"
+
+#define PREFETCH_SIZE 80
+
+#define N $16
+#define X $17
+#define INCX $18
+#define XX $19
+
+#define I $0
+
+#define a0 $f0
+#define a1 $f1
+#define a2 $f10
+#define a3 $f11
+#define t0 $f12
+#define t1 $f13
+#define t2 $f14
+#define t3 $f15
+
+#define x0 $f16
+#define x1 $f17
+#define x2 $f18
+#define x3 $f19
+#define x4 $f20
+#define x5 $f21
+#define x6 $f22
+#define x7 $f23
+#define x8 $f24
+
+ PROLOGUE
+
+#if defined(EV4) || defined(EV5)
+ .frame $30,16,$26,0
+ .mask 0x4000000,-16
+ ldih $29, 0($27) !gpdisp!1
+ ldi $29, 0($29) !gpdisp!1
+
+ ldi $sp, -16($sp)
+ ldl $27, sqrt($29) !literal!2
+ stl $26, 0($sp)
+
+ PROFCODE
+ .prologue 1
+#else
+ PROFCODE
+#endif
+
+ fclr a0
+ SXADDQ INCX, 0, INCX
+ fclr a1
+ ble N, $L999
+
+ fclr a2
+ cmpeq INCX, SIZE, $0
+ fclr a3
+ beq $0, $L20
+
+ fclr t0
+ sra N, 4, I
+ fclr t1
+ ble I, $L15
+
+ fclr t2
+ LD x0, 0 * SIZE(X)
+ fclr t3
+ LD x1, 1 * SIZE(X)
+
+ LD x2, 2 * SIZE(X)
+ LD x3, 3 * SIZE(X)
+ LD x4, 4 * SIZE(X)
+ LD x5, 5 * SIZE(X)
+ LD x6, 6 * SIZE(X)
+ LD x7, 7 * SIZE(X)
+
+ ldi I, -1(I)
+ ble I, $L12
+ .align 4
+
+$L11:
+ faddd a0, t0, x8
+ fmov x8,a0
+ fillcs (PREFETCH_SIZE) * SIZE(X)
+ fmuld x0, x0, t0
+ LD x0, 8 * SIZE(X)
+
+ faddd a1, t1,x8
+ fmov x8,a1
+ mov X, XX
+ fmuld x1, x1, t1
+ LD x1, 9 * SIZE(X)
+
+ faddd a2, t2,x8
+ fmov x8,a2
+ #unop
+ fmuld x2, x2, t2
+ LD x2, 10 * SIZE(X)
+
+ faddd a3, t3,x8
+ fmov x8,a3
+ #unop
+ fmuld x3, x3, t3
+ LD x3, 11 * SIZE(X)
+
+ faddd a0, t0, x8
+ fmov x8,a0
+ #unop
+ fmuld x4, x4, t0
+ LD x4, 12 * SIZE(X)
+
+ faddd a1, t1, x8
+ fmov x8,a1
+ #unop
+ fmuld x5, x5, t1
+ LD x5, 13 * SIZE(X)
+
+ faddd a2, t2, x8
+ fmov x8,a2
+ #unop
+ fmuld x6, x6, t2
+ LD x6, 14 * SIZE(X)
+
+ faddd a3, t3, x8
+ fmov x8,a3
+ #unop
+ fmuld x7, x7, t3
+ LD x7, 15 * SIZE(X)
+
+ faddd a0, t0, x8
+ fmov x8,a0
+ #unop
+ fmuld x0, x0, t0
+ LD x0, 16 * SIZE(X)
+
+ faddd a1, t1,x8
+ fmov x8,a1
+ ldi X, 16 * SIZE(X)
+ fmuld x1, x1, t1
+ LD x1, 17 * SIZE(XX)
+
+ faddd a2, t2, x8
+ fmov x8,a2
+ #unop
+ fmuld x2, x2, t2
+ LD x2, 18 * SIZE(XX)
+
+ faddd a3, t3,x8
+ fmov x8,a3
+ #unop
+ fmuld x3, x3, t3
+ LD x3, 19 * SIZE(XX)
+
+ faddd a0, t0, x8
+ fmov x8,a0
+ #unop
+ fmuld x4, x4, t0
+ LD x4, 20 * SIZE(XX)
+
+ faddd a1, t1,x8
+ fmov x8,a1
+ ldi I, -1(I)
+ fmuld x5, x5, t1
+ LD x5, 21 * SIZE(XX)
+
+ faddd a2, t2, x8
+ fmov x8,a2
+ #unop
+ fmuld x6, x6, t2
+ LD x6, 22 * SIZE(XX)
+
+ faddd a3, t3,x8
+ fmov x8,a3
+ fmuld x7, x7, t3
+ LD x7, 23 * SIZE(XX)
+ bgt I, $L11
+ .align 4
+
+$L12:
+ faddd a0, t0,x8
+ fmov x8,a0
+ mov X, XX
+ fmuld x0, x0, t0
+ LD x0, 8 * SIZE(X)
+
+ faddd a1, t1,x8
+ fmov x8,a1
+ #unop
+ fmuld x1, x1, t1
+ LD x1, 9 * SIZE(X)
+
+ faddd a2, t2,x8
+ fmov x8,a2
+ #unop
+ fmuld x2, x2, t2
+ LD x2, 10 * SIZE(X)
+
+ faddd a3, t3, x8
+ fmov x8,a3
+ #unop
+ fmuld x3, x3, t3
+ LD x3, 11 * SIZE(X)
+
+ faddd a0, t0, x8
+ fmov x8,a0
+ #unop
+ fmuld x4, x4, t0
+ LD x4, 12 * SIZE(XX)
+
+ faddd a1, t1, x8
+ fmov x8,a1
+ #unop
+ fmuld x5, x5, t1
+ LD x5, 13 * SIZE(XX)
+
+ faddd a2, t2, x8
+ fmov x8,a2
+ #unop
+ fmuld x6, x6, t2
+ LD x6, 14 * SIZE(XX)
+
+ faddd a3, t3,x8
+ fmov x8,a3
+ ldi X, 16 * SIZE(X)
+ fmuld x7, x7, t3
+ LD x7, 15 * SIZE(XX)
+
+ faddd a0, t0,x8
+ fmov x8,a0
+ fmuld x0, x0, t0
+ faddd a1, t1, x8
+ fmov x8,a1
+ fmuld x1, x1, t1
+
+ faddd a2, t2, x8
+ fmov x8,a2
+ fmuld x2, x2, t2
+ faddd a3, t3, x8
+ fmov x8,a3
+ fmuld x3, x3, t3
+
+ faddd a0, t0, x8
+ fmov x8,a0
+ fmuld x4, x4, t0
+ faddd a1, t1, x8
+ fmov x8,a1
+ fmuld x5, x5, t1
+
+ faddd a2, t2, x8
+ fmov x8,a2
+ fmuld x6, x6, t2
+ faddd a3, t3, x8
+ fmov x8,a3
+ fmuld x7, x7, t3
+
+ faddd a1, t1, x8
+ fmov x8,a1
+ faddd a2, t2, x8
+ fmov x8,a2
+ faddd a3, t3, x8
+ fmov x8,a3
+ .align 4
+
+$L15:
+ and N, 15, I
+ ble I, $L998
+ .align 4
+
+$L16:
+ LD x0, 0 * SIZE(X)
+ ldi X, 1 * SIZE(X)
+
+ faddd a0, t0,x8
+ fmov x8,a0
+ fmuld x0, x0, t0
+
+ ldi I, -1(I)
+ bgt I, $L16
+ bsr $31, $L998
+ .align 4
+
+$L20:
+ fclr t0
+ sra N, 3, I
+ fclr t1
+ ble I, $L25
+
+ fclr t2
+ fclr t3
+
+ LD x0, 0 * SIZE(X)
+ addl X, INCX, X
+ LD x1, 0 * SIZE(X)
+ addl X, INCX, X
+ LD x2, 0 * SIZE(X)
+ addl X, INCX, X
+ LD x3, 0 * SIZE(X)
+ addl X, INCX, X
+
+ LD x4, 0 * SIZE(X)
+ addl X, INCX, X
+ LD x5, 0 * SIZE(X)
+ addl X, INCX, X
+ LD x6, 0 * SIZE(X)
+ addl X, INCX, X
+
+ ldi I, -1(I)
+ ble I, $L22
+ .align 4
+
+$L21:
+ faddd a0, t0, x8
+ fmov x8,a0
+ LD x7, 0 * SIZE(X)
+ fmuld x0, x0, t0
+ addl X, INCX, X
+
+ faddd a1, t1,x8
+ fmov x8,a1
+ LD x0, 0 * SIZE(X)
+ fmuld x1, x1, t1
+ addl X, INCX, X
+
+ faddd a2, t2,x8
+ fmov x8,a2
+ LD x1, 0 * SIZE(X)
+ fmuld x2, x2, t2
+ addl X, INCX, X
+
+ faddd a3, t3,x8
+ fmov x8,a3
+ LD x2, 0 * SIZE(X)
+ fmuld x3, x3, t3
+ addl X, INCX, X
+
+ faddd a0, t0,x8
+ fmov x8,a0
+ LD x3, 0 * SIZE(X)
+ fmuld x4, x4, t0
+ addl X, INCX, X
+
+ faddd a1, t1,x8
+ fmov x8,a1
+ LD x4, 0 * SIZE(X)
+ fmuld x5, x5, t1
+ addl X, INCX, X
+
+ faddd a2, t2,x8
+ fmov x8,a2
+ LD x5, 0 * SIZE(X)
+ fmuld x6, x6, t2
+ addl X, INCX, X
+
+ faddd a3, t3, x8
+ fmov x8,a3
+ LD x6, 0 * SIZE(X)
+ fmuld x7, x7, t3
+ addl X, INCX, X
+
+ ldi I, -1(I)
+ bgt I, $L21
+ .align 4
+
+$L22:
+ faddd a0, t0,x8
+ fmov x8,a0
+ LD x7, 0 * SIZE(X)
+ fmuld x0, x0, t0
+ addl X, INCX, X
+
+ faddd a1, t1, x8
+ fmov x8,a1
+ unop
+ fmuld x1, x1, t1
+ unop
+
+ faddd a2, t2,x8
+ fmov x8,a2
+ fmuld x2, x2, t2
+ faddd a3, t3, x8
+ fmov x8,a3
+ fmuld x3, x3, t3
+
+ faddd a0, t0, x8
+ fmov x8,a0
+ fmuld x4, x4, t0
+ faddd a1, t1, x8
+ fmov x8,a1
+ fmuld x5, x5, t1
+
+ faddd a2, t2, x8
+ fmov x8,a2
+ fmuld x6, x6, t2
+ faddd a3, t3, x8
+ fmov x8,a3
+ fmuld x7, x7, t3
+
+ faddd a1, t1, x8
+ fmov x8,a1
+ faddd a2, t2, x8
+ fmov x8,a2
+ faddd a3, t3, x8
+ fmov x8,a3
+ .align 4
+
+$L25:
+ and N, 7, I
+ ble I, $L998
+ .align 4
+
+$L26:
+ LD x0, 0 * SIZE(X)
+ addl X, INCX, X
+
+ faddd a0, t0,x8
+ fmov x8,a0
+ fmuld x0, x0, t0
+
+ ldi I, -1(I)
+ bgt I, $L26
+ .align 4
+
+
+$L998:
+ faddd a0, t0,x8
+ fmov x8,a0
+
+ faddd a0, a1, x8
+ fmov x8,a1
+ faddd a2, a3, x8
+ fmov x8,a2
+
+#if defined(EV4) || defined(EV5)
+ faddd a0, a2, $f16
+ jsr $26, ($27), sqrt !lituse_jsr!2
+
+ ldih $29, 0($26) !gpdisp!3
+ ldi $29, 0($29) !gpdisp!3
+#else
+ faddd a0, a2,x8
+ fsqrtd x8, a0
+#endif
+ .align 4
+
+$L999:
+#if defined(EV4) || defined(EV5)
+ ldl $26, 0($sp)
+ ldi $sp, 16($sp)
+#endif
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/snrm2.S.bak b/kernel/sw_64/snrm2.S.bak
new file mode 100644
index 0000000..753c90b
--- /dev/null
+++ b/kernel/sw_64/snrm2.S.bak
@@ -0,0 +1,431 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+#include "version.h"
+
+#define PREFETCH_SIZE 80
+
+#define N $16
+#define X $17
+#define INCX $18
+#define XX $19
+
+#define I $0
+
+#define a0 $f0
+#define a1 $f1
+#define a2 $f10
+#define a3 $f11
+#define t0 $f12
+#define t1 $f13
+#define t2 $f14
+#define t3 $f15
+
+#define x0 $f16
+#define x1 $f17
+#define x2 $f18
+#define x3 $f19
+#define x4 $f20
+#define x5 $f21
+#define x6 $f22
+#define x7 $f23
+
+ PROLOGUE
+
+#if defined(EV4) || defined(EV5)
+ .frame $30,16,$26,0
+ .mask 0x4000000,-16
+ ldih $29, 0($27) !gpdisp!1
+ ldi $29, 0($29) !gpdisp!1
+
+ ldi $sp, -16($sp)
+ ldl $27, sqrt($29) !literal!2
+ stq $26, 0($sp)
+
+ PROFCODE
+ .prologue 1
+#else
+ PROFCODE
+#endif
+
+ fclr a0
+ SXADDQ INCX, 0, INCX
+ fclr a1
+ ble N, $L999
+
+ fclr a2
+ cmpeq INCX, SIZE, $0
+ fclr a3
+ beq $0, $L20
+
+ fclr t0
+ sra N, 4, I
+ fclr t1
+ ble I, $L15
+
+ fclr t2
+ LD x0, 0 * SIZE(X)
+ fclr t3
+ LD x1, 1 * SIZE(X)
+
+ LD x2, 2 * SIZE(X)
+ LD x3, 3 * SIZE(X)
+ LD x4, 4 * SIZE(X)
+ LD x5, 5 * SIZE(X)
+ LD x6, 6 * SIZE(X)
+ LD x7, 7 * SIZE(X)
+
+ ldi I, -1(I)
+ ble I, $L12
+ .align 4
+
+$L11:
+ faddd a0, t0, a0
+ fillcs (PREFETCH_SIZE) * SIZE(X)
+ fmuld x0, x0, t0
+ LD x0, 8 * SIZE(X)
+
+ faddd a1, t1, a1
+ mov X, XX
+ fmuld x1, x1, t1
+ LD x1, 9 * SIZE(X)
+
+ faddd a2, t2, a2
+ unop
+ fmuld x2, x2, t2
+ LD x2, 10 * SIZE(X)
+
+ faddd a3, t3, a3
+ unop
+ fmuld x3, x3, t3
+ LD x3, 11 * SIZE(X)
+
+ faddd a0, t0, a0
+ unop
+ fmuld x4, x4, t0
+ LD x4, 12 * SIZE(X)
+
+ faddd a1, t1, a1
+ unop
+ fmuld x5, x5, t1
+ LD x5, 13 * SIZE(X)
+
+ faddd a2, t2, a2
+ unop
+ fmuld x6, x6, t2
+ LD x6, 14 * SIZE(X)
+
+ faddd a3, t3, a3
+ unop
+ fmuld x7, x7, t3
+ LD x7, 15 * SIZE(X)
+
+ faddd a0, t0, a0
+ unop
+ fmuld x0, x0, t0
+ LD x0, 16 * SIZE(X)
+
+ faddd a1, t1, a1
+ ldi X, 16 * SIZE(X)
+ fmuld x1, x1, t1
+ LD x1, 17 * SIZE(XX)
+
+ faddd a2, t2, a2
+ unop
+ fmuld x2, x2, t2
+ LD x2, 18 * SIZE(XX)
+
+ faddd a3, t3, a3
+ unop
+ fmuld x3, x3, t3
+ LD x3, 19 * SIZE(XX)
+
+ faddd a0, t0, a0
+ unop
+ fmuld x4, x4, t0
+ LD x4, 20 * SIZE(XX)
+
+ faddd a1, t1, a1
+ ldi I, -1(I)
+ fmuld x5, x5, t1
+ LD x5, 21 * SIZE(XX)
+
+ faddd a2, t2, a2
+ unop
+ fmuld x6, x6, t2
+ LD x6, 22 * SIZE(XX)
+
+ faddd a3, t3, a3
+ fmuld x7, x7, t3
+ LD x7, 23 * SIZE(XX)
+ bgt I, $L11
+ .align 4
+
+$L12:
+ faddd a0, t0, a0
+ mov X, XX
+ fmuld x0, x0, t0
+ LD x0, 8 * SIZE(X)
+
+ faddd a1, t1, a1
+ unop
+ fmuld x1, x1, t1
+ LD x1, 9 * SIZE(X)
+
+ faddd a2, t2, a2
+ unop
+ fmuld x2, x2, t2
+ LD x2, 10 * SIZE(X)
+
+ faddd a3, t3, a3
+ unop
+ fmuld x3, x3, t3
+ LD x3, 11 * SIZE(X)
+
+ faddd a0, t0, a0
+ unop
+ fmuld x4, x4, t0
+ LD x4, 12 * SIZE(XX)
+
+ faddd a1, t1, a1
+ unop
+ fmuld x5, x5, t1
+ LD x5, 13 * SIZE(XX)
+
+ faddd a2, t2, a2
+ unop
+ fmuld x6, x6, t2
+ LD x6, 14 * SIZE(XX)
+
+ faddd a3, t3, a3
+ ldi X, 16 * SIZE(X)
+ fmuld x7, x7, t3
+ LD x7, 15 * SIZE(XX)
+
+ faddd a0, t0, a0
+ fmuld x0, x0, t0
+ faddd a1, t1, a1
+ fmuld x1, x1, t1
+
+ faddd a2, t2, a2
+ fmuld x2, x2, t2
+ faddd a3, t3, a3
+ fmuld x3, x3, t3
+
+ faddd a0, t0, a0
+ fmuld x4, x4, t0
+ faddd a1, t1, a1
+ fmuld x5, x5, t1
+
+ faddd a2, t2, a2
+ fmuld x6, x6, t2
+ faddd a3, t3, a3
+ fmuld x7, x7, t3
+
+ faddd a1, t1, a1
+ faddd a2, t2, a2
+ faddd a3, t3, a3
+ .align 4
+
+$L15:
+ and N, 15, I
+ ble I, $L998
+ .align 4
+
+$L16:
+ LD x0, 0 * SIZE(X)
+ ldi X, 1 * SIZE(X)
+
+ faddd a0, t0, a0
+ fmuld x0, x0, t0
+
+ ldi I, -1(I)
+ bgt I, $L16
+ bsr $31, $L998
+ .align 4
+
+$L20:
+ fclr t0
+ sra N, 3, I
+ fclr t1
+ ble I, $L25
+
+ fclr t2
+ fclr t3
+
+ LD x0, 0 * SIZE(X)
+ addl X, INCX, X
+ LD x1, 0 * SIZE(X)
+ addl X, INCX, X
+ LD x2, 0 * SIZE(X)
+ addl X, INCX, X
+ LD x3, 0 * SIZE(X)
+ addl X, INCX, X
+
+ LD x4, 0 * SIZE(X)
+ addl X, INCX, X
+ LD x5, 0 * SIZE(X)
+ addl X, INCX, X
+ LD x6, 0 * SIZE(X)
+ addl X, INCX, X
+
+ ldi I, -1(I)
+ ble I, $L22
+ .align 4
+
+$L21:
+ faddd a0, t0, a0
+ LD x7, 0 * SIZE(X)
+ fmuld x0, x0, t0
+ addl X, INCX, X
+
+ faddd a1, t1, a1
+ LD x0, 0 * SIZE(X)
+ fmuld x1, x1, t1
+ addl X, INCX, X
+
+ faddd a2, t2, a2
+ LD x1, 0 * SIZE(X)
+ fmuld x2, x2, t2
+ addl X, INCX, X
+
+ faddd a3, t3, a3
+ LD x2, 0 * SIZE(X)
+ fmuld x3, x3, t3
+ addl X, INCX, X
+
+ faddd a0, t0, a0
+ LD x3, 0 * SIZE(X)
+ fmuld x4, x4, t0
+ addl X, INCX, X
+
+ faddd a1, t1, a1
+ LD x4, 0 * SIZE(X)
+ fmuld x5, x5, t1
+ addl X, INCX, X
+
+ faddd a2, t2, a2
+ LD x5, 0 * SIZE(X)
+ fmuld x6, x6, t2
+ addl X, INCX, X
+
+ faddd a3, t3, a3
+ LD x6, 0 * SIZE(X)
+ fmuld x7, x7, t3
+ addl X, INCX, X
+
+ ldi I, -1(I)
+ bgt I, $L21
+ .align 4
+
+$L22:
+ faddd a0, t0, a0
+ LD x7, 0 * SIZE(X)
+ fmuld x0, x0, t0
+ addl X, INCX, X
+
+ faddd a1, t1, a1
+ unop
+ fmuld x1, x1, t1
+ unop
+
+ faddd a2, t2, a2
+ fmuld x2, x2, t2
+ faddd a3, t3, a3
+ fmuld x3, x3, t3
+
+ faddd a0, t0, a0
+ fmuld x4, x4, t0
+ faddd a1, t1, a1
+ fmuld x5, x5, t1
+
+ faddd a2, t2, a2
+ fmuld x6, x6, t2
+ faddd a3, t3, a3
+ fmuld x7, x7, t3
+
+ faddd a1, t1, a1
+ faddd a2, t2, a2
+ faddd a3, t3, a3
+ .align 4
+
+$L25:
+ and N, 7, I
+ ble I, $L998
+ .align 4
+
+$L26:
+ LD x0, 0 * SIZE(X)
+ addl X, INCX, X
+
+ faddd a0, t0, a0
+ fmuld x0, x0, t0
+
+ ldi I, -1(I)
+ bgt I, $L26
+ .align 4
+
+
+$L998:
+ faddd a0, t0, a0
+
+ faddd a0, a1, a0
+ faddd a2, a3, a2
+
+#if defined(EV4) || defined(EV5)
+ faddd a0, a2, $f16
+ jsr $26, ($27), sqrt !lituse_jsr!2
+
+ ldih $29, 0($26) !gpdisp!3
+ ldi $29, 0($29) !gpdisp!3
+#else
+ faddd a0, a2, a0
+ fsqrtd a0, a0
+#endif
+ .align 4
+
+$L999:
+#if defined(EV4) || defined(EV5)
+ ldl $26, 0($sp)
+ ldi $sp, 16($sp)
+#endif
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/staticbuffer.S b/kernel/sw_64/staticbuffer.S
new file mode 100644
index 0000000..7bbd23d
--- /dev/null
+++ b/kernel/sw_64/staticbuffer.S
@@ -0,0 +1,45 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#ifdef ALLOC_STATIC
+ .align 8
+ .comm alloc_area, (NUM_BUFFERS * BUFFER_SIZE), 16384
+#endif
diff --git a/kernel/sw_64/sum.S b/kernel/sw_64/sum.S
new file mode 100644
index 0000000..0be6d53
--- /dev/null
+++ b/kernel/sw_64/sum.S
@@ -0,0 +1,230 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define PREFETCHSIZE 88
+
+#define N $16
+#define X $17
+#define INCX $18
+#define I $19
+
+#define s0 $f0
+#define s1 $f1
+#define s2 $f10
+#define s3 $f11
+
+#define a0 $f12
+#define a1 $f13
+#define a2 $f14
+#define a3 $f15
+#define a4 $f16
+#define a5 $f17
+#define a6 $f18
+#define a7 $f19
+
+#define t0 $f20
+#define t1 $f21
+#define t2 $f22
+#define t3 $f23
+
+ PROLOGUE
+ PROFCODE
+
+ fclr s0
+ unop
+ fclr t0
+ ble N, $L999
+
+ sra N, 3, I
+ fclr s1
+ fclr s2
+ ble I, $L15
+
+ LD a0, 0 * SIZE(X)
+ fclr t1
+ SXADDQ INCX, X, X
+ fclr t2
+
+ LD a1, 0 * SIZE(X)
+ fclr t3
+ SXADDQ INCX, X, X
+ fclr s3
+
+ LD a2, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD a3, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ LD a4, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD a5, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ ldi I, -1(I)
+ ble I, $L13
+ .align 4
+
+$L12:
+ ADD s0, t0, $f24
+ fmov $f24,s0
+ ldw $31, PREFETCHSIZE * 2 * SIZE(X)
+ fmov a0, t0
+ ldi I, -1(I)
+
+ ADD s1, t1, $f24
+ fmov $f24,s1
+ LD a6, 0 * SIZE(X)
+ fmov a1, t1
+ SXADDQ INCX, X, X
+
+ ADD s2, t2, $f24
+ fmov $f24,s2
+ LD a7, 0 * SIZE(X)
+ fmov a2, t2
+ SXADDQ INCX, X, X
+
+ ADD s3, t3, $f24
+ fmov $f24,s3
+ LD a0, 0 * SIZE(X)
+ fmov a3, t3
+ SXADDQ INCX, X, X
+
+ ADD s0, t0, $f24
+ fmov $f24,s0
+ LD a1, 0 * SIZE(X)
+ fmov a4, t0
+ SXADDQ INCX, X, X
+
+ ADD s1, t1, $f24
+ fmov $f24,s1
+ LD a2, 0 * SIZE(X)
+ fmov a5, t1
+ SXADDQ INCX, X, X
+
+ ADD s2, t2, $f24
+ fmov $f24,s2
+ LD a3, 0 * SIZE(X)
+ fmov a6, t2
+ SXADDQ INCX, X, X
+
+ ADD s3, t3, $f24
+ fmov $f24,s3
+ LD a4, 0 * SIZE(X)
+ fmov a7, t3
+ SXADDQ INCX, X, X
+
+ LD a5, 0 * SIZE(X)
+ unop
+ SXADDQ INCX, X, X
+ bne I, $L12
+ .align 4
+
+$L13:
+ ADD s0, t0, $f24
+ fmov $f24,s0
+ LD a6, 0 * SIZE(X)
+ fmov a0, t0
+ SXADDQ INCX, X, X
+
+ ADD s1, t1, $f24
+ fmov $f24,s1
+ LD a7, 0 * SIZE(X)
+ fmov a1, t1
+ SXADDQ INCX, X, X
+
+ ADD s2, t2, $f24
+ fmov $f24,s2
+ fmov a2, t2
+ ADD s3, t3, $f24
+ fmov $f24,s3
+ fmov a3, t3
+
+ ADD s0, t0, $f24
+ fmov $f24,s0
+ fmov a4, t0
+ ADD s1, t1, $f24
+ fmov $f24,s1
+ fmov a5, t1
+ ADD s2, t2, $f24
+ fmov $f24,s2
+ fmov a6, t2
+ ADD s3, t3, $f24
+ fmov $f24,s3
+ fmov a7, t3
+
+ ADD s1, t1, $f24
+ fmov $f24,s1
+ ADD s2, t2, $f24
+ fmov $f24,s2
+ ADD s3, t3, $f24
+ fmov $f24,s3
+
+ ADD s0, s1, $f24
+ fmov $f24,s0
+ ADD s2, s3, $f24
+ fmov $f24,s2
+ .align 4
+
+$L15:
+ and N, 7, I
+ ADD s0, s2, $f24
+ fmov $f24,s0
+ unop
+ ble I, $L999
+ .align 4
+
+$L17:
+ ADD s0, t0, $f24
+ fmov $f24,s0
+ LD a0, 0 * SIZE(X)
+ SXADDQ INCX, X, X
+ fmov a0, t0
+
+ ldi I, -1(I)
+ bne I, $L17
+ .align 4
+
+$L999:
+ ADD s0, t0, $f24
+ fmov $f24,s0
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/sw_fpcr.S b/kernel/sw_64/sw_fpcr.S
new file mode 100644
index 0000000..5dee238
--- /dev/null
+++ b/kernel/sw_64/sw_fpcr.S
@@ -0,0 +1,39 @@
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+ .arch sw2b
+ .set noat
+ .set noreorder
+.text
+ .align 5
+ .globl read_fpcr
+ .ent read_fpcr
+read_fpcr:
+ .frame $sp, 0, $26, 0
+ RFPCR $f10
+ fstd $f10, 0($16)
+ ret
+ .end read_fpcr
+
+ .globl write_fpcr
+ .ent write_fpcr
+write_fpcr:
+ .frame $sp, 0, $26, 0
+ fldd $f10, 0($16)
+ WFPCR $f10
+ ret
+ .end write_fpcr
+/**
+ .globl fadd_test
+ .ent fadd_test
+
+fadd_test:
+ .frame $sp, 0, $26, 0
+ faddd $f16, $f17, $f16
+ fmov $f16, $f0
+ ret
+ .end fadd_test
+**/
+ .ident VERSION
+
diff --git a/kernel/sw_64/sw_fpcr_inline.c b/kernel/sw_64/sw_fpcr_inline.c
new file mode 100644
index 0000000..1943e3e
--- /dev/null
+++ b/kernel/sw_64/sw_fpcr_inline.c
@@ -0,0 +1,13 @@
+#include "common.h"
+
+void read_fpcr(long * test){
+
+	__asm__("rfpcr $f10 \n fstd $f10, %0":"=m"(*test):);	/* read the FP control register into *test, using $f10 as a scratch register */
+ return;
+}
+
+void write_fpcr(long * test){
+
+ __asm__("fldd $f10, %0\nwfpcr $f10"::"m"(*test));
+ return;
+}
diff --git a/kernel/sw_64/swap.S b/kernel/sw_64/swap.S
new file mode 100644
index 0000000..5c8b679
--- /dev/null
+++ b/kernel/sw_64/swap.S
@@ -0,0 +1,249 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, 0, $26, 0
+
+ mov $20, $17
+ mov $21, $18
+ ldl $19, 0($sp)
+ ldl $20, 8($sp)
+#ifndef PROFILE
+ .prologue 0
+#else
+ .prologue 1
+#endif
+
+ subl $18, 1, $1
+ subl $20, 1, $2
+	ble	$16, $SubEnd	# if n <= 0 return immediately
+ or $1, $2, $1
+
+ sra $16, 3, $21
+
+ and $16, 7, $22
+ bne $1, $Sub
+ ble $21, $MainRemain
+ .align 4
+
+$MainLoop:
+ LD $f10, 0*SIZE($19)
+ LD $f11, 1*SIZE($19)
+ LD $f12, 2*SIZE($19)
+ LD $f13, 3*SIZE($19)
+ LD $f14, 4*SIZE($19)
+ LD $f15, 5*SIZE($19)
+ LD $f16, 6*SIZE($19)
+ LD $f17, 7*SIZE($19)
+
+ LD $f20, 0*SIZE($17)
+ LD $f21, 1*SIZE($17)
+ LD $f22, 2*SIZE($17)
+ LD $f23, 3*SIZE($17)
+ LD $f24, 4*SIZE($17)
+ LD $f25, 5*SIZE($17)
+ LD $f26, 6*SIZE($17)
+ LD $f27, 7*SIZE($17)
+
+ fillcs 32*SIZE($17)
+ unop
+ fillcs 32*SIZE($19)
+ subl $21, 1, $21
+
+ ST $f10, 0*SIZE($17)
+ ST $f11, 1*SIZE($17)
+ ST $f12, 2*SIZE($17)
+ ST $f13, 3*SIZE($17)
+ ST $f14, 4*SIZE($17)
+ ST $f15, 5*SIZE($17)
+ ST $f16, 6*SIZE($17)
+ ST $f17, 7*SIZE($17)
+
+ ST $f20, 0*SIZE($19)
+ ST $f21, 1*SIZE($19)
+ ST $f22, 2*SIZE($19)
+ ST $f23, 3*SIZE($19)
+ ST $f24, 4*SIZE($19)
+ ST $f25, 5*SIZE($19)
+ ST $f26, 6*SIZE($19)
+ ST $f27, 7*SIZE($19)
+
+ ldi $17, 8*SIZE($17)
+ ldi $19, 8*SIZE($19)
+ bgt $21, $MainLoop
+ .align 4
+
+$MainRemain:
+ ble $22, $MainEnd
+ .align 4
+
+$MainRemainLoop:
+ LD $f10, 0*SIZE($19)
+ LD $f20, 0*SIZE($17)
+ ldi $17, 1*SIZE($17)
+ ldi $19, 1*SIZE($19)
+ subl $22, 1, $22
+ ST $f10, -1*SIZE($17)
+ ST $f20, -1*SIZE($19)
+ bgt $22, $MainRemainLoop
+ .align 4
+
+$MainEnd:
+ clr $0
+ ret
+ .align 4
+
+$Sub:
+ mov $17, $23
+ mov $19, $24
+
+ ble $21, $SubRemain
+ .align 4
+
+$SubLoop:
+ LD $f10, 0*SIZE($19)
+ SXADDQ $20, $19, $19
+ LD $f11, 0*SIZE($19)
+ SXADDQ $20, $19, $19
+
+ LD $f12, 0*SIZE($19)
+ SXADDQ $20, $19, $19
+ LD $f13, 0*SIZE($19)
+ SXADDQ $20, $19, $19
+
+ LD $f14, 0*SIZE($19)
+ SXADDQ $20, $19, $19
+ LD $f15, 0*SIZE($19)
+ SXADDQ $20, $19, $19
+
+ LD $f16, 0*SIZE($19)
+ SXADDQ $20, $19, $19
+ LD $f17, 0*SIZE($19)
+ SXADDQ $20, $19, $19
+
+ LD $f20, 0*SIZE($17)
+ SXADDQ $18, $17, $17
+ LD $f21, 0*SIZE($17)
+ SXADDQ $18, $17, $17
+
+ LD $f22, 0*SIZE($17)
+ SXADDQ $18, $17, $17
+ LD $f23, 0*SIZE($17)
+ SXADDQ $18, $17, $17
+
+ LD $f24, 0*SIZE($17)
+ SXADDQ $18, $17, $17
+ LD $f25, 0*SIZE($17)
+ SXADDQ $18, $17, $17
+
+ LD $f26, 0*SIZE($17)
+ SXADDQ $18, $17, $17
+ LD $f27, 0*SIZE($17)
+ SXADDQ $18, $17, $17
+
+ ST $f10, 0*SIZE($23)
+ SXADDQ $18, $23, $23
+ ST $f11, 0*SIZE($23)
+ SXADDQ $18, $23, $23
+
+ ST $f12, 0*SIZE($23)
+ SXADDQ $18, $23, $23
+ ST $f13, 0*SIZE($23)
+ SXADDQ $18, $23, $23
+
+ ST $f14, 0*SIZE($23)
+ SXADDQ $18, $23, $23
+ ST $f15, 0*SIZE($23)
+ SXADDQ $18, $23, $23
+
+ ST $f16, 0*SIZE($23)
+ SXADDQ $18, $23, $23
+ ST $f17, 0*SIZE($23)
+ SXADDQ $18, $23, $23
+
+ ST $f20, 0*SIZE($24)
+ SXADDQ $20, $24, $24
+ ST $f21, 0*SIZE($24)
+ SXADDQ $20, $24, $24
+
+ ST $f22, 0*SIZE($24)
+ SXADDQ $20, $24, $24
+ ST $f23, 0*SIZE($24)
+ SXADDQ $20, $24, $24
+
+ ST $f24, 0*SIZE($24)
+ SXADDQ $20, $24, $24
+ ST $f25, 0*SIZE($24)
+ SXADDQ $20, $24, $24
+
+ ST $f26, 0*SIZE($24)
+ SXADDQ $20, $24, $24
+ ST $f27, 0*SIZE($24)
+ SXADDQ $20, $24, $24
+
+ subl $21, 1, $21
+ bgt $21, $SubLoop
+ .align 4
+
+$SubRemain:
+ ble $22, $SubEnd
+ .align 4
+
+$SubRemainLoop:
+ LD $f10, 0*SIZE($19)
+ LD $f20, 0*SIZE($17)
+
+ subl $22, 1, $22
+
+ ST $f10, 0*SIZE($17)
+ ST $f20, 0*SIZE($19)
+
+ SXADDQ $18, $17, $17
+ SXADDQ $20, $19, $19
+ bgt $22, $SubRemainLoop
+ .align 4
+
+$SubEnd:
+ clr $0
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/swap_simd.S b/kernel/sw_64/swap_simd.S
new file mode 100644
index 0000000..8a6141d
--- /dev/null
+++ b/kernel/sw_64/swap_simd.S
@@ -0,0 +1,327 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define PREFETCHSIZE 64
+#define X $17
+#define Y $19
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, 0, $26, 0
+
+ mov $20, $17
+ mov $21, $18
+ ldl $19, 0($sp)
+ ldl $20, 8($sp)
+#ifndef PROFILE
+ .prologue 0
+#else
+ .prologue 1
+#endif
+
+ subl $18, 1, $1
+ subl $20, 1, $2
+ ble $16, $SubEnd # if n <= 0 goto $End
+ or $1, $2, $1
+
+/*
+ Unroll the main loop by 16 elements
+*/
+ sra $16, 4, $21
+ and $16, 15, $22
+ bne $1, $Sub
+ ble $21, $MainRemain
+ .align 4
+
+/*
+ check whether the addresses of Y and X are vector-aligned
+*/
+ and Y, (VEC_LEN*SIZE-1), $4
+ and X, (VEC_LEN*SIZE-1), $3
+ or $3, $4, $4
+ bne $4, $UnAlign_ACCESS
+
+/* aligned access path */
+
+$MainLoop:
+ VLD $f10, 0*VEC_LEN*SIZE(Y)
+ VLD $f11, 1*VEC_LEN*SIZE(Y)
+ VLD $f12, 2*VEC_LEN*SIZE(Y)
+ VLD $f13, 3*VEC_LEN*SIZE(Y)
+
+
+ VLD $f20, 0*VEC_LEN*SIZE(X)
+ VLD $f21, 1*VEC_LEN*SIZE(X)
+ VLD $f22, 2*VEC_LEN*SIZE(X)
+ VLD $f23, 3*VEC_LEN*SIZE(X)
+
+
+ fillcs PREFETCHSIZE * SIZE(X)
+ unop
+ fillcs PREFETCHSIZE * SIZE(Y)
+ subl $21, 1, $21
+
+ VST $f10, 0*VEC_LEN*SIZE(X)
+ VST $f11, 1*VEC_LEN*SIZE(X)
+ VST $f12, 2*VEC_LEN*SIZE(X)
+ VST $f13, 3*VEC_LEN*SIZE(X)
+
+ VST $f20, 0*VEC_LEN*SIZE(Y)
+ VST $f21, 1*VEC_LEN*SIZE(Y)
+ VST $f22, 2*VEC_LEN*SIZE(Y)
+ VST $f23, 3*VEC_LEN*SIZE(Y)
+
+ ldi $17, 16*SIZE(X)
+ ldi $19, 16*SIZE(Y)
+ bgt $21, $MainLoop
+ .align 4
+
+$MainRemain:
+ ble $22, $MainEnd
+ .align 4
+
+$MainRemainLoop:
+ LD $f10, 0*SIZE($19)
+ LD $f20, 0*SIZE($17)
+ ldi $17, 1*SIZE($17)
+ ldi $19, 1*SIZE($19)
+ subl $22, 1, $22
+ ST $f10, -1*SIZE($17)
+ ST $f20, -1*SIZE($19)
+ bgt $22, $MainRemainLoop
+ .align 4
+
+$MainEnd:
+ clr $0
+ ret
+ .align 4
+
+$UnAlign_ACCESS:
+ sra $16, 3, $21
+ and $16, 7, $22
+ nop
+ ble $21, $UnAlign_ACCESS_MainRemain
+ .align 4
+$UnAlign_ACCESS_MainLoop:
+ LD $f10, 0*SIZE(Y)
+ LD $f11, 1*SIZE(Y)
+ LD $f12, 2*SIZE(Y)
+ LD $f13, 3*SIZE(Y)
+ LD $f14, 4*SIZE(Y)
+ LD $f15, 5*SIZE(Y)
+ LD $f16, 6*SIZE(Y)
+ LD $f17, 7*SIZE(Y)
+
+ LD $f20, 0*SIZE(X)
+ LD $f21, 1*SIZE(X)
+ LD $f22, 2*SIZE(X)
+ LD $f23, 3*SIZE(X)
+ LD $f24, 4*SIZE(X)
+ LD $f25, 5*SIZE(X)
+ LD $f26, 6*SIZE(X)
+ LD $f27, 7*SIZE(X)
+
+
+ fillcs PREFETCHSIZE * SIZE(X)
+ unop
+ fillcs PREFETCHSIZE * SIZE(Y)
+ subl $21, 1, $21
+
+ ST $f10, 0*SIZE(X)
+ ST $f11, 1*SIZE(X)
+ ST $f12, 2*SIZE(X)
+ ST $f13, 3*SIZE(X)
+ ST $f14, 4*SIZE(X)
+ ST $f15, 5*SIZE(X)
+ ST $f16, 6*SIZE(X)
+ ST $f17, 7*SIZE(X)
+
+ ST $f20, 0*SIZE(Y)
+ ST $f21, 1*SIZE(Y)
+ ST $f22, 2*SIZE(Y)
+ ST $f23, 3*SIZE(Y)
+ ST $f24, 4*SIZE(Y)
+ ST $f25, 5*SIZE(Y)
+ ST $f26, 6*SIZE(Y)
+ ST $f27, 7*SIZE(Y)
+
+ ldi X, 8*SIZE(X)
+ ldi Y, 8*SIZE(Y)
+ bgt $21, $UnAlign_ACCESS_MainLoop
+ .align 4
+
+$UnAlign_ACCESS_MainRemain:
+ ble $22, $UnAlign_ACCESS_MainEnd
+ .align 4
+
+$UnAlign_ACCESS_MainRemainLoop:
+ LD $f10, 0*SIZE(Y)
+ LD $f20, 0*SIZE(X)
+ ldi X, 1*SIZE(X)
+ ldi Y, 1*SIZE(Y)
+ subl $22, 1, $22
+ ST $f10, -1*SIZE(X)
+ ST $f20, -1*SIZE(Y)
+ bgt $22, $UnAlign_ACCESS_MainRemainLoop
+ .align 4
+
+$UnAlign_ACCESS_MainEnd:
+ clr $0
+ ret
+ .align 4
+
+$Sub:
+ sra $16, 3, $21
+ and $16, 7, $22
+ mov $17, $23
+ mov $19, $24
+
+ ble $21, $SubRemain
+ .align 4
+
+$SubLoop:
+ LD $f10, 0*SIZE($19)
+ SXADDQ $20, $19, $19
+ LD $f11, 0*SIZE($19)
+ SXADDQ $20, $19, $19
+
+ LD $f12, 0*SIZE($19)
+ SXADDQ $20, $19, $19
+ LD $f13, 0*SIZE($19)
+ SXADDQ $20, $19, $19
+
+ LD $f14, 0*SIZE($19)
+ SXADDQ $20, $19, $19
+ LD $f15, 0*SIZE($19)
+ SXADDQ $20, $19, $19
+
+ LD $f16, 0*SIZE($19)
+ SXADDQ $20, $19, $19
+ LD $f17, 0*SIZE($19)
+ SXADDQ $20, $19, $19
+
+ LD $f20, 0*SIZE($17)
+ SXADDQ $18, $17, $17
+ LD $f21, 0*SIZE($17)
+ SXADDQ $18, $17, $17
+
+ LD $f22, 0*SIZE($17)
+ SXADDQ $18, $17, $17
+ LD $f23, 0*SIZE($17)
+ SXADDQ $18, $17, $17
+
+ LD $f24, 0*SIZE($17)
+ SXADDQ $18, $17, $17
+ LD $f25, 0*SIZE($17)
+ SXADDQ $18, $17, $17
+
+ LD $f26, 0*SIZE($17)
+ SXADDQ $18, $17, $17
+ LD $f27, 0*SIZE($17)
+ SXADDQ $18, $17, $17
+
+ ST $f10, 0*SIZE($23)
+ SXADDQ $18, $23, $23
+ ST $f11, 0*SIZE($23)
+ SXADDQ $18, $23, $23
+
+ ST $f12, 0*SIZE($23)
+ SXADDQ $18, $23, $23
+ ST $f13, 0*SIZE($23)
+ SXADDQ $18, $23, $23
+
+ ST $f14, 0*SIZE($23)
+ SXADDQ $18, $23, $23
+ ST $f15, 0*SIZE($23)
+ SXADDQ $18, $23, $23
+
+ ST $f16, 0*SIZE($23)
+ SXADDQ $18, $23, $23
+ ST $f17, 0*SIZE($23)
+ SXADDQ $18, $23, $23
+
+ ST $f20, 0*SIZE($24)
+ SXADDQ $20, $24, $24
+ ST $f21, 0*SIZE($24)
+ SXADDQ $20, $24, $24
+
+ ST $f22, 0*SIZE($24)
+ SXADDQ $20, $24, $24
+ ST $f23, 0*SIZE($24)
+ SXADDQ $20, $24, $24
+
+ ST $f24, 0*SIZE($24)
+ SXADDQ $20, $24, $24
+ ST $f25, 0*SIZE($24)
+ SXADDQ $20, $24, $24
+
+ ST $f26, 0*SIZE($24)
+ SXADDQ $20, $24, $24
+ ST $f27, 0*SIZE($24)
+ SXADDQ $20, $24, $24
+
+ subl $21, 1, $21
+ bgt $21, $SubLoop
+ .align 4
+
+$SubRemain:
+ ble $22, $SubEnd
+ .align 4
+
+$SubRemainLoop:
+ LD $f10, 0*SIZE($19)
+ LD $f20, 0*SIZE($17)
+
+ subl $22, 1, $22
+
+ ST $f10, 0*SIZE($17)
+ ST $f20, 0*SIZE($19)
+
+ SXADDQ $18, $17, $17
+ SXADDQ $20, $19, $19
+ bgt $22, $SubRemainLoop
+ .align 4
+
+$SubEnd:
+ clr $0
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/trsm_kernel_4x4_LN.S b/kernel/sw_64/trsm_kernel_4x4_LN.S
new file mode 100644
index 0000000..109c471
--- /dev/null
+++ b/kernel/sw_64/trsm_kernel_4x4_LN.S
@@ -0,0 +1,5144 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#if !defined(EV4) && !defined(EV5) && !defined(SW6)
+#error "Architecture is not specified."
+#endif
+
+#ifdef SW6
+#define PREFETCHSIZE 56
+#define UNOP unop
+#endif
+
+#ifdef EV5
+#define PREFETCHSIZE 56
+#define UNOP
+#endif
+
+#ifdef EV4
+#define UNOP
+#endif
+
+#define STACKSIZE 80
+
+#define M $16
+#define N $17
+#define K $18
+#define A $20
+#define B $21
+#define C $22
+#define LDC $23
+
+#define C1 $19
+#define C2 $24
+#define C3 $25
+#define C4 $27
+
+#define AO $at
+#define BO $5
+#define I $6
+#define J $7
+#define L $8
+
+#define a1 $f16
+#define a2 $f17
+#define a3 $f18
+#define a4 $f19
+
+#define b1 $f20
+#define b2 $f21
+#define b3 $f22
+#define b4 $f23
+
+#define t1 $f24
+#define t2 $f25
+#define t3 $f26
+#define t4 $f27
+
+#define a5 $f28
+#define a6 $f30
+#define b5 $f29
+
+#define alpha $f30
+
+#define c01 $f0
+#define c02 $f1
+#define c03 $f2
+#define c04 $f3
+
+#define c05 $f4
+#define c06 $f5
+#define c07 $f6
+#define c08 $f7
+
+#define c09 $f8
+#define c10 $f9
+#define c11 $f10
+#define c12 $f11
+
+#define c13 $f12
+#define c14 $f13
+#define c15 $f14
+#define c16 $f15
+
+#define TMP1 $0
+#define TMP2 $1
+#define KK $2
+#define AORIG $3
+#define OFFSET $4
+#define tmp $9
+ PROLOGUE
+ PROFCODE
+ .frame $sp, STACKSIZE, $26, 0
+
+ ldi $sp, -STACKSIZE($sp)
+
+ ldl C, 0 + STACKSIZE($sp)
+ ldl LDC, 8 + STACKSIZE($sp)
+ ldl OFFSET, 16 + STACKSIZE($sp)
+
+ SXADDQ LDC, 0, LDC
+
+ fstd $f2, 0($sp)
+ fstd $f3, 8($sp)
+ fstd $f4, 16($sp)
+ fstd $f5, 24($sp)
+ fstd $f6, 32($sp)
+ fstd $f7, 40($sp)
+ fstd $f8, 48($sp)
+ fstd $f9, 56($sp)
+ stl tmp, 64($sp)
+ cmple M, 0, $0
+ cmple N, 0, $1
+ cmple K, 0, $2
+
+ or $0, $1, $0
+ or $0, $2, $0
+ bne $0, $L999
+
+#ifdef LN
+ mull M, K, TMP1
+ SXADDQ TMP1, A, A
+ SXADDQ M, C, C
+#endif
+
+#ifdef RN
+ negq OFFSET, KK
+#endif
+
+#ifdef RT
+ mulq N, K, TMP1
+ SXADDQ TMP1, B, B
+
+ mulq N, LDC, TMP1
+ addl TMP1, C, C
+
+ subl N, OFFSET, KK
+#endif
+
+ sra N, 2, J
+ ble J, $L40
+ .align 4
+
+$L01:
+#ifdef RT
+ sll K, 2 + BASE_SHIFT, TMP1
+ subl B, TMP1, B
+
+ s4addl LDC, 0, TMP1
+ subl C, TMP1, C
+#endif
+
+ mov C, C1
+ addl C, LDC, C2
+ addl C2, LDC, C3
+#ifndef RT
+ s4addl LDC, C, C
+#endif
+
+ fclr t1
+ addl C3, LDC, C4
+ fclr t2
+
+#ifdef LN
+ addl M, OFFSET, KK
+#endif
+
+#ifdef LT
+ mov OFFSET, KK
+#endif
+
+#if defined(LN) || defined(RT)
+ mov A, AORIG
+#else
+ mov A, AO
+#endif
+
+ fclr t3
+ fclr t4
+
+ and M, 1, I
+ ble I, $L20
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+
+ LD b1, 0 * SIZE(B)
+ ldi L, -2(KK)
+ LD b2, 1 * SIZE(B)
+ ldi AO, 1 * SIZE(AO)
+
+ LD b3, 2 * SIZE(B)
+ fclr c09
+ LD b4, 3 * SIZE(B)
+ fclr c13
+
+ ldi BO, 4 * SIZE(B)
+ ble KK, $L38
+
+ ble L, $L35
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 0, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 2, TMP2
+ addl B, TMP2, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+
+ LD b1, 0 * SIZE(BO)
+ ldi L, -2(TMP1)
+ LD b2, 1 * SIZE(BO)
+ ldi AO, 1 * SIZE(AO)
+
+ LD b3, 2 * SIZE(BO)
+ fclr c09
+ LD b4, 3 * SIZE(BO)
+ fclr c13
+
+ ldi BO, 4 * SIZE(BO)
+ ble TMP1, $L38
+
+ ble L, $L35
+#endif
+ .align 4
+
+$L32:
+ ADD c01, t1, b5
+ fmov b5, c01
+ ldi L, -2(L)
+ MUL a1, b1, t1
+ LD b1, 0 * SIZE(BO)
+
+ ADD c05, t2, b5
+ fmov b5, c05
+ ldi AO, 2 * SIZE(AO)
+ MUL a1, b2, t2
+ LD b2, 1 * SIZE(BO)
+
+ ADD c09, t3, b5
+ fmov b5, c09
+ LD b5, 3 * SIZE(BO)
+ FIMOVD b5, tmp
+ MUL a1, b3, t3
+ LD b3, 2 * SIZE(BO)
+
+ ADD c13, t4, b5
+ fmov b5, c13
+ MUL a1, b4, t4
+ LD a1, -1 * SIZE(AO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL a2, b1, t1
+ LD b1, 4 * SIZE(BO)
+ ldi BO, 8 * SIZE(BO)
+
+ ADD c05, t2, b5
+ fmov b5, c05
+ MUL a2, b2, t2
+ LD b2, -3 * SIZE(BO)
+
+ ADD c09, t3, b5
+ fmov b5, c09
+ LD b4, -1 * SIZE(BO)
+ MUL a2, b3, t3
+ LD b3, -2 * SIZE(BO)
+
+ ADD c13, t4, b5
+ fmov b5, c13
+ IFMOVD tmp, b5
+ MUL a2, b5, t4
+ LD a2, 0 * SIZE(AO)
+ bgt L, $L32
+ .align 4
+
+$L35:
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL a1, b1, b5
+ fmov b5, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L37
+#else
+ blbs TMP1, $L37
+#endif
+ .align 4
+
+ ADD c05, t2, b5
+ fmov b5, c05
+ LD b1, 0 * SIZE(BO)
+ MUL a1, b2, b5
+ fmov b5, t2
+ LD b2, 1 * SIZE(BO)
+
+ ADD c09, t3, b5
+ fmov b5, c09
+ MUL a1, b3, b5
+ fmov b5, t3
+ LD b3, 2 * SIZE(BO)
+
+ ADD c13, t4, b5
+ fmov b5, c13
+ MUL a1, b4, b5
+ fmov b5, t4
+ LD a1, 0 * SIZE(AO)
+ ldi AO, 1 * SIZE(AO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ LD b4, 3 * SIZE(BO)
+ MUL a1, b1, b5
+ fmov b5, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L37:
+ ADD c05, t2, b5
+ fmov b5, c05
+ MUL a1, b2, b5
+ fmov b5, t2
+ ADD c09, t3, b5
+ fmov b5, c09
+ MUL a1, b3, b5
+ fmov b5, t3
+
+ ADD c13, t4, b5
+ fmov b5, c13
+ ldi AO, 1 * SIZE(AO)
+ MUL a1, b4, b5
+ fmov b5, t4
+ ldi BO, 4 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ ADD c05, t2, b5
+ fmov b5, c05
+ ADD c09, t3, b5
+ fmov b5, c09
+ ADD c13, t4, b5
+ fmov b5, c13
+
+$L38:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 1, TMP1
+#else
+ subl KK, 4, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -1 * SIZE(AO)
+ ldi BO, -4 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c05, b5
+ fmov b5, c05
+ SUB a3, c09, b5
+ fmov b5, c09
+ SUB a4, c13, b5
+ fmov b5, c13
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c05, b5
+ fmov b5, c05
+ SUB a3, c09, b5
+ fmov b5, c09
+ SUB a4, c13, b5
+ fmov b5, c13
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(AO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c05, b5
+ fmov b5, c05
+ MUL a1, c09, b5
+ fmov b5, c09
+ MUL a1, c13, b5
+ fmov b5, c13
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a2, c01, b5
+ fmov b5, t1
+ SUB c05, t1, b5
+ fmov b5, c05
+ MUL a3, c01, b5
+ fmov b5, t1
+ SUB c09, t1, b5
+ fmov b5, c09
+ MUL a4, c01, b5
+ fmov b5, t1
+ SUB c13, t1, b5
+ fmov b5, c13
+
+ LD b1, 5 * SIZE(BO)
+ LD b2, 6 * SIZE(BO)
+ LD b3, 7 * SIZE(BO)
+
+ MUL b1, c05, b5
+ fmov b5, c05
+ MUL b2, c05, b5
+ fmov b5, t1
+ SUB c09, t1, b5
+ fmov b5, c09
+ MUL b3, c05, b5
+ fmov b5, t1
+ SUB c13, t1, b5
+ fmov b5, c13
+
+ LD a1, 10 * SIZE(BO)
+ LD a2, 11 * SIZE(BO)
+ LD a3, 15 * SIZE(BO)
+
+ MUL a1, c09, b5
+ fmov b5, c09
+ MUL a2, c09, b5
+ fmov b5, t1
+ SUB c13, t1, b5
+ fmov b5, c13
+ MUL a3, c13, b5
+ fmov b5, c13
+#endif
+
+#ifdef RT
+ LD a1, 15 * SIZE(BO)
+ LD a2, 14 * SIZE(BO)
+ LD a3, 13 * SIZE(BO)
+ LD a4, 12 * SIZE(BO)
+
+ MUL a1, c13, b5
+ fmov b5, c13
+ MUL a2, c13, b5
+ fmov b5, t1
+ SUB c09, t1, b5
+ fmov b5, c09
+ MUL a3, c13, b5
+ fmov b5, t1
+ SUB c05, t1, b5
+ fmov b5, c05
+ MUL a4, c13, b5
+ fmov b5, t1
+ SUB c01, t1, b5
+ fmov b5, c01
+
+ LD b1, 10 * SIZE(BO)
+ LD b2, 9 * SIZE(BO)
+ LD b3, 8 * SIZE(BO)
+
+ MUL b1, c09, b5
+ fmov b5, c09
+ MUL b2, c09, b5
+ fmov b5, t1
+ SUB c05, t1, b5
+ fmov b5, c05
+ MUL b3, c09, b5
+ fmov b5, t1
+ SUB c01, t1, b5
+ fmov b5, c01
+
+ LD a1, 5 * SIZE(BO)
+ LD a2, 4 * SIZE(BO)
+ LD a3, 0 * SIZE(BO)
+
+ MUL a1, c05, b5
+ fmov b5, c05
+ MUL a2, c05, b5
+ fmov b5, t1
+ SUB c01, t1, b5
+ fmov b5, c01
+ MUL a3, c01, b5
+ fmov b5, c01
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c05, 1 * SIZE(BO)
+ ST c09, 2 * SIZE(BO)
+ ST c13, 3 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c05, 1 * SIZE(AO)
+ ST c09, 2 * SIZE(AO)
+ ST c13, 3 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -1 * SIZE(C1)
+ ldi C2, -1 * SIZE(C2)
+ ldi C3, -1 * SIZE(C3)
+ ldi C4, -1 * SIZE(C4)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c05, 0 * SIZE(C2)
+ ST c09, 0 * SIZE(C3)
+ ST c13, 0 * SIZE(C4)
+
+#ifdef RT
+ sll K, 0 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 1, KK
+#endif
+
+#ifdef LN
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L20:
+ and M, 2, I
+ ble I, $L30
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c10
+ LD a4, 3 * SIZE(AO)
+ fclr c14
+
+ LD b1, 0 * SIZE(B)
+ ldi L, -2(KK)
+ LD b2, 1 * SIZE(B)
+ ldi AO, 2 * SIZE(AO)
+
+ LD b3, 2 * SIZE(B)
+ fclr c01
+ LD b4, 3 * SIZE(B)
+ fclr c05
+
+ ldi BO, 4 * SIZE(B)
+ fclr c02
+ fclr c06
+ ble KK, $L28
+
+ ble L, $L25
+
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 1, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 2, TMP2
+ addl B, TMP2, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c10
+ LD a4, 3 * SIZE(AO)
+ fclr c14
+
+ LD b1, 0 * SIZE(BO)
+ ldi L, -2(TMP1)
+ LD b2, 1 * SIZE(BO)
+ ldi AO, 2 * SIZE(AO)
+
+ LD b3, 2 * SIZE(BO)
+ fclr c01
+ LD b4, 3 * SIZE(BO)
+ fclr c05
+
+ ldi BO, 4 * SIZE(BO)
+ fclr c02
+ fclr c06
+ ble TMP1, $L28
+
+ ble L, $L25
+#endif
+ .align 4
+
+$L22:
+ ADD c09, t1, b5
+ fmov b5, c09
+ unop
+ MUL a1, b1, t1
+ unop
+
+ ADD c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD c13, t3, b5
+ fmov b5, c13
+ unop
+ MUL a1, b2, t3
+ ldi BO, 8 * SIZE(BO)
+
+ ADD c14, t4, b5
+ fmov b5, c14
+ unop
+ MUL a2, b2, t4
+ LD b2, -7 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b3, t1
+ unop
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b3, t2
+ LD b3, -6 * SIZE(BO)
+
+ ADD c05, t3, b5
+ fmov b5, c05
+ unop
+ MUL a1, b4, t3
+ LD a1, 2 * SIZE(AO)
+
+ ADD c06, t4, b5
+ fmov b5, c06
+ MUL a2, b4, t4
+ LD b5, -5 * SIZE(BO)
+ FIMOVD b5, tmp
+
+ ADD c09, t1, b5
+ fmov b5, c09
+ unop
+ MUL a3, b1, t1
+ LD a2, 3 * SIZE(AO)
+
+ ADD c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL a4, b1, t2
+ LD b1, -4 * SIZE(BO)
+
+ ADD c13, t3, b5
+ fmov b5, c13
+ unop
+ MUL a3, b2, t3
+ ldi AO, 4 * SIZE(AO)
+
+ ADD c14, t4, b5
+ fmov b5, c14
+ MUL a4, b2, t4
+ LD b2, -3 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ ldi L, -2(L)
+ MUL a3, b3, t1
+ LD b4, -1 * SIZE(BO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a4, b3, t2
+ LD b3, -2 * SIZE(BO)
+
+ ADD c05, t3, b5
+ fmov b5, c05
+ unop
+ IFMOVD tmp, b5
+ MUL a3, b5, t3
+ LD a3, 0 * SIZE(AO)
+
+ ADD c06, t4, b5
+ fmov b5, c06
+ IFMOVD tmp, b5
+ MUL a4, b5, t4
+ LD a4, 1 * SIZE(AO)
+ bgt L, $L22
+ .align 4
+
+$L25:
+ ADD c09, t1, b5
+ fmov b5, c09
+ MUL a1, b1, b5
+ fmov b5, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L27
+#else
+ blbs TMP1, $L27
+#endif
+
+ ADD c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL a2, b1, b5
+ fmov b5, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD c13, t3, b5
+ fmov b5, c13
+ unop
+ MUL a1, b2, b5
+ fmov b5, t3
+ unop
+
+ ADD c14, t4, b5
+ fmov b5, c14
+ unop
+ MUL a2, b2, b5
+ fmov b5, t4
+ LD b2, 1 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b3, b5
+ fmov b5, t1
+ ldi AO, 2 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b3, b5
+ fmov b5, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD c05, t3, b5
+ fmov b5, c05
+ unop
+ MUL a1, b4, b5
+ fmov b5, t3
+ LD a1, -2 * SIZE(AO)
+
+ ADD c06, t4, b5
+ fmov b5, c06
+ unop
+ MUL a2, b4, b5
+ fmov b5, t4
+ LD a2, -1 * SIZE(AO)
+
+ ADD c09, t1, b5
+ fmov b5, c09
+ LD b4, 3 * SIZE(BO)
+ MUL a1, b1, b5
+ fmov b5, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L27:
+ ADD c10, t2, b5
+ fmov b5, c10
+ MUL a2, b1, b5
+ fmov b5, t2
+ ADD c13, t3, b5
+ fmov b5, c13
+ MUL a1, b2, b5
+ fmov b5, t3
+
+ ADD c14, t4, b5
+ fmov b5, c14
+ MUL a2, b2, b5
+ fmov b5, t4
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL a1, b3, b5
+ fmov b5, t1
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ MUL a2, b3, b5
+ fmov b5, t2
+ ADD c05, t3, b5
+ fmov b5, c05
+ MUL a1, b4, b5
+ fmov b5, t3
+
+ ADD c06, t4, b5
+ fmov b5, c06
+ ldi AO, 2 * SIZE(AO)
+ MUL a2, b4, b5
+ fmov b5, t4
+ ldi BO, 4 * SIZE(BO)
+
+ ADD c09, t1, b5
+ fmov b5, c09
+ ADD c10, t2, b5
+ fmov b5, c10
+ ADD c13, t3, b5
+ fmov b5, c13
+ ADD c14, t4, b5
+ fmov b5, c14
+ .align 4
+
+$L28:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 2, TMP1
+#else
+ subl KK, 4, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -2 * SIZE(AO)
+ ldi BO, -4 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ LD b1, 4 * SIZE(BO)
+ LD b2, 5 * SIZE(BO)
+ LD b3, 6 * SIZE(BO)
+ LD b4, 7 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c05, b5
+ fmov b5, c05
+ SUB a3, c09, b5
+ fmov b5, c09
+ SUB a4, c13, b5
+ fmov b5, c13
+
+ SUB b1, c02, b5
+ fmov b5, c02
+ SUB b2, c06, b5
+ fmov b5, c06
+ SUB b3, c10, b5
+ fmov b5, c10
+ SUB b4, c14, b5
+ fmov b5, c14
+
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 4 * SIZE(AO)
+ LD b2, 5 * SIZE(AO)
+ LD b3, 6 * SIZE(AO)
+ LD b4, 7 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+ SUB a3, c05, b5
+ fmov b5, c05
+ SUB a4, c06, b5
+ fmov b5, c06
+
+ SUB b1, c09, b5
+ fmov b5, c09
+ SUB b2, c10, b5
+ fmov b5, c10
+ SUB b3, c13, b5
+ fmov b5, c13
+ SUB b4, c14, b5
+ fmov b5, c14
+#endif
+
+#ifdef LN
+ LD a1, 3 * SIZE(AO)
+ LD a2, 2 * SIZE(AO)
+ LD a3, 0 * SIZE(AO)
+
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a1, c06, b5
+ fmov b5, c06
+ MUL a1, c10, b5
+ fmov b5, c10
+ MUL a1, c14, b5
+ fmov b5, c14
+
+ MUL a2, c02, b5
+ fmov b5, t1
+ MUL a2, c06, b5
+ fmov b5, t2
+ MUL a2, c10, b5
+ fmov b5, t3
+ MUL a2, c14, b5
+ fmov b5, t4
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c05, t2, b5
+ fmov b5, c05
+ SUB c09, t3, b5
+ fmov b5, c09
+ SUB c13, t4, b5
+ fmov b5, c13
+
+ MUL a3, c01, b5
+ fmov b5, c01
+ MUL a3, c05, b5
+ fmov b5, c05
+ MUL a3, c09, b5
+ fmov b5, c09
+ MUL a3, c13, b5
+ fmov b5, c13
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 3 * SIZE(AO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c05, b5
+ fmov b5, c05
+ MUL a1, c09, b5
+ fmov b5, c09
+ MUL a1, c13, b5
+ fmov b5, c13
+
+ MUL a2, c01, b5
+ fmov b5, t1
+ MUL a2, c05, b5
+ fmov b5, t2
+ MUL a2, c09, b5
+ fmov b5, t3
+ MUL a2, c13, b5
+ fmov b5, t4
+
+ SUB c02, t1, b5
+ fmov b5, c02
+ SUB c06, t2, b5
+ fmov b5, c06
+ SUB c10, t3, b5
+ fmov b5, c10
+ SUB c14, t4, b5
+ fmov b5, c14
+
+ MUL a3, c02, b5
+ fmov b5, c02
+ MUL a3, c06, b5
+ fmov b5, c06
+ MUL a3, c10, b5
+ fmov b5, c10
+ MUL a3, c14, b5
+ fmov b5, c14
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+
+ MUL a2, c01, b5
+ fmov b5, t1
+ MUL a2, c02, b5
+ fmov b5, t2
+
+ SUB c05, t1, b5
+ fmov b5, c05
+ SUB c06, t2, b5
+ fmov b5, c06
+
+ MUL a3, c01, b5
+ fmov b5, t1
+ MUL a3, c02, b5
+ fmov b5, t2
+
+ SUB c09, t1, b5
+ fmov b5, c09
+ SUB c10, t2, b5
+ fmov b5, c10
+
+ MUL a4, c01, b5
+ fmov b5, t1
+ MUL a4, c02, b5
+ fmov b5, t2
+
+ SUB c13, t1, b5
+ fmov b5, c13
+ SUB c14, t2, b5
+ fmov b5, c14
+
+ LD b1, 5 * SIZE(BO)
+ LD b2, 6 * SIZE(BO)
+ LD b3, 7 * SIZE(BO)
+
+ MUL b1, c05, b5
+ fmov b5, c05
+ MUL b1, c06, b5
+ fmov b5, c06
+
+ MUL b2, c05, b5
+ fmov b5, t1
+ MUL b2, c06, b5
+ fmov b5, t2
+
+ SUB c09, t1, b5
+ fmov b5, c09
+ SUB c10, t2, b5
+ fmov b5, c10
+
+ MUL b3, c05, b5
+ fmov b5, t1
+ MUL b3, c06, b5
+ fmov b5, t2
+
+ SUB c13, t1, b5
+ fmov b5, c13
+ SUB c14, t2, b5
+ fmov b5, c14
+
+ LD a1, 10 * SIZE(BO)
+ LD a2, 11 * SIZE(BO)
+ LD a3, 15 * SIZE(BO)
+
+ MUL a1, c09, b5
+ fmov b5, c09
+ MUL a1, c10, b5
+ fmov b5, c10
+
+ MUL a2, c09, b5
+ fmov b5, t1
+ MUL a2, c10, b5
+ fmov b5, t2
+
+ SUB c13, t1, b5
+ fmov b5, c13
+ SUB c14, t2, b5
+ fmov b5, c14
+
+ MUL a3, c13, b5
+ fmov b5, c13
+ MUL a3, c14, b5
+ fmov b5, c14
+#endif
+
+#ifdef RT
+ LD a1, 15 * SIZE(BO)
+ LD a2, 14 * SIZE(BO)
+ LD a3, 13 * SIZE(BO)
+ LD a4, 12 * SIZE(BO)
+
+ MUL a1, c13, b5
+ fmov b5, c13
+ MUL a1, c14, b5
+ fmov b5, c14
+
+ MUL a2, c13, b5
+ fmov b5, t1
+ MUL a2, c14, b5
+ fmov b5, t2
+
+ SUB c09, t1, b5
+ fmov b5, c09
+ SUB c10, t2, b5
+ fmov b5, c10
+
+ MUL a3, c13, b5
+ fmov b5, t1
+ MUL a3, c14, b5
+ fmov b5, t2
+
+ SUB c05, t1, b5
+ fmov b5, c05
+ SUB c06, t2, b5
+ fmov b5, c06
+
+ MUL a4, c13, b5
+ fmov b5, t1
+ MUL a4, c14, b5
+ fmov b5, t2
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c02, t2, b5
+ fmov b5, c02
+
+ LD b1, 10 * SIZE(BO)
+ LD b2, 9 * SIZE(BO)
+ LD b3, 8 * SIZE(BO)
+
+ MUL b1, c09, b5
+ fmov b5, c09
+ MUL b1, c10, b5
+ fmov b5, c10
+
+ MUL b2, c09, b5
+ fmov b5, t1
+ MUL b2, c10, b5
+ fmov b5, t2
+
+ SUB c05, t1, b5
+ fmov b5, c05
+ SUB c06, t2, b5
+ fmov b5, c06
+
+ MUL b3, c09, b5
+ fmov b5, t1
+ MUL b3, c10, b5
+ fmov b5, t2
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c02, t2, b5
+ fmov b5, c02
+
+ LD a1, 5 * SIZE(BO)
+ LD a2, 4 * SIZE(BO)
+ LD a3, 0 * SIZE(BO)
+
+ MUL a1, c05, b5
+ fmov b5, c05
+ MUL a1, c06, b5
+ fmov b5, c06
+
+ MUL a2, c05, b5
+ fmov b5, t1
+ MUL a2, c06, b5
+ fmov b5, t2
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c02, t2, b5
+ fmov b5, c02
+
+ MUL a3, c01, b5
+ fmov b5, c01
+ MUL a3, c02, b5
+ fmov b5, c02
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c05, 1 * SIZE(BO)
+ ST c09, 2 * SIZE(BO)
+ ST c13, 3 * SIZE(BO)
+
+ ST c02, 4 * SIZE(BO)
+ ST c06, 5 * SIZE(BO)
+ ST c10, 6 * SIZE(BO)
+ ST c14, 7 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c05, 2 * SIZE(AO)
+ ST c06, 3 * SIZE(AO)
+
+ ST c09, 4 * SIZE(AO)
+ ST c10, 5 * SIZE(AO)
+ ST c13, 6 * SIZE(AO)
+ ST c14, 7 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -2 * SIZE(C1)
+ ldi C2, -2 * SIZE(C2)
+ ldi C3, -2 * SIZE(C3)
+ ldi C4, -2 * SIZE(C4)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c05, 0 * SIZE(C2)
+ ST c06, 1 * SIZE(C2)
+
+ ST c09, 0 * SIZE(C3)
+ ST c10, 1 * SIZE(C3)
+ ST c13, 0 * SIZE(C4)
+ ST c14, 1 * SIZE(C4)
+
+#ifndef LN
+ ldi C1, 2 * SIZE(C1)
+ ldi C2, 2 * SIZE(C2)
+ ldi C3, 2 * SIZE(C3)
+ ldi C4, 2 * SIZE(C4)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 1 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 2, KK
+#endif
+
+#ifdef LN
+ subl KK, 2, KK
+#endif
+ .align 4
+
+$L30:
+ sra M, 2, I
+ ble I, $L39
+ .align 4
+
+$L11:
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c11
+ LD a2, 1 * SIZE(AO)
+ fclr c12
+
+ LD a3, 2 * SIZE(AO)
+ fclr c16
+ LD a4, 3 * SIZE(AO)
+ fclr c15
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c02
+
+ LD b3, 2 * SIZE(B)
+ fclr c06
+ LD b4, 3 * SIZE(B)
+ fclr c05
+
+ fillcs 4 * SIZE(C1)
+ fclr c03
+ ldi L, -2(KK)
+ fclr c04
+
+ fillcs 7 * SIZE(C2)
+ fclr c08
+ ldi BO, 4 * SIZE(B)
+ fclr c13
+
+ fillcs 4 * SIZE(C3)
+ fclr c09
+ ldi AO, 4 * SIZE(AO)
+ fclr c10
+
+ fillcs 7 * SIZE(C4)
+ fclr c14
+ fclr c07
+ ble KK, $L18
+#else
+
+#ifdef LN
+ sll K, BASE_SHIFT + 2, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 2, TMP1
+ addl AORIG, TMP1, AO
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c11
+ LD a2, 1 * SIZE(AO)
+ fclr c12
+
+ LD a3, 2 * SIZE(AO)
+ fclr c16
+ LD a4, 3 * SIZE(AO)
+ fclr c15
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c02
+
+ LD b3, 2 * SIZE(BO)
+ fclr c06
+ LD b4, 3 * SIZE(BO)
+ fclr c05
+
+ fillcs 4 * SIZE(C1)
+ fclr c03
+ ldi L, -2(TMP1)
+ fclr c04
+
+ fillcs 7 * SIZE(C2)
+ fclr c08
+ ldi BO, 4 * SIZE(BO)
+ fclr c13
+
+ fillcs 4 * SIZE(C3)
+ fclr c09
+ ldi AO, 4 * SIZE(AO)
+ fclr c10
+
+ fillcs 7 * SIZE(C4)
+ fclr c14
+ fclr c07
+ ble TMP1, $L18
+#endif
+
+ ble L, $L15
+ .align 5
+
+$L12:
+/* 1 */
+ ADD c11, t1, b5
+ fmov b5, c11
+#ifndef EV4
+ fillcs PREFETCHSIZE * SIZE(AO)
+#else
+ unop
+#endif
+ MUL b1, a1, t1
+#ifndef EV4
+ fillcs PREFETCHSIZE * SIZE(BO)
+#else
+ unop
+#endif
+
+ ADD c12, t2, b5
+ fmov b5, c12
+ unop
+ MUL b1, a2, t2
+ unop
+
+ ADD c16, t3, b5
+ fmov b5, c16
+ unop
+ MUL b2, a2, t3
+ LD a5, 0 * SIZE(AO)
+
+ ADD c15, t4, b5
+ fmov b5, c15
+ unop
+ MUL b2, a1, t4
+ LD b5, 0 * SIZE(BO)
+ FIMOVD b5, tmp
+
+/* 2 */
+ ADD c01, t1, b5
+ fmov b5, c01
+ UNOP
+ MUL b1, a3, t1
+ UNOP
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ UNOP
+ MUL b1, a4, t2
+ UNOP
+
+ ADD c06, t3, b5
+ fmov b5, c06
+ unop
+ MUL b2, a4, t3
+ unop
+
+ ADD c05, t4, b5
+ fmov b5, c05
+ unop
+ MUL b4, a1, t4
+ unop
+
+/* 3 */
+ ADD c03, t1, b5
+ fmov b5, c03
+ unop
+ MUL b3, a1, t1
+ unop
+
+ ADD c04, t2, b5
+ fmov b5, c04
+ unop
+ MUL b3, a2, t2
+ unop
+
+ ADD c08, t3, b5
+ fmov b5, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, 1 * SIZE(AO)
+
+ ADD c13, t4, b5
+ fmov b5, c13
+ unop
+ MUL b2, a3, t4
+ LD b2, 1 * SIZE(BO)
+
+/* 4 */
+ ADD c09, t1, b5
+ fmov b5, c09
+ unop
+ MUL b3, a3, t1
+ LD a6, 2 * SIZE(AO)
+
+ ADD c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD c14, t3, b5
+ fmov b5, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, 3 * SIZE(AO)
+
+ ADD c07, t4, b5
+ fmov b5, c07
+ unop
+ MUL b4, a3, t4
+ LD b4, 3 * SIZE(BO)
+
+/* 5 */
+ ADD c11, t1, b5
+ fmov b5, c11
+ unop
+ IFMOVD tmp, b5
+ MUL b5, a5, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD c12, t2, b5
+ fmov b5, c12
+ ldi L, -2(L)
+ IFMOVD tmp, b5
+ MUL b5, a2, t2
+ LD b1, 4 * SIZE(BO)
+
+ ADD c16, t3, b5
+ fmov b5, c16
+ unop
+ MUL b2, a2, t3
+ unop
+
+ ADD c15, t4, b5
+ fmov b5, c15
+ unop
+ MUL b2, a5, t4
+ unop
+
+/* 6 */
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ IFMOVD tmp, b5
+ MUL b5, a6, t1
+ unop
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ IFMOVD tmp, b5
+ MUL b5, a4, t2
+ unop
+
+ ADD c06, t3, b5
+ fmov b5, c06
+ unop
+ MUL b2, a4, t3
+ unop
+
+ ADD c05, t4, b5
+ fmov b5, c05
+ unop
+ MUL b4, a5, t4
+ unop
+
+/* 7 */
+ ADD c03, t1, b5
+ fmov b5, c03
+ ldi AO, 8 * SIZE(AO)
+ MUL b3, a5, t1
+ unop
+
+ ADD c04, t2, b5
+ fmov b5, c04
+ ldi BO, 8 * SIZE(BO)
+ MUL b3, a2, t2
+ unop
+
+ ADD c08, t3, b5
+ fmov b5, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, -3 * SIZE(AO)
+
+ ADD c13, t4, b5
+ fmov b5, c13
+ unop
+ MUL b2, a6, t4
+ LD b2, -3 * SIZE(BO)
+
+/* 8 */
+ ADD c09, t1, b5
+ fmov b5, c09
+ unop
+ MUL b3, a6, t1
+ LD a3, -2 * SIZE(AO)
+
+ ADD c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, -2 * SIZE(BO)
+
+ ADD c14, t3, b5
+ fmov b5, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, -1 * SIZE(AO)
+
+ ADD c07, t4, b5
+ fmov b5, c07
+ MUL b4, a6, t4
+ LD b4, -1 * SIZE(BO)
+ bgt L, $L12
+ .align 4
+
+$L15:
+ ADD c11, t1, b5
+ fmov b5, c11
+ MUL b1, a1, b5
+ fmov b5, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L17
+#else
+ blbs TMP1, $L17
+#endif
+ .align 4
+
+ ADD c12, t2, b5
+ fmov b5, c12
+ MUL b1, a2, b5
+ fmov b5, t2
+ ADD c16, t3, b5
+ fmov b5, c16
+ MUL b2, a2, b5
+ fmov b5, t3
+
+ ADD c15, t4, b5
+ fmov b5, c15
+ MUL b2, a1, b5
+ fmov b5, t4
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL b1, a3, b5
+ fmov b5, t1
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL b1, a4, b5
+ fmov b5, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD c06, t3, b5
+ fmov b5, c06
+ MUL b2, a4, b5
+ fmov b5, t3
+ ADD c05, t4, b5
+ fmov b5, c05
+ MUL b4, a1, b5
+ fmov b5, t4
+
+ ADD c03, t1, b5
+ fmov b5, c03
+ unop
+ MUL b3, a1, b5
+ fmov b5, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD c04, t2, b5
+ fmov b5, c04
+ unop
+ MUL b3, a2, b5
+ fmov b5, t2
+ unop
+
+ ADD c08, t3, b5
+ fmov b5, c08
+ unop
+ MUL b4, a2, b5
+ fmov b5, t3
+ LD a2, 1 * SIZE(AO)
+
+ ADD c13, t4, b5
+ fmov b5, c13
+ unop
+ MUL b2, a3, b5
+ fmov b5, t4
+ LD b2, 1 * SIZE(BO)
+
+ ADD c09, t1, b5
+ fmov b5, c09
+ unop
+ MUL b3, a3, b5
+ fmov b5, t1
+ ldi AO, 4 * SIZE(AO)
+
+ ADD c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL b3, a4, b5
+ fmov b5, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD c14, t3, b5
+ fmov b5, c14
+ unop
+ MUL b4, a4, b5
+ fmov b5, t3
+ LD a4, -1 * SIZE(AO)
+
+ ADD c07, t4, b5
+ fmov b5, c07
+ unop
+ MUL b4, a3, b5
+ fmov b5, t4
+ LD a3, -2 * SIZE(AO)
+
+ ADD c11, t1, b5
+ fmov b5, c11
+ LD b4, 3 * SIZE(BO)
+ MUL b1, a1, b5
+ fmov b5, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L17:
+ ADD c12, t2, b5
+ fmov b5, c12
+ MUL b1, a2, b5
+ fmov b5, t2
+ ADD c16, t3, b5
+ fmov b5, c16
+ MUL b2, a2, b5
+ fmov b5, t3
+
+ ADD c15, t4, b5
+ fmov b5, c15
+ MUL b2, a1, b5
+ fmov b5, t4
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL b1, a3, b5
+ fmov b5, t1
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ MUL b1, a4, b5
+ fmov b5, t2
+ ADD c06, t3, b5
+ fmov b5, c06
+ MUL b2, a4, b5
+ fmov b5, t3
+
+ ADD c05, t4, b5
+ fmov b5, c05
+ MUL b4, a1, b5
+ fmov b5, t4
+ ADD c03, t1, b5
+ fmov b5, c03
+ MUL b3, a1, b5
+ fmov b5, t1
+
+ ADD c04, t2, b5
+ fmov b5, c04
+ MUL b3, a2, b5
+ fmov b5, t2
+ ADD c08, t3, b5
+ fmov b5, c08
+ MUL b4, a2, b5
+ fmov b5, t3
+
+ ADD c13, t4, b5
+ fmov b5, c13
+ MUL b2, a3, b5
+ fmov b5, t4
+ ADD c09, t1, b5
+ fmov b5, c09
+ MUL b3, a3, b5
+ fmov b5, t1
+
+ ADD c10, t2, b5
+ fmov b5, c10
+ MUL b3, a4, b5
+ fmov b5, t2
+ ADD c14, t3, b5
+ fmov b5, c14
+ MUL b4, a4, b5
+ fmov b5, t3
+
+ ADD c07, t4, b5
+ fmov b5, c07
+ ldi AO, 4 * SIZE(AO)
+ MUL b4, a3, b5
+ fmov b5, t4
+ ldi BO, 4 * SIZE(BO)
+
+ ADD c11, t1, b5
+ fmov b5, c11
+ ADD c12, t2, b5
+ fmov b5, c12
+ ADD c16, t3, b5
+ fmov b5, c16
+ ADD c15, t4, b5
+ fmov b5, c15
+ .align 4
+
+$L18:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 4, TMP1
+#else
+ subl KK, 4, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -4 * SIZE(AO)
+ ldi BO, -4 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ LD b1, 4 * SIZE(BO)
+ LD b2, 5 * SIZE(BO)
+ LD b3, 6 * SIZE(BO)
+ LD b4, 7 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c05, b5
+ fmov b5, c05
+ SUB a3, c09, b5
+ fmov b5, c09
+ SUB a4, c13, b5
+ fmov b5, c13
+
+ SUB b1, c02, b5
+ fmov b5, c02
+ SUB b2, c06, b5
+ fmov b5, c06
+ SUB b3, c10, b5
+ fmov b5, c10
+ SUB b4, c14, b5
+ fmov b5, c14
+
+ LD a1, 8 * SIZE(BO)
+ LD a2, 9 * SIZE(BO)
+ LD a3, 10 * SIZE(BO)
+ LD a4, 11 * SIZE(BO)
+
+ LD b1, 12 * SIZE(BO)
+ LD b2, 13 * SIZE(BO)
+ LD b3, 14 * SIZE(BO)
+ LD b4, 15 * SIZE(BO)
+
+ SUB a1, c03, b5
+ fmov b5, c03
+ SUB a2, c07, b5
+ fmov b5, c07
+ SUB a3, c11, b5
+ fmov b5, c11
+ SUB a4, c15, b5
+ fmov b5, c15
+
+ SUB b1, c04, b5
+ fmov b5, c04
+ SUB b2, c08, b5
+ fmov b5, c08
+ SUB b3, c12, b5
+ fmov b5, c12
+ SUB b4, c16, b5
+ fmov b5, c16
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 4 * SIZE(AO)
+ LD b2, 5 * SIZE(AO)
+ LD b3, 6 * SIZE(AO)
+ LD b4, 7 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+ SUB a3, c03, b5
+ fmov b5, c03
+ SUB a4, c04, b5
+ fmov b5, c04
+
+ SUB b1, c05, b5
+ fmov b5, c05
+ SUB b2, c06, b5
+ fmov b5, c06
+ SUB b3, c07, b5
+ fmov b5, c07
+ SUB b4, c08, b5
+ fmov b5, c08
+
+ LD a1, 8 * SIZE(AO)
+ LD a2, 9 * SIZE(AO)
+ LD a3, 10 * SIZE(AO)
+ LD a4, 11 * SIZE(AO)
+
+ LD b1, 12 * SIZE(AO)
+ LD b2, 13 * SIZE(AO)
+ LD b3, 14 * SIZE(AO)
+ LD b4, 15 * SIZE(AO)
+
+ SUB a1, c09, b5
+ fmov b5, c09
+ SUB a2, c10, b5
+ fmov b5, c10
+ SUB a3, c11, b5
+ fmov b5, c11
+ SUB a4, c12, b5
+ fmov b5, c12
+
+ SUB b1, c13, b5
+ fmov b5, c13
+ SUB b2, c14, b5
+ fmov b5, c14
+ SUB b3, c15, b5
+ fmov b5, c15
+ SUB b4, c16, b5
+ fmov b5, c16
+#endif
+
+#ifdef LN
+ LD a1, 15 * SIZE(AO)
+ LD a2, 14 * SIZE(AO)
+ LD a3, 13 * SIZE(AO)
+ LD a4, 12 * SIZE(AO)
+
+ MUL a1, c04, b5
+ fmov b5, c04
+ MUL a1, c08, b5
+ fmov b5, c08
+ MUL a1, c12, b5
+ fmov b5, c12
+ MUL a1, c16, b5
+ fmov b5, c16
+
+ MUL a2, c04, b5
+ fmov b5, t1
+ MUL a2, c08, b5
+ fmov b5, t2
+ MUL a2, c12, b5
+ fmov b5, t3
+ MUL a2, c16, b5
+ fmov b5, t4
+
+ SUB c03, t1, b5
+ fmov b5, c03
+ SUB c07, t2, b5
+ fmov b5, c07
+ SUB c11, t3, b5
+ fmov b5, c11
+ SUB c15, t4, b5
+ fmov b5, c15
+
+ MUL a3, c04, b5
+ fmov b5, t1
+ MUL a3, c08, b5
+ fmov b5, t2
+ MUL a3, c12, b5
+ fmov b5, t3
+ MUL a3, c16, b5
+ fmov b5, t4
+
+ SUB c02, t1, b5
+ fmov b5, c02
+ SUB c06, t2, b5
+ fmov b5, c06
+ SUB c10, t3, b5
+ fmov b5, c10
+ SUB c14, t4, b5
+ fmov b5, c14
+
+ MUL a4, c04, b5
+ fmov b5, t1
+ MUL a4, c08, b5
+ fmov b5, t2
+ MUL a4, c12, b5
+ fmov b5, t3
+ MUL a4, c16, b5
+ fmov b5, t4
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c05, t2, b5
+ fmov b5, c05
+ SUB c09, t3, b5
+ fmov b5, c09
+ SUB c13, t4, b5
+ fmov b5, c13
+
+ LD b1, 10 * SIZE(AO)
+ LD b2, 9 * SIZE(AO)
+ LD b3, 8 * SIZE(AO)
+
+ MUL b1, c03, b5
+ fmov b5, c03
+ MUL b1, c07, b5
+ fmov b5, c07
+ MUL b1, c11, b5
+ fmov b5, c11
+ MUL b1, c15, b5
+ fmov b5, c15
+
+ MUL b2, c03, b5
+ fmov b5, t1
+ MUL b2, c07, b5
+ fmov b5, t2
+ MUL b2, c11, b5
+ fmov b5, t3
+ MUL b2, c15, b5
+ fmov b5, t4
+
+ SUB c02, t1, b5
+ fmov b5, c02
+ SUB c06, t2, b5
+ fmov b5, c06
+ SUB c10, t3, b5
+ fmov b5, c10
+ SUB c14, t4, b5
+ fmov b5, c14
+
+ MUL b3, c03, b5
+ fmov b5, t1
+ MUL b3, c07, b5
+ fmov b5, t2
+ MUL b3, c11, b5
+ fmov b5, t3
+ MUL b3, c15, b5
+ fmov b5, t4
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c05, t2, b5
+ fmov b5, c05
+ SUB c09, t3, b5
+ fmov b5, c09
+ SUB c13, t4, b5
+ fmov b5, c13
+
+ LD a1, 5 * SIZE(AO)
+ LD a2, 4 * SIZE(AO)
+ LD a3, 0 * SIZE(AO)
+
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a1, c06, b5
+ fmov b5, c06
+ MUL a1, c10, b5
+ fmov b5, c10
+ MUL a1, c14, b5
+ fmov b5, c14
+
+ MUL a2, c02, b5
+ fmov b5, t1
+ MUL a2, c06, b5
+ fmov b5, t2
+ MUL a2, c10, b5
+ fmov b5, t3
+ MUL a2, c14, b5
+ fmov b5, t4
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c05, t2, b5
+ fmov b5, c05
+ SUB c09, t3, b5
+ fmov b5, c09
+ SUB c13, t4, b5
+ fmov b5, c13
+
+ MUL a3, c01, b5
+ fmov b5, c01
+ MUL a3, c05, b5
+ fmov b5, c05
+ MUL a3, c09, b5
+ fmov b5, c09
+ MUL a3, c13, b5
+ fmov b5, c13
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c05, b5
+ fmov b5, c05
+ MUL a1, c09, b5
+ fmov b5, c09
+ MUL a1, c13, b5
+ fmov b5, c13
+
+ MUL a2, c01, b5
+ fmov b5, t1
+ MUL a2, c05, b5
+ fmov b5, t2
+ MUL a2, c09, b5
+ fmov b5, t3
+ MUL a2, c13, b5
+ fmov b5, t4
+
+ SUB c02, t1, b5
+ fmov b5, c02
+ SUB c06, t2, b5
+ fmov b5, c06
+ SUB c10, t3, b5
+ fmov b5, c10
+ SUB c14, t4, b5
+ fmov b5, c14
+
+ MUL a3, c01, b5
+ fmov b5, t1
+ MUL a3, c05, b5
+ fmov b5, t2
+ MUL a3, c09, b5
+ fmov b5, t3
+ MUL a3, c13, b5
+ fmov b5, t4
+
+ SUB c03, t1, b5
+ fmov b5, c03
+ SUB c07, t2, b5
+ fmov b5, c07
+ SUB c11, t3, b5
+ fmov b5, c11
+ SUB c15, t4, b5
+ fmov b5, c15
+
+ MUL a4, c01, b5
+ fmov b5, t1
+ MUL a4, c05, b5
+ fmov b5, t2
+ MUL a4, c09, b5
+ fmov b5, t3
+ MUL a4, c13, b5
+ fmov b5, t4
+
+ SUB c04, t1, b5
+ fmov b5, c04
+ SUB c08, t2, b5
+ fmov b5, c08
+ SUB c12, t3, b5
+ fmov b5, c12
+ SUB c16, t4, b5
+ fmov b5, c16
+
+ LD b1, 5 * SIZE(AO)
+ LD b2, 6 * SIZE(AO)
+ LD b3, 7 * SIZE(AO)
+
+ MUL b1, c02, b5
+ fmov b5, c02
+ MUL b1, c06, b5
+ fmov b5, c06
+ MUL b1, c10, b5
+ fmov b5, c10
+ MUL b1, c14, b5
+ fmov b5, c14
+
+ MUL b2, c02, b5
+ fmov b5, t1
+ MUL b2, c06, b5
+ fmov b5, t2
+ MUL b2, c10, b5
+ fmov b5, t3
+ MUL b2, c14, b5
+ fmov b5, t4
+
+ SUB c03, t1, b5
+ fmov b5, c03
+ SUB c07, t2, b5
+ fmov b5, c07
+ SUB c11, t3, b5
+ fmov b5, c11
+ SUB c15, t4, b5
+ fmov b5, c15
+
+ MUL b3, c02, b5
+ fmov b5, t1
+ MUL b3, c06, b5
+ fmov b5, t2
+ MUL b3, c10, b5
+ fmov b5, t3
+ MUL b3, c14, b5
+ fmov b5, t4
+
+ SUB c04, t1, b5
+ fmov b5, c04
+ SUB c08, t2, b5
+ fmov b5, c08
+ SUB c12, t3, b5
+ fmov b5, c12
+ SUB c16, t4, b5
+ fmov b5, c16
+
+ LD a1, 10 * SIZE(AO)
+ LD a2, 11 * SIZE(AO)
+ LD a3, 15 * SIZE(AO)
+
+ MUL a1, c03, b5
+ fmov b5, c03
+ MUL a1, c07, b5
+ fmov b5, c07
+ MUL a1, c11, b5
+ fmov b5, c11
+ MUL a1, c15, b5
+ fmov b5, c15
+
+ MUL a2, c03, b5
+ fmov b5, t1
+ MUL a2, c07, b5
+ fmov b5, t2
+ MUL a2, c11, b5
+ fmov b5, t3
+ MUL a2, c15, b5
+ fmov b5, t4
+
+ SUB c04, t1, b5
+ fmov b5, c04
+ SUB c08, t2, b5
+ fmov b5, c08
+ SUB c12, t3, b5
+ fmov b5, c12
+ SUB c16, t4, b5
+ fmov b5, c16
+
+ MUL a3, c04, b5
+ fmov b5, c04
+ MUL a3, c08, b5
+ fmov b5, c08
+ MUL a3, c12, b5
+ fmov b5, c12
+ MUL a3, c16, b5
+ fmov b5, c16
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a1, c03, b5
+ fmov b5, c03
+ MUL a1, c04, b5
+ fmov b5, c04
+
+ MUL a2, c01, b5
+ fmov b5, t1
+ MUL a2, c02, b5
+ fmov b5, t2
+ MUL a2, c03, b5
+ fmov b5, t3
+ MUL a2, c04, b5
+ fmov b5, t4
+
+ SUB c05, t1, b5
+ fmov b5, c05
+ SUB c06, t2, b5
+ fmov b5, c06
+ SUB c07, t3, b5
+ fmov b5, c07
+ SUB c08, t4, b5
+ fmov b5, c08
+
+ MUL a3, c01, b5
+ fmov b5, t1
+ MUL a3, c02, b5
+ fmov b5, t2
+ MUL a3, c03, b5
+ fmov b5, t3
+ MUL a3, c04, b5
+ fmov b5, t4
+
+ SUB c09, t1, b5
+ fmov b5, c09
+ SUB c10, t2, b5
+ fmov b5, c10
+ SUB c11, t3, b5
+ fmov b5, c11
+ SUB c12, t4, b5
+ fmov b5, c12
+
+ MUL a4, c01, b5
+ fmov b5, t1
+ MUL a4, c02, b5
+ fmov b5, t2
+ MUL a4, c03, b5
+ fmov b5, t3
+ MUL a4, c04, b5
+ fmov b5, t4
+
+ SUB c13, t1, b5
+ fmov b5, c13
+ SUB c14, t2, b5
+ fmov b5, c14
+ SUB c15, t3, b5
+ fmov b5, c15
+ SUB c16, t4, b5
+ fmov b5, c16
+
+ LD b1, 5 * SIZE(BO)
+ LD b2, 6 * SIZE(BO)
+ LD b3, 7 * SIZE(BO)
+
+ MUL b1, c05, b5
+ fmov b5, c05
+ MUL b1, c06, b5
+ fmov b5, c06
+ MUL b1, c07, b5
+ fmov b5, c07
+ MUL b1, c08, b5
+ fmov b5, c08
+
+ MUL b2, c05, b5
+ fmov b5, t1
+ MUL b2, c06, b5
+ fmov b5, t2
+ MUL b2, c07, b5
+ fmov b5, t3
+ MUL b2, c08, b5
+ fmov b5, t4
+
+ SUB c09, t1, b5
+ fmov b5, c09
+ SUB c10, t2, b5
+ fmov b5, c10
+ SUB c11, t3, b5
+ fmov b5, c11
+ SUB c12, t4, b5
+ fmov b5, c12
+
+ MUL b3, c05, b5
+ fmov b5, t1
+ MUL b3, c06, b5
+ fmov b5, t2
+ MUL b3, c07, b5
+ fmov b5, t3
+ MUL b3, c08, b5
+ fmov b5, t4
+
+ SUB c13, t1, b5
+ fmov b5, c13
+ SUB c14, t2, b5
+ fmov b5, c14
+ SUB c15, t3, b5
+ fmov b5, c15
+ SUB c16, t4, b5
+ fmov b5, c16
+
+ LD a1, 10 * SIZE(BO)
+ LD a2, 11 * SIZE(BO)
+ LD a3, 15 * SIZE(BO)
+
+ MUL a1, c09, b5
+ fmov b5, c09
+ MUL a1, c10, b5
+ fmov b5, c10
+ MUL a1, c11, b5
+ fmov b5, c11
+ MUL a1, c12, b5
+ fmov b5, c12
+
+ MUL a2, c09, b5
+ fmov b5, t1
+ MUL a2, c10, b5
+ fmov b5, t2
+ MUL a2, c11, b5
+ fmov b5, t3
+ MUL a2, c12, b5
+ fmov b5, t4
+
+ SUB c13, t1, b5
+ fmov b5, c13
+ SUB c14, t2, b5
+ fmov b5, c14
+ SUB c15, t3, b5
+ fmov b5, c15
+ SUB c16, t4, b5
+ fmov b5, c16
+
+ MUL a3, c13, b5
+ fmov b5, c13
+ MUL a3, c14, b5
+ fmov b5, c14
+ MUL a3, c15, b5
+ fmov b5, c15
+ MUL a3, c16, b5
+ fmov b5, c16
+#endif
+
+#ifdef RT
+ LD a1, 15 * SIZE(BO)
+ LD a2, 14 * SIZE(BO)
+ LD a3, 13 * SIZE(BO)
+ LD a4, 12 * SIZE(BO)
+
+ MUL a1, c13, b5
+ fmov b5, c13
+ MUL a1, c14, b5
+ fmov b5, c14
+ MUL a1, c15, b5
+ fmov b5, c15
+ MUL a1, c16, b5
+ fmov b5, c16
+
+ MUL a2, c13, b5
+ fmov b5, t1
+ MUL a2, c14, b5
+ fmov b5, t2
+ MUL a2, c15, b5
+ fmov b5, t3
+ MUL a2, c16, b5
+ fmov b5, t4
+
+ SUB c09, t1, b5
+ fmov b5, c09
+ SUB c10, t2, b5
+ fmov b5, c10
+ SUB c11, t3, b5
+ fmov b5, c11
+ SUB c12, t4, b5
+ fmov b5, c12
+
+ MUL a3, c13, b5
+ fmov b5, t1
+ MUL a3, c14, b5
+ fmov b5, t2
+ MUL a3, c15, b5
+ fmov b5, t3
+ MUL a3, c16, b5
+ fmov b5, t4
+
+ SUB c05, t1, b5
+ fmov b5, c05
+ SUB c06, t2, b5
+ fmov b5, c06
+ SUB c07, t3, b5
+ fmov b5, c07
+ SUB c08, t4, b5
+ fmov b5, c08
+
+ MUL a4, c13, b5
+ fmov b5, t1
+ MUL a4, c14, b5
+ fmov b5, t2
+ MUL a4, c15, b5
+ fmov b5, t3
+ MUL a4, c16, b5
+ fmov b5, t4
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c02, t2, b5
+ fmov b5, c02
+ SUB c03, t3, b5
+ fmov b5, c03
+ SUB c04, t4, b5
+ fmov b5, c04
+
+ LD b1, 10 * SIZE(BO)
+ LD b2, 9 * SIZE(BO)
+ LD b3, 8 * SIZE(BO)
+
+ MUL b1, c09, b5
+ fmov b5, c09
+ MUL b1, c10, b5
+ fmov b5, c10
+ MUL b1, c11, b5
+ fmov b5, c11
+ MUL b1, c12, b5
+ fmov b5, c12
+
+ MUL b2, c09, b5
+ fmov b5, t1
+ MUL b2, c10, b5
+ fmov b5, t2
+ MUL b2, c11, b5
+ fmov b5, t3
+ MUL b2, c12, b5
+ fmov b5, t4
+
+ SUB c05, t1, b5
+ fmov b5, c05
+ SUB c06, t2, b5
+ fmov b5, c06
+ SUB c07, t3, b5
+ fmov b5, c07
+ SUB c08, t4, b5
+ fmov b5, c08
+
+ MUL b3, c09, b5
+ fmov b5, t1
+ MUL b3, c10, b5
+ fmov b5, t2
+ MUL b3, c11, b5
+ fmov b5, t3
+ MUL b3, c12, b5
+ fmov b5, t4
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c02, t2, b5
+ fmov b5, c02
+ SUB c03, t3, b5
+ fmov b5, c03
+ SUB c04, t4, b5
+ fmov b5, c04
+
+ LD a1, 5 * SIZE(BO)
+ LD a2, 4 * SIZE(BO)
+ LD a3, 0 * SIZE(BO)
+
+ MUL a1, c05, b5
+ fmov b5, c05
+ MUL a1, c06, b5
+ fmov b5, c06
+ MUL a1, c07, b5
+ fmov b5, c07
+ MUL a1, c08, b5
+ fmov b5, c08
+
+ MUL a2, c05, b5
+ fmov b5, t1
+ MUL a2, c06, b5
+ fmov b5, t2
+ MUL a2, c07, b5
+ fmov b5, t3
+ MUL a2, c08, b5
+ fmov b5, t4
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c02, t2, b5
+ fmov b5, c02
+ SUB c03, t3, b5
+ fmov b5, c03
+ SUB c04, t4, b5
+ fmov b5, c04
+
+ MUL a3, c01, b5
+ fmov b5, c01
+ MUL a3, c02, b5
+ fmov b5, c02
+ MUL a3, c03, b5
+ fmov b5, c03
+ MUL a3, c04, b5
+ fmov b5, c04
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c05, 1 * SIZE(BO)
+ ST c09, 2 * SIZE(BO)
+ ST c13, 3 * SIZE(BO)
+
+ ST c02, 4 * SIZE(BO)
+ ST c06, 5 * SIZE(BO)
+ ST c10, 6 * SIZE(BO)
+ ST c14, 7 * SIZE(BO)
+
+ ST c03, 8 * SIZE(BO)
+ ST c07, 9 * SIZE(BO)
+ ST c11, 10 * SIZE(BO)
+ ST c15, 11 * SIZE(BO)
+
+ ST c04, 12 * SIZE(BO)
+ ST c08, 13 * SIZE(BO)
+ ST c12, 14 * SIZE(BO)
+ ST c16, 15 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c03, 2 * SIZE(AO)
+ ST c04, 3 * SIZE(AO)
+
+ ST c05, 4 * SIZE(AO)
+ ST c06, 5 * SIZE(AO)
+ ST c07, 6 * SIZE(AO)
+ ST c08, 7 * SIZE(AO)
+
+ ST c09, 8 * SIZE(AO)
+ ST c10, 9 * SIZE(AO)
+ ST c11, 10 * SIZE(AO)
+ ST c12, 11 * SIZE(AO)
+
+ ST c13, 12 * SIZE(AO)
+ ST c14, 13 * SIZE(AO)
+ ST c15, 14 * SIZE(AO)
+ ST c16, 15 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -4 * SIZE(C1)
+ ldi C2, -4 * SIZE(C2)
+ ldi C3, -4 * SIZE(C3)
+ ldi C4, -4 * SIZE(C4)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c03, 2 * SIZE(C1)
+ ST c04, 3 * SIZE(C1)
+
+ ST c05, 0 * SIZE(C2)
+ ST c06, 1 * SIZE(C2)
+ ST c07, 2 * SIZE(C2)
+ ST c08, 3 * SIZE(C2)
+
+ ST c09, 0 * SIZE(C3)
+ ST c10, 1 * SIZE(C3)
+ ST c11, 2 * SIZE(C3)
+ ST c12, 3 * SIZE(C3)
+
+ ST c13, 0 * SIZE(C4)
+ ST c14, 1 * SIZE(C4)
+ ST c15, 2 * SIZE(C4)
+ ST c16, 3 * SIZE(C4)
+
+#ifndef LN
+ ldi C1, 4 * SIZE(C1)
+ ldi C2, 4 * SIZE(C2)
+ ldi C3, 4 * SIZE(C3)
+ ldi C4, 4 * SIZE(C4)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 2 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 2, TMP1
+ addl AO, TMP1, AO
+ addl BO, TMP1, BO
+#endif
+
+#ifdef LT
+ addl KK, 4, KK
+#endif
+
+#ifdef LN
+ subl KK, 4, KK
+#endif
+
+ ldi I, -1(I)
+
+ bgt I, $L11
+ .align 4
+
+$L39:
+#ifdef LN
+ sll K, 2 + BASE_SHIFT, TMP1
+ addl B, TMP1, B
+#endif
+
+#if defined(LT) || defined(RN)
+ mov BO, B
+#endif
+
+#ifdef RN
+ addl KK, 4, KK
+#endif
+
+#ifdef RT
+ subl KK, 4, KK
+#endif
+ ldi J, -1(J)
+ bgt J, $L01
+ .align 4
+
+$L40:
+ and N, 2, J
+ ble J, $L80
+
+#ifdef RT
+ sll K, 1 + BASE_SHIFT, TMP1
+ subl B, TMP1, B
+
+ addl LDC, LDC, TMP1
+ subl C, TMP1, C
+#endif
+
+ mov C, C1
+ addl C, LDC, C2
+ fclr t1
+#ifndef RT
+ addl C2, LDC, C
+#endif
+ fclr t2
+
+#ifdef LN
+ addl M, OFFSET, KK
+#endif
+
+#ifdef LT
+ mov OFFSET, KK
+#endif
+
+#if defined(LN) || defined(RT)
+ mov A, AORIG
+#else
+ mov A, AO
+#endif
+
+ fclr t3
+ fclr t4
+
+ and M, 1, I
+ ble I, $L60
+
+#if defined(LT) || defined(RN)
+
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+
+ LD b1, 0 * SIZE(B)
+ fclr c02
+ LD b2, 1 * SIZE(B)
+ fclr c06
+
+ ldi L, -2(KK)
+
+ LD b3, 2 * SIZE(B)
+ ldi AO, 1 * SIZE(AO)
+ LD b4, 3 * SIZE(B)
+ ldi BO, 2 * SIZE(B)
+
+ ble KK, $L78
+
+ ble L, $L75
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 0, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+
+ LD b1, 0 * SIZE(BO)
+ fclr c02
+ LD b2, 1 * SIZE(BO)
+ fclr c06
+
+ ldi L, -2(TMP1)
+
+ LD b3, 2 * SIZE(BO)
+ ldi AO, 1 * SIZE(AO)
+ LD b4, 3 * SIZE(BO)
+ ldi BO, 2 * SIZE(BO)
+
+ ble TMP1, $L78
+
+ ble L, $L75
+#endif
+ .align 4
+
+$L72:
+ ADD c01, t1, b5
+ fmov b5, c01
+ ldi L, -2(L)
+ MUL a1, b1, b5
+ fmov b5, t1
+ LD b1, 2 * SIZE(BO)
+
+ ADD c05, t2, b5
+ fmov b5, c05
+ MUL a1, b2, b5
+ fmov b5, t2
+ LD a1, 1 * SIZE(AO)
+ LD b2, 3 * SIZE(BO)
+
+ ADD c02, t3, b5
+ fmov b5, c02
+ ldi AO, 2 * SIZE(AO)
+ MUL a2, b3, b5
+ fmov b5, t3
+ LD b3, 4 * SIZE(BO)
+
+ ADD c06, t4, b5
+ fmov b5, c06
+ MUL a2, b4, b5
+ fmov b5, t4
+ LD a2, 0 * SIZE(AO)
+ LD b4, 5 * SIZE(BO)
+
+ ldi BO, 4 * SIZE(BO)
+ unop
+ unop
+ bgt L, $L72
+ .align 4
+
+$L75:
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL a1, b1, b5
+ fmov b5, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L77
+#else
+ blbs TMP1, $L77
+#endif
+ .align 4
+
+ ADD c05, t2, b5
+ fmov b5, c05
+ MUL a1, b2, b5
+ fmov b5, t2
+ LD a1, 0 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ LD b2, 1 * SIZE(BO)
+ ldi AO, 1 * SIZE(AO)
+ MUL a1, b1, b5
+ fmov b5, t1
+ ldi BO, 2 * SIZE(BO)
+ .align 4
+
+$L77:
+ ADD c05, t2, b5
+ fmov b5, c05
+ MUL a1, b2, b5
+ fmov b5, t2
+ ADD c02, t3, b5
+ fmov b5, c02
+ ADD c06, t4, b5
+ fmov b5, c06
+
+ ADD c01, c02, b5
+ fmov b5, c01
+ ldi AO, 1 * SIZE(AO)
+ ADD c05, c06, b5
+ fmov b5, c05
+ ldi BO, 2 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ ADD c05, t2, b5
+ fmov b5, c05
+
+ .align 4
+
+$L78:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 1, TMP1
+#else
+ subl KK, 2, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -1 * SIZE(AO)
+ ldi BO, -2 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c05, b5
+ fmov b5, c05
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c05, b5
+ fmov b5, c05
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(AO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c05, b5
+ fmov b5, c05
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 3 * SIZE(BO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a2, c01, b5
+ fmov b5, t1
+ SUB c05, t1, b5
+ fmov b5, c05
+ MUL a3, c05, b5
+ fmov b5, c05
+#endif
+
+#ifdef RT
+ LD a1, 3 * SIZE(BO)
+ LD a2, 2 * SIZE(BO)
+ LD a3, 0 * SIZE(BO)
+
+ MUL a1, c05, b5
+ fmov b5, c05
+ MUL a2, c05, b5
+ fmov b5, t1
+ SUB c01, t1, b5
+ fmov b5, c01
+ MUL a3, c01, b5
+ fmov b5, c01
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c05, 1 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c05, 1 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -1 * SIZE(C1)
+ ldi C2, -1 * SIZE(C2)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c05, 0 * SIZE(C2)
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 0 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 1, KK
+#endif
+
+#ifdef LN
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L60:
+ and M, 2, I
+ ble I, $L70
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(B)
+ ldi L, -2(KK)
+ LD b2, 1 * SIZE(B)
+ ldi AO, 2 * SIZE(AO)
+
+ LD b3, 2 * SIZE(B)
+ LD b4, 3 * SIZE(B)
+ ldi BO, 2 * SIZE(B)
+
+ ble KK, $L68
+
+ ble L, $L65
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 1, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(BO)
+ ldi L, -2(TMP1)
+ LD b2, 1 * SIZE(BO)
+ ldi AO, 2 * SIZE(AO)
+
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+ ldi BO, 2 * SIZE(BO)
+
+ ble TMP1, $L68
+
+ ble L, $L65
+#endif
+ .align 4
+
+$L62:
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b1, b5
+ fmov b5, t1
+ unop
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ ldi AO, 4 * SIZE(AO)
+ MUL a2, b1, b5
+ fmov b5, t2
+ LD b1, 2 * SIZE(BO)
+
+ ADD c05, t3, b5
+ fmov b5, c05
+ ldi L, -2(L)
+ MUL a1, b2, b5
+ fmov b5, t3
+ LD a1, -2 * SIZE(AO)
+
+ ADD c06, t4, b5
+ fmov b5, c06
+ unop
+ MUL a2, b2, b5
+ fmov b5, t4
+ LD a2, -1 * SIZE(AO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ LD b2, 3 * SIZE(BO)
+ MUL a3, b3, b5
+ fmov b5, t1
+ ldi BO, 4 * SIZE(BO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a4, b3, b5
+ fmov b5, t2
+ LD b3, 0 * SIZE(BO)
+
+ ADD c05, t3, b5
+ fmov b5, c05
+ unop
+ MUL a3, b4, b5
+ fmov b5, t3
+ LD a3, 0 * SIZE(AO)
+
+ ADD c06, t4, b5
+ fmov b5, c06
+ MUL a4, b4, b5
+ fmov b5, t4
+ LD b4, 1 * SIZE(BO)
+ unop
+
+ LD a4, 1 * SIZE(AO)
+ unop
+ unop
+ bgt L, $L62
+ .align 4
+
+$L65:
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL a1, b1, b5
+ fmov b5, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L67
+#else
+ blbs TMP1, $L67
+#endif
+ .align 4
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b1, b5
+ fmov b5, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD c05, t3, b5
+ fmov b5, c05
+ ldi BO, 2 * SIZE(BO)
+ MUL a1, b2, b5
+ fmov b5, t3
+ LD a1, 0 * SIZE(AO)
+
+ ADD c06, t4, b5
+ fmov b5, c06
+ unop
+ MUL a2, b2, b5
+ fmov b5, t4
+ LD a2, 1 * SIZE(AO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ LD b2, -1 * SIZE(BO)
+ MUL a1, b1, b5
+ fmov b5, t1
+ ldi AO, 2 * SIZE(AO)
+ .align 4
+
+$L67:
+ ADD c02, t2, b5
+ fmov b5, c02
+ MUL a2, b1, b5
+ fmov b5, t2
+ ADD c05, t3, b5
+ fmov b5, c05
+ MUL a1, b2, b5
+ fmov b5, t3
+
+ ADD c06, t4, b5
+ fmov b5, c06
+ ldi AO, 2 * SIZE(AO)
+ MUL a2, b2, b5
+ fmov b5, t4
+ ldi BO, 2 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ ADD c02, t2, b5
+ fmov b5, c02
+ ADD c05, t3, b5
+ fmov b5, c05
+ ADD c06, t4, b5
+ fmov b5, c06
+ .align 4
+
+$L68:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 2, TMP1
+#else
+ subl KK, 2, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -2 * SIZE(AO)
+ ldi BO, -2 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c05, b5
+ fmov b5, c05
+ SUB a3, c02, b5
+ fmov b5, c02
+ SUB a4, c06, b5
+ fmov b5, c06
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+ SUB a3, c05, b5
+ fmov b5, c05
+ SUB a4, c06, b5
+ fmov b5, c06
+#endif
+
+#ifdef LN
+ LD a1, 3 * SIZE(AO)
+ LD a2, 2 * SIZE(AO)
+ LD a3, 0 * SIZE(AO)
+
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a1, c06, b5
+ fmov b5, c06
+
+ MUL a2, c02, b5
+ fmov b5, t1
+ MUL a2, c06, b5
+ fmov b5, t2
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c05, t2, b5
+ fmov b5, c05
+
+ MUL a3, c01, b5
+ fmov b5, c01
+ MUL a3, c05, b5
+ fmov b5, c05
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 3 * SIZE(AO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c05, b5
+ fmov b5, c05
+
+ MUL a2, c01, b5
+ fmov b5, t1
+ MUL a2, c05, b5
+ fmov b5, t2
+
+ SUB c02, t1, b5
+ fmov b5, c02
+ SUB c06, t2, b5
+ fmov b5, c06
+
+ MUL a3, c02, b5
+ fmov b5, c02
+ MUL a3, c06, b5
+ fmov b5, c06
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 3 * SIZE(BO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+
+ MUL a2, c01, b5
+ fmov b5, t1
+ MUL a2, c02, b5
+ fmov b5, t2
+
+ SUB c05, t1, b5
+ fmov b5, c05
+ SUB c06, t2, b5
+ fmov b5, c06
+
+ MUL a3, c05, b5
+ fmov b5, c05
+ MUL a3, c06, b5
+ fmov b5, c06
+#endif
+
+#ifdef RT
+ LD a1, 3 * SIZE(BO)
+ LD a2, 2 * SIZE(BO)
+ LD a3, 0 * SIZE(BO)
+
+ MUL a1, c05, b5
+ fmov b5, c05
+ MUL a1, c06, b5
+ fmov b5, c06
+
+ MUL a2, c05, b5
+ fmov b5, t1
+ MUL a2, c06, b5
+ fmov b5, t2
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c02, t2, b5
+ fmov b5, c02
+
+ MUL a3, c01, b5
+ fmov b5, c01
+ MUL a3, c02, b5
+ fmov b5, c02
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c05, 1 * SIZE(BO)
+ ST c02, 2 * SIZE(BO)
+ ST c06, 3 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c05, 2 * SIZE(AO)
+ ST c06, 3 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -2 * SIZE(C1)
+ ldi C2, -2 * SIZE(C2)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c05, 0 * SIZE(C2)
+ ST c06, 1 * SIZE(C2)
+
+#ifndef LN
+ ldi C1, 2 * SIZE(C1)
+ ldi C2, 2 * SIZE(C2)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 1 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 2, KK
+#endif
+
+#ifdef LN
+ subl KK, 2, KK
+#endif
+ .align 4
+
+$L70:
+ sra M, 2, I
+ ble I, $L79
+ .align 4
+
+$L51:
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c03
+ LD a2, 1 * SIZE(AO)
+ fclr c07
+ LD a3, 2 * SIZE(AO)
+ fclr c04
+ LD a4, 3 * SIZE(AO)
+ fclr c08
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c05
+ LD b3, 2 * SIZE(B)
+ fclr c02
+ LD b4, 3 * SIZE(B)
+ fclr c06
+
+ ldi L, -2(KK)
+
+ ldi BO, 2 * SIZE(B)
+ ldi AO, 4 * SIZE(AO)
+
+ ble KK, $L58
+
+ ble L, $L55
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 2, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 2, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c03
+ LD a2, 1 * SIZE(AO)
+ fclr c07
+ LD a3, 2 * SIZE(AO)
+ fclr c04
+ LD a4, 3 * SIZE(AO)
+ fclr c08
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c05
+ LD b3, 2 * SIZE(BO)
+ fclr c02
+ LD b4, 3 * SIZE(BO)
+ fclr c06
+
+ ldi L, -2(TMP1)
+ ldi BO, 2 * SIZE(BO)
+ ldi AO, 4 * SIZE(AO)
+
+ ble TMP1, $L58
+
+ ble L, $L55
+#endif
+ .align 4
+
+$L52:
+ ADD c05, t1, b5
+ fmov b5, c05
+ unop
+ MUL a1, b1, b5
+ fmov b5, t1
+ unop
+
+ ADD c06, t2, b5
+ fmov b5, c06
+ ldi L, -2(L)
+ MUL a2, b1, b5
+ fmov b5, t2
+ unop
+
+ ADD c07, t3, b5
+ fmov b5, c07
+ unop
+ MUL a3, b1, b5
+ fmov b5, t3
+ unop
+
+ ADD c08, t4, b5
+ fmov b5, c08
+ unop
+ MUL a4, b1, b5
+ fmov b5, t4
+ LD b1, 2 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b2, b5
+ fmov b5, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ ldi BO, 4 * SIZE(BO)
+ MUL a2, b2, b5
+ fmov b5, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ unop
+ MUL a3, b2, b5
+ fmov b5, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ unop
+ MUL a4, b2, b5
+ fmov b5, t4
+ LD a5, 3 * SIZE(AO)
+
+ ADD c05, t1, b5
+ fmov b5, c05
+ unop
+ MUL a1, b3, b5
+ fmov b5, t1
+ LD b2, -1 * SIZE(BO)
+
+ ADD c06, t2, b5
+ fmov b5, c06
+ unop
+ MUL a2, b3, b5
+ fmov b5, t2
+ unop
+
+ ADD c07, t3, b5
+ fmov b5, c07
+ unop
+ MUL a3, b3, b5
+ fmov b5, t3
+ ldi AO, 8 * SIZE(AO)
+
+ ADD c08, t4, b5
+ fmov b5, c08
+ unop
+ MUL a5, b3, b5
+ fmov b5, t4
+ LD b3, 0 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b4, b5
+ fmov b5, t1
+ LD a1, -4 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b4, b5
+ fmov b5, t2
+ LD a2, -3 * SIZE(AO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ LD a4, -1 * SIZE(AO)
+ MUL a3, b4, b5
+ fmov b5, t3
+ LD a3, -2 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a5, b4, b5
+ fmov b5, t4
+ LD b4, 1 * SIZE(BO)
+ bgt L, $L52
+ .align 4
+
+$L55:
+ ADD c05, t1, b5
+ fmov b5, c05
+ MUL a1, b1, b5
+ fmov b5, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L57
+#else
+ blbs TMP1, $L57
+#endif
+ .align 4
+
+ ADD c06, t2, b5
+ fmov b5, c06
+ MUL a2, b1, b5
+ fmov b5, t2
+ ADD c07, t3, b5
+ fmov b5, c07
+ MUL a3, b1, b5
+ fmov b5, t3
+
+ ADD c08, t4, b5
+ fmov b5, c08
+ unop
+ MUL a4, b1, b5
+ fmov b5, t4
+ LD b1, 0 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b2, b5
+ fmov b5, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b2, b5
+ fmov b5, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ unop
+ MUL a3, b2, b5
+ fmov b5, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a4, b2, b5
+ fmov b5, t4
+ LD a4, 3 * SIZE(AO)
+ ldi AO, 4 * SIZE(AO)
+
+ ADD c05, t1, b5
+ fmov b5, c05
+ LD b2, 1 * SIZE(BO)
+ MUL a1, b1, b5
+ fmov b5, t1
+ ldi BO, 2 * SIZE(BO)
+ .align 4
+
+$L57:
+ ADD c06, t2, b5
+ fmov b5, c06
+ MUL a2, b1, b5
+ fmov b5, t2
+ ADD c07, t3, b5
+ fmov b5, c07
+ MUL a3, b1, b5
+ fmov b5, t3
+
+ ADD c08, t4, b5
+ fmov b5, c08
+ MUL a4, b1, b5
+ fmov b5, t4
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL a1, b2, b5
+ fmov b5, t1
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ MUL a2, b2, b5
+ fmov b5, t2
+ ADD c03, t3, b5
+ fmov b5, c03
+ MUL a3, b2, b5
+ fmov b5, t3
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ ldi AO, 4 * SIZE(AO)
+ MUL a4, b2, b5
+ fmov b5, t4
+ ldi BO, 2 * SIZE(BO)
+
+ ADD c05, t1, b5
+ fmov b5, c05
+ ADD c06, t2, b5
+ fmov b5, c06
+ ADD c07, t3, b5
+ fmov b5, c07
+ ADD c08, t4, b5
+ fmov b5, c08
+ .align 4
+
+$L58:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 4, TMP1
+#else
+ subl KK, 2, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -4 * SIZE(AO)
+ ldi BO, -2 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ LD b1, 4 * SIZE(BO)
+ LD b2, 5 * SIZE(BO)
+ LD b3, 6 * SIZE(BO)
+ LD b4, 7 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c05, b5
+ fmov b5, c05
+ SUB a3, c02, b5
+ fmov b5, c02
+ SUB a4, c06, b5
+ fmov b5, c06
+
+ SUB b1, c03, b5
+ fmov b5, c03
+ SUB b2, c07, b5
+ fmov b5, c07
+ SUB b3, c04, b5
+ fmov b5, c04
+ SUB b4, c08, b5
+ fmov b5, c08
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 4 * SIZE(AO)
+ LD b2, 5 * SIZE(AO)
+ LD b3, 6 * SIZE(AO)
+ LD b4, 7 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+ SUB a3, c03, b5
+ fmov b5, c03
+ SUB a4, c04, b5
+ fmov b5, c04
+
+ SUB b1, c05, b5
+ fmov b5, c05
+ SUB b2, c06, b5
+ fmov b5, c06
+ SUB b3, c07, b5
+ fmov b5, c07
+ SUB b4, c08, b5
+ fmov b5, c08
+#endif
+
+#ifdef LN
+ LD a1, 15 * SIZE(AO)
+ LD a2, 14 * SIZE(AO)
+ LD a3, 13 * SIZE(AO)
+ LD a4, 12 * SIZE(AO)
+
+ MUL a1, c04, b5
+ fmov b5, c04
+ MUL a1, c08, b5
+ fmov b5, c08
+
+ MUL a2, c04, b5
+ fmov b5, t1
+ MUL a2, c08, b5
+ fmov b5, t2
+
+ SUB c03, t1, b5
+ fmov b5, c03
+ SUB c07, t2, b5
+ fmov b5, c07
+
+ MUL a3, c04, b5
+ fmov b5, t1
+ MUL a3, c08, b5
+ fmov b5, t2
+
+ SUB c02, t1, b5
+ fmov b5, c02
+ SUB c06, t2, b5
+ fmov b5, c06
+
+ MUL a4, c04, b5
+ fmov b5, t1
+ MUL a4, c08, b5
+ fmov b5, t2
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c05, t2, b5
+ fmov b5, c05
+
+ LD b1, 10 * SIZE(AO)
+ LD b2, 9 * SIZE(AO)
+ LD b3, 8 * SIZE(AO)
+
+ MUL b1, c03, b5
+ fmov b5, c03
+ MUL b1, c07, b5
+ fmov b5, c07
+
+ MUL b2, c03, b5
+ fmov b5, t1
+ MUL b2, c07, b5
+ fmov b5, t2
+
+ SUB c02, t1, b5
+ fmov b5, c02
+ SUB c06, t2, b5
+ fmov b5, c06
+
+ MUL b3, c03, b5
+ fmov b5, t1
+ MUL b3, c07, b5
+ fmov b5, t2
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c05, t2, b5
+ fmov b5, c05
+
+ LD a1, 5 * SIZE(AO)
+ LD a2, 4 * SIZE(AO)
+ LD a3, 0 * SIZE(AO)
+
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a1, c06, b5
+ fmov b5, c06
+
+ MUL a2, c02, b5
+ fmov b5, t1
+ MUL a2, c06, b5
+ fmov b5, t2
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c05, t2, b5
+ fmov b5, c05
+
+ MUL a3, c01, b5
+ fmov b5, c01
+ MUL a3, c05, b5
+ fmov b5, c05
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c05, b5
+ fmov b5, c05
+
+ MUL a2, c01, b5
+ fmov b5, t1
+ MUL a2, c05, b5
+ fmov b5, t2
+
+ SUB c02, t1, b5
+ fmov b5, c02
+ SUB c06, t2, b5
+ fmov b5, c06
+
+ MUL a3, c01, b5
+ fmov b5, t1
+ MUL a3, c05, b5
+ fmov b5, t2
+
+ SUB c03, t1, b5
+ fmov b5, c03
+ SUB c07, t2, b5
+ fmov b5, c07
+
+ MUL a4, c01, b5
+ fmov b5, t1
+ MUL a4, c05, b5
+ fmov b5, t2
+
+ SUB c04, t1, b5
+ fmov b5, c04
+ SUB c08, t2, b5
+ fmov b5, c08
+
+ LD b1, 5 * SIZE(AO)
+ LD b2, 6 * SIZE(AO)
+ LD b3, 7 * SIZE(AO)
+
+ MUL b1, c02, b5
+ fmov b5, c02
+ MUL b1, c06, b5
+ fmov b5, c06
+
+ MUL b2, c02, b5
+ fmov b5, t1
+ MUL b2, c06, b5
+ fmov b5, t2
+
+ SUB c03, t1, b5
+ fmov b5, c03
+ SUB c07, t2, b5
+ fmov b5, c07
+
+ MUL b3, c02, b5
+ fmov b5, t1
+ MUL b3, c06, b5
+ fmov b5, t2
+
+ SUB c04, t1, b5
+ fmov b5, c04
+ SUB c08, t2, b5
+ fmov b5, c08
+
+ LD a1, 10 * SIZE(AO)
+ LD a2, 11 * SIZE(AO)
+ LD a3, 15 * SIZE(AO)
+
+ MUL a1, c03, b5
+ fmov b5, c03
+ MUL a1, c07, b5
+ fmov b5, c07
+
+ MUL a2, c03, b5
+ fmov b5, t1
+ MUL a2, c07, b5
+ fmov b5, t2
+
+ SUB c04, t1, b5
+ fmov b5, c04
+ SUB c08, t2, b5
+ fmov b5, c08
+
+ MUL a3, c04, b5
+ fmov b5, c04
+ MUL a3, c08, b5
+ fmov b5, c08
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 3 * SIZE(BO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a1, c03, b5
+ fmov b5, c03
+ MUL a1, c04, b5
+ fmov b5, c04
+
+ MUL a2, c01, b5
+ fmov b5, t1
+ MUL a2, c02, b5
+ fmov b5, t2
+ MUL a2, c03, b5
+ fmov b5, t3
+ MUL a2, c04, b5
+ fmov b5, t4
+
+ SUB c05, t1, b5
+ fmov b5, c05
+ SUB c06, t2, b5
+ fmov b5, c06
+ SUB c07, t3, b5
+ fmov b5, c07
+ SUB c08, t4, b5
+ fmov b5, c08
+
+ MUL a3, c05, b5
+ fmov b5, c05
+ MUL a3, c06, b5
+ fmov b5, c06
+ MUL a3, c07, b5
+ fmov b5, c07
+ MUL a3, c08, b5
+ fmov b5, c08
+#endif
+
+#ifdef RT
+ LD a1, 3 * SIZE(BO)
+ LD a2, 2 * SIZE(BO)
+ LD a3, 0 * SIZE(BO)
+
+ MUL a1, c05, b5
+ fmov b5, c05
+ MUL a1, c06, b5
+ fmov b5, c06
+ MUL a1, c07, b5
+ fmov b5, c07
+ MUL a1, c08, b5
+ fmov b5, c08
+
+ MUL a2, c05, b5
+ fmov b5, t1
+ MUL a2, c06, b5
+ fmov b5, t2
+ MUL a2, c07, b5
+ fmov b5, t3
+ MUL a2, c08, b5
+ fmov b5, t4
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c02, t2, b5
+ fmov b5, c02
+ SUB c03, t3, b5
+ fmov b5, c03
+ SUB c04, t4, b5
+ fmov b5, c04
+
+ MUL a3, c01, b5
+ fmov b5, c01
+ MUL a3, c02, b5
+ fmov b5, c02
+ MUL a3, c03, b5
+ fmov b5, c03
+ MUL a3, c04, b5
+ fmov b5, c04
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c05, 1 * SIZE(BO)
+ ST c02, 2 * SIZE(BO)
+ ST c06, 3 * SIZE(BO)
+
+ ST c03, 4 * SIZE(BO)
+ ST c07, 5 * SIZE(BO)
+ ST c04, 6 * SIZE(BO)
+ ST c08, 7 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c03, 2 * SIZE(AO)
+ ST c04, 3 * SIZE(AO)
+
+ ST c05, 4 * SIZE(AO)
+ ST c06, 5 * SIZE(AO)
+ ST c07, 6 * SIZE(AO)
+ ST c08, 7 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -4 * SIZE(C1)
+ ldi C2, -4 * SIZE(C2)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c03, 2 * SIZE(C1)
+ ST c04, 3 * SIZE(C1)
+
+ ST c05, 0 * SIZE(C2)
+ ST c06, 1 * SIZE(C2)
+ ST c07, 2 * SIZE(C2)
+ ST c08, 3 * SIZE(C2)
+
+#ifndef LN
+ ldi C1, 4 * SIZE(C1)
+ ldi C2, 4 * SIZE(C2)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 2 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 4, KK
+#endif
+
+#ifdef LN
+ subl KK, 4, KK
+#endif
+
+ ldi I, -1(I)
+
+ bgt I, $L51
+ .align 4
+
+$L79:
+#ifdef LN
+ sll K, 1 + BASE_SHIFT, TMP1
+ addl B, TMP1, B
+#endif
+
+#if defined(LT) || defined(RN)
+ mov BO, B
+#endif
+
+#ifdef RN
+ addl KK, 2, KK
+#endif
+
+#ifdef RT
+ subl KK, 2, KK
+#endif
+ .align 4
+
+$L80:
+ and N, 1, J
+ ble J, $L999
+
+#ifdef RT
+ sll K, BASE_SHIFT, TMP1
+ subl B, TMP1, B
+
+ subl C, LDC, C
+#endif
+
+ mov C, C1
+#ifndef RT
+ addl C, LDC, C
+#endif
+
+#ifdef LN
+ addl M, OFFSET, KK
+#endif
+
+#ifdef LT
+ mov OFFSET, KK
+#endif
+
+#if defined(LN) || defined(RT)
+ mov A, AORIG
+#else
+ mov A, AO
+#endif
+
+ and M, 1, I
+ ble I, $L100
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c02
+ LD b3, 2 * SIZE(B)
+ fclr c03
+ LD b4, 3 * SIZE(B)
+ fclr c04
+
+ sra KK, 2, L
+ mov B, BO
+ unop
+ ble L, $L115
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 0, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c02
+ LD b3, 2 * SIZE(BO)
+ fclr c03
+ LD b4, 3 * SIZE(BO)
+ fclr c04
+
+ sra TMP1, 2, L
+ unop
+ ble L, $L115
+#endif
+ .align 4
+
+$L112:
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL a1, b1, b5
+ fmov b5, t1
+ LD a1, 4 * SIZE(AO)
+ LD b1, 4 * SIZE(BO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ MUL a2, b2, b5
+ fmov b5, t2
+ LD a2, 5 * SIZE(AO)
+ LD b2, 5 * SIZE(BO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ MUL a3, b3, b5
+ fmov b5, t3
+ LD a3, 6 * SIZE(AO)
+ LD b3, 6 * SIZE(BO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a4, b4, b5
+ fmov b5, t4
+ LD a4, 7 * SIZE(AO)
+ LD b4, 7 * SIZE(BO)
+
+ ldi L, -1(L)
+ ldi AO, 4 * SIZE(AO)
+ ldi BO, 4 * SIZE(BO)
+ bgt L, $L112
+ .align 4
+
+$L115:
+#if defined(LT) || defined(RN)
+ and KK, 3, L
+#else
+ and TMP1, 3, L
+#endif
+ ble L, $L118
+ .align 4
+
+$L116:
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL a1, b1, b5
+ fmov b5, t1
+ LD a1, 1 * SIZE(AO)
+ LD b1, 1 * SIZE(BO)
+
+ ldi L, -1(L)
+ ldi AO, 1 * SIZE(AO)
+ ldi BO, 1 * SIZE(BO)
+ bgt L, $L116
+ .align 4
+
+$L118:
+ ADD c01, t1, b5
+ fmov b5, c01
+ ADD c02, t2, b5
+ fmov b5, c02
+ ADD c03, t3, b5
+ fmov b5, c03
+ ADD c04, t4, b5
+ fmov b5, c04
+
+ ADD c01, c02, b5
+ fmov b5, c01
+ ADD c03, c04, b5
+ fmov b5, c03
+ ADD c01, c03, b5
+ fmov b5, c01
+
+#if defined(LN) || defined(RT)
+ subl KK, 1, TMP1
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AORIG, TMP2, AO
+ addl B, TMP2, BO
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+#else
+ LD a1, 0 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(AO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+#endif
+
+#if defined(RN) || defined(RT)
+ LD a1, 0 * SIZE(BO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -1 * SIZE(C1)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+
+#ifndef LN
+ ldi C1, 1 * SIZE(C1)
+#endif
+
+#ifdef RT
+ SXADDQ K, AORIG, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AO, TMP2, AO
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 1, KK
+#endif
+
+#ifdef LN
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L100:
+ and M, 2, I
+ ble I, $L110
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c02
+ LD b3, 2 * SIZE(B)
+ fclr c03
+ LD b4, 3 * SIZE(B)
+ fclr c04
+
+ sra KK, 2, L
+ mov B, BO
+ ble L, $L105
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 1, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c02
+ LD b3, 2 * SIZE(BO)
+ fclr c03
+ LD b4, 3 * SIZE(BO)
+ fclr c04
+
+ sra TMP1, 2, L
+ ble L, $L105
+#endif
+ .align 5
+
+$L102:
+ ADD c01, t1, b5
+ fmov b5, c01
+ ldi L, -1(L)
+ MUL a1, b1, b5
+ fmov b5, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ MUL a2, b1, b5
+ fmov b5, t2
+ LD a2, 5 * SIZE(AO)
+ LD b1, 4 * SIZE(BO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ ldi BO, 4 * SIZE(BO)
+ MUL a3, b2, b5
+ fmov b5, t3
+ LD a3, 6 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a4, b2, b5
+ fmov b5, t4
+ LD a5, 7 * SIZE(AO)
+ LD b2, 1 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL a1, b3, b5
+ fmov b5, t1
+ LD a1, 8 * SIZE(AO)
+ ldi AO, 8 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ MUL a2, b3, b5
+ fmov b5, t2
+ LD b3, 2 * SIZE(BO)
+ LD a2, 1 * SIZE(AO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ LD a4, 3 * SIZE(AO)
+ MUL a3, b4, b5
+ fmov b5, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a5, b4, b5
+ fmov b5, t4
+ LD b4, 3 * SIZE(BO)
+ bgt L, $L102
+ .align 4
+
+$L105:
+#if defined(LT) || defined(RN)
+ and KK, 3, L
+#else
+ and TMP1, 3, L
+#endif
+ ble L, $L108
+ .align 4
+
+$L106:
+ ADD c01, t1, b5
+ fmov b5, c01
+ ldi L, -1(L)
+ MUL a1, b1, b5
+ fmov b5, t1
+ LD a1, 2 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ MUL a2, b1, b5
+ fmov b5, t2
+ LD a2, 3 * SIZE(AO)
+ LD b1, 1 * SIZE(BO)
+
+ ldi AO, 2 * SIZE(AO)
+ unop
+ ldi BO, 1 * SIZE(BO)
+ bgt L, $L106
+ .align 4
+
+$L108:
+ ADD c01, t1, b5
+ fmov b5, c01
+ ADD c02, t2, b5
+ fmov b5, c02
+ ADD c03, t3, b5
+ fmov b5, c03
+ ADD c04, t4, b5
+ fmov b5, c04
+
+ ADD c01, c03, b5
+ fmov b5, c01
+ ADD c02, c04, b5
+ fmov b5, c02
+
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 2, TMP1
+#else
+ subl KK, 1, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl B, TMP2, BO
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+#endif
+
+#ifdef LN
+ LD a1, 3 * SIZE(AO)
+ LD a2, 2 * SIZE(AO)
+ LD a3, 0 * SIZE(AO)
+
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a2, c02, b5
+ fmov b5, t1
+ SUB c01, t1, b5
+ fmov b5, c01
+ MUL a3, c01, b5
+ fmov b5, c01
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 3 * SIZE(AO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a2, c01, b5
+ fmov b5, t1
+ SUB c02, t1, b5
+ fmov b5, c02
+ MUL a3, c02, b5
+ fmov b5, c02
+#endif
+
+#if defined(RN) || defined(RT)
+ LD a1, 0 * SIZE(BO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c02, 1 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -2 * SIZE(C1)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+
+#ifndef LN
+ ldi C1, 2 * SIZE(C1)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 1 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 2, KK
+#endif
+
+#ifdef LN
+ subl KK, 2, KK
+#endif
+ .align 4
+
+$L110:
+ sra M, 2, I
+ ble I, $L119
+ .align 4
+
+$L91:
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c02
+ LD b3, 2 * SIZE(B)
+ fclr c03
+ LD b4, 3 * SIZE(B)
+ fclr c04
+
+ sra KK, 2, L
+ mov B, BO
+ ble L, $L95
+
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 2, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 2, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c02
+ LD b3, 2 * SIZE(BO)
+ fclr c03
+ LD b4, 3 * SIZE(BO)
+ fclr c04
+
+ sra TMP1, 2, L
+ unop
+ ble L, $L95
+#endif
+ .align 5
+
+$L92:
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b1, b5
+ fmov b5, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ ldi L, -1(L)
+ MUL a2, b1, b5
+ fmov b5, t2
+ LD a2, 5 * SIZE(AO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ unop
+ MUL a3, b1, b5
+ fmov b5, t3
+ LD a3, 6 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a4, b1, b5
+ fmov b5, t4
+ LD a4, 7 * SIZE(AO)
+ LD b1, 4 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b2, b5
+ fmov b5, t1
+ LD a1, 8 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b2, b5
+ fmov b5, t2
+ LD a2, 9 * SIZE(AO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ unop
+ MUL a3, b2, b5
+ fmov b5, t3
+ LD a3, 10 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a4, b2, b5
+ fmov b5, t4
+ LD a4, 11 * SIZE(AO)
+ LD b2, 5 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b3, b5
+ fmov b5, t1
+ LD a1, 12 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b3, b5
+ fmov b5, t2
+ LD a2, 13 * SIZE(AO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ unop
+ MUL a3, b3, b5
+ fmov b5, t3
+ LD a3, 14 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a4, b3, b5
+ fmov b5, t4
+ LD a5, 15 * SIZE(AO)
+ LD b3, 6 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL a1, b4, b5
+ fmov b5, t1
+ LD a1, 16 * SIZE(AO)
+ ldi AO, 16 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ ldi BO, 4 * SIZE(BO)
+ MUL a2, b4, b5
+ fmov b5, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ LD a4, 3 * SIZE(AO)
+ MUL a3, b4, b5
+ fmov b5, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a5, b4, b5
+ fmov b5, t4
+ LD b4, 3 * SIZE(BO)
+ bgt L, $L92
+ .align 4
+
+$L95:
+#if defined(LT) || defined(RN)
+ and KK, 3, L
+#else
+ and TMP1, 3, L
+#endif
+ unop
+ ble L, $L98
+ .align 4
+
+$L96:
+ ADD c01, t1, b5
+ fmov b5, c01
+ ldi L, -1(L)
+ MUL a1, b1, b5
+ fmov b5, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ ldi BO, 1 * SIZE(BO)
+ MUL a2, b1, b5
+ fmov b5, t2
+ LD a2, 5 * SIZE(AO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ unop
+ MUL a3, b1, b5
+ fmov b5, t3
+ LD a3, 6 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a4, b1, b5
+ fmov b5, t4
+ LD a4, 7 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+
+ ldi AO, 4 * SIZE(AO)
+ bgt L, $L96
+ .align 4
+
+$L98:
+ ADD c01, t1, b5
+ fmov b5, c01
+ ADD c02, t2, b5
+ fmov b5, c02
+ ADD c03, t3, b5
+ fmov b5, c03
+ ADD c04, t4, b5
+ fmov b5, c04
+
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 4, TMP1
+#else
+ subl KK, 1, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl B, TMP2, BO
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+ SUB a3, c03, b5
+ fmov b5, c03
+ SUB a4, c04, b5
+ fmov b5, c04
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+ SUB a3, c03, b5
+ fmov b5, c03
+ SUB a4, c04, b5
+ fmov b5, c04
+#endif
+
+#ifdef LN
+ LD a1, 15 * SIZE(AO)
+ LD a2, 14 * SIZE(AO)
+ LD a3, 13 * SIZE(AO)
+ LD a4, 12 * SIZE(AO)
+
+ MUL a1, c04, b5
+ fmov b5, c04
+ MUL a2, c04, b5
+ fmov b5, t1
+ SUB c03, t1, b5
+ fmov b5, c03
+ MUL a3, c04, b5
+ fmov b5, t1
+ SUB c02, t1, b5
+ fmov b5, c02
+ MUL a4, c04, b5
+ fmov b5, t1
+ SUB c01, t1, b5
+ fmov b5, c01
+
+ LD b1, 10 * SIZE(AO)
+ LD b2, 9 * SIZE(AO)
+ LD b3, 8 * SIZE(AO)
+
+ MUL b1, c03, b5
+ fmov b5, c03
+ MUL b2, c03, b5
+ fmov b5, t1
+ SUB c02, t1, b5
+ fmov b5, c02
+ MUL b3, c03, b5
+ fmov b5, t1
+ SUB c01, t1, b5
+ fmov b5, c01
+
+ LD a1, 5 * SIZE(AO)
+ LD a2, 4 * SIZE(AO)
+ LD a3, 0 * SIZE(AO)
+
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a2, c02, b5
+ fmov b5, t1
+ SUB c01, t1, b5
+ fmov b5, c01
+ MUL a3, c01, b5
+ fmov b5, c01
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a2, c01, b5
+ fmov b5, t1
+ SUB c02, t1, b5
+ fmov b5, c02
+ MUL a3, c01, b5
+ fmov b5, t1
+ SUB c03, t1, b5
+ fmov b5, c03
+ MUL a4, c01, b5
+ fmov b5, t1
+ SUB c04, t1, b5
+ fmov b5, c04
+
+ LD b1, 5 * SIZE(AO)
+ LD b2, 6 * SIZE(AO)
+ LD b3, 7 * SIZE(AO)
+
+ MUL b1, c02, b5
+ fmov b5, c02
+ MUL b2, c02, b5
+ fmov b5, t1
+ SUB c03, t1, b5
+ fmov b5, c03
+ MUL b3, c02, b5
+ fmov b5, t1
+ SUB c04, t1, b5
+ fmov b5, c04
+
+ LD a1, 10 * SIZE(AO)
+ LD a2, 11 * SIZE(AO)
+ LD a3, 15 * SIZE(AO)
+
+ MUL a1, c03, b5
+ fmov b5, c03
+ MUL a2, c03, b5
+ fmov b5, t1
+ SUB c04, t1, b5
+ fmov b5, c04
+ MUL a3, c04, b5
+ fmov b5, c04
+#endif
+
+#if defined(RN) || defined(RT)
+ LD a1, 0 * SIZE(BO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a1, c03, b5
+ fmov b5, c03
+ MUL a1, c04, b5
+ fmov b5, c04
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c02, 1 * SIZE(BO)
+ ST c03, 2 * SIZE(BO)
+ ST c04, 3 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c03, 2 * SIZE(AO)
+ ST c04, 3 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -4 * SIZE(C1)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c03, 2 * SIZE(C1)
+ ST c04, 3 * SIZE(C1)
+
+#ifndef LN
+ ldi C1, 4 * SIZE(C1)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 2 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 4, KK
+#endif
+
+#ifdef LN
+ subl KK, 4, KK
+#endif
+
+ ldi I, -1(I)
+ bgt I, $L91
+ .align 4
+
+$L119:
+#ifdef LN
+ SXADDQ K, B, B
+#endif
+
+#if defined(LT) || defined(RN)
+ mov BO, B
+#endif
+
+#ifdef RN
+ addl KK, 1, KK
+#endif
+
+#ifdef RT
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L999:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+ ldl tmp, 64($sp)
+ clr $0
+ ldi $sp, STACKSIZE($sp)
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/trsm_kernel_4x4_LN.S.bak b/kernel/sw_64/trsm_kernel_4x4_LN.S.bak
new file mode 100644
index 0000000..8405570
--- /dev/null
+++ b/kernel/sw_64/trsm_kernel_4x4_LN.S.bak
@@ -0,0 +1,4073 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#if !defined(SW2B)
+#error "Architecture is not specified."
+#endif
+
+#ifdef SW2B
+#define PREFETCHSIZE 56
+#define UNOP nop
+#endif
+
+#ifdef EV6
+#define PREFETCHSIZE 56
+#define UNOP unop
+#endif
+
+#ifdef EV5
+#define PREFETCHSIZE 56
+#define UNOP
+#endif
+
+#ifdef EV4
+#define UNOP
+#endif
+
+#define STACKSIZE 80
+
+#define M $16
+#define N $17
+#define K $18
+#define A $20
+#define B $21
+#define C $22
+#define LDC $23
+
+#define C1 $19
+#define C2 $24
+#define C3 $25
+#define C4 $27
+
+#define AO $at
+#define BO $5
+#define I $6
+#define J $7
+#define L $8
+
+#define a1 $f16
+#define a2 $f17
+#define a3 $f18
+#define a4 $f19
+
+#define b1 $f20
+#define b2 $f21
+#define b3 $f22
+#define b4 $f23
+
+#define t1 $f24
+#define t2 $f25
+#define t3 $f26
+#define t4 $f27
+
+#define a5 $f28
+#define a6 $f30
+#define b5 $f29
+
+#define alpha $f30
+
+#define c01 $f0
+#define c02 $f1
+#define c03 $f2
+#define c04 $f3
+
+#define c05 $f4
+#define c06 $f5
+#define c07 $f6
+#define c08 $f7
+
+#define c09 $f8
+#define c10 $f9
+#define c11 $f10
+#define c12 $f11
+
+#define c13 $f12
+#define c14 $f13
+#define c15 $f14
+#define c16 $f15
+
+#define TMP1 $0
+#define TMP2 $1
+#define KK $2
+#define AORIG $3
+#define OFFSET $4
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, STACKSIZE, $26, 0
+
+ ldi $sp, -STACKSIZE($sp)
+
+ ldl C, 0 + STACKSIZE($sp)
+ ldl LDC, 8 + STACKSIZE($sp)
+ ldl OFFSET, 16 + STACKSIZE($sp)
+
+ SXADDQ LDC, 0, LDC
+
+ fstd $f2, 0($sp)
+ fstd $f3, 8($sp)
+ fstd $f4, 16($sp)
+ fstd $f5, 24($sp)
+ fstd $f6, 32($sp)
+ fstd $f7, 40($sp)
+ fstd $f8, 48($sp)
+ fstd $f9, 56($sp)
+
+ cmple M, 0, $0
+ cmple N, 0, $1
+ cmple K, 0, $2
+
+ or $0, $1, $0
+ or $0, $2, $0
+ bne $0, $L999
+
+#ifdef LN
+ mull M, K, TMP1
+ SXADDQ TMP1, A, A
+ SXADDQ M, C, C
+#endif
+
+#ifdef RN
+ negq OFFSET, KK
+#endif
+
+#ifdef RT
+ mull N, K, TMP1
+ SXADDQ TMP1, B, B
+
+ mull N, LDC, TMP1
+ addl TMP1, C, C
+
+ subl N, OFFSET, KK
+#endif
+
+ sra N, 2, J
+ ble J, $L40
+ .align 4
+
+$L01:
+#ifdef RT
+ sll K, 2 + BASE_SHIFT, TMP1
+ subl B, TMP1, B
+
+ s4addl LDC, 0, TMP1
+ subl C, TMP1, C
+#endif
+
+ mov C, C1
+ addl C, LDC, C2
+ addl C2, LDC, C3
+#ifndef RT
+ s4addl LDC, C, C
+#endif
+
+ fclr t1
+ addl C3, LDC, C4
+ fclr t2
+
+#ifdef LN
+ addl M, OFFSET, KK
+#endif
+
+#ifdef LT
+ mov OFFSET, KK
+#endif
+
+#if defined(LN) || defined(RT)
+ mov A, AORIG
+#else
+ mov A, AO
+#endif
+
+ fclr t3
+ fclr t4
+
+ and M, 1, I
+ ble I, $L20
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+
+ LD b1, 0 * SIZE(B)
+ ldi L, -2(KK)
+ LD b2, 1 * SIZE(B)
+ ldi AO, 1 * SIZE(AO)
+
+ LD b3, 2 * SIZE(B)
+ fclr c09
+ LD b4, 3 * SIZE(B)
+ fclr c13
+
+ ldi BO, 4 * SIZE(B)
+ ble KK, $L38
+
+ ble L, $L35
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 0, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 2, TMP2
+ addl B, TMP2, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+
+ LD b1, 0 * SIZE(BO)
+ ldi L, -2(TMP1)
+ LD b2, 1 * SIZE(BO)
+ ldi AO, 1 * SIZE(AO)
+
+ LD b3, 2 * SIZE(BO)
+ fclr c09
+ LD b4, 3 * SIZE(BO)
+ fclr c13
+
+ ldi BO, 4 * SIZE(BO)
+ ble TMP1, $L38
+
+ ble L, $L35
+#endif
+ .align 4
+
+$L32:
+ ADD c01, t1, c01
+ ldi L, -2(L)
+ MUL a1, b1, t1
+ LD b1, 0 * SIZE(BO)
+
+ ADD c05, t2, c05
+ ldi AO, 2 * SIZE(AO)
+ MUL a1, b2, t2
+ LD b2, 1 * SIZE(BO)
+
+ ADD c09, t3, c09
+ LD b5, 3 * SIZE(BO)
+ MUL a1, b3, t3
+ LD b3, 2 * SIZE(BO)
+
+ ADD c13, t4, c13
+ MUL a1, b4, t4
+ LD a1, -1 * SIZE(AO)
+
+ ADD c01, t1, c01
+ MUL a2, b1, t1
+ LD b1, 4 * SIZE(BO)
+ ldi BO, 8 * SIZE(BO)
+
+ ADD c05, t2, c05
+ MUL a2, b2, t2
+ LD b2, -3 * SIZE(BO)
+
+ ADD c09, t3, c09
+ LD b4, -1 * SIZE(BO)
+ MUL a2, b3, t3
+ LD b3, -2 * SIZE(BO)
+
+ ADD c13, t4, c13
+ MUL a2, b5, t4
+ LD a2, 0 * SIZE(AO)
+ bgt L, $L32
+ .align 4
+
+$L35:
+ ADD c01, t1, c01
+ MUL a1, b1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L37
+#else
+ blbs TMP1, $L37
+#endif
+ .align 4
+
+ ADD c05, t2, c05
+ LD b1, 0 * SIZE(BO)
+ MUL a1, b2, t2
+ LD b2, 1 * SIZE(BO)
+
+ ADD c09, t3, c09
+ MUL a1, b3, t3
+ LD b3, 2 * SIZE(BO)
+
+ ADD c13, t4, c13
+ MUL a1, b4, t4
+ LD a1, 0 * SIZE(AO)
+ ldi AO, 1 * SIZE(AO)
+
+ ADD c01, t1, c01
+ LD b4, 3 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L37:
+ ADD c05, t2, c05
+ MUL a1, b2, t2
+ ADD c09, t3, c09
+ MUL a1, b3, t3
+
+ ADD c13, t4, c13
+ ldi AO, 1 * SIZE(AO)
+ MUL a1, b4, t4
+ ldi BO, 4 * SIZE(BO)
+
+ ADD c01, t1, c01
+ ADD c05, t2, c05
+ ADD c09, t3, c09
+ ADD c13, t4, c13
+
+$L38:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 1, TMP1
+#else
+ subl KK, 4, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -1 * SIZE(AO)
+ ldi BO, -4 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ SUB a1, c01, c01
+ SUB a2, c05, c05
+ SUB a3, c09, c09
+ SUB a4, c13, c13
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ SUB a1, c01, c01
+ SUB a2, c05, c05
+ SUB a3, c09, c09
+ SUB a4, c13, c13
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(AO)
+
+ MUL a1, c01, c01
+ MUL a1, c05, c05
+ MUL a1, c09, c09
+ MUL a1, c13, c13
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ MUL a1, c01, c01
+ MUL a2, c01, t1
+ SUB c05, t1, c05
+ MUL a3, c01, t1
+ SUB c09, t1, c09
+ MUL a4, c01, t1
+ SUB c13, t1, c13
+
+ LD b1, 5 * SIZE(BO)
+ LD b2, 6 * SIZE(BO)
+ LD b3, 7 * SIZE(BO)
+
+ MUL b1, c05, c05
+ MUL b2, c05, t1
+ SUB c09, t1, c09
+ MUL b3, c05, t1
+ SUB c13, t1, c13
+
+ LD a1, 10 * SIZE(BO)
+ LD a2, 11 * SIZE(BO)
+ LD a3, 15 * SIZE(BO)
+
+ MUL a1, c09, c09
+ MUL a2, c09, t1
+ SUB c13, t1, c13
+ MUL a3, c13, c13
+#endif
+
+#ifdef RT
+ LD a1, 15 * SIZE(BO)
+ LD a2, 14 * SIZE(BO)
+ LD a3, 13 * SIZE(BO)
+ LD a4, 12 * SIZE(BO)
+
+ MUL a1, c13, c13
+ MUL a2, c13, t1
+ SUB c09, t1, c09
+ MUL a3, c13, t1
+ SUB c05, t1, c05
+ MUL a4, c13, t1
+ SUB c01, t1, c01
+
+ LD b1, 10 * SIZE(BO)
+ LD b2, 9 * SIZE(BO)
+ LD b3, 8 * SIZE(BO)
+
+ MUL b1, c09, c09
+ MUL b2, c09, t1
+ SUB c05, t1, c05
+ MUL b3, c09, t1
+ SUB c01, t1, c01
+
+ LD a1, 5 * SIZE(BO)
+ LD a2, 4 * SIZE(BO)
+ LD a3, 0 * SIZE(BO)
+
+ MUL a1, c05, c05
+ MUL a2, c05, t1
+ SUB c01, t1, c01
+ MUL a3, c01, c01
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c05, 1 * SIZE(BO)
+ ST c09, 2 * SIZE(BO)
+ ST c13, 3 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c05, 1 * SIZE(AO)
+ ST c09, 2 * SIZE(AO)
+ ST c13, 3 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -1 * SIZE(C1)
+ ldi C2, -1 * SIZE(C2)
+ ldi C3, -1 * SIZE(C3)
+ ldi C4, -1 * SIZE(C4)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c05, 0 * SIZE(C2)
+ ST c09, 0 * SIZE(C3)
+ ST c13, 0 * SIZE(C4)
+
+#ifdef RT
+ sll K, 0 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 1, KK
+#endif
+
+#ifdef LN
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L20:
+ and M, 2, I
+ ble I, $L30
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c10
+ LD a4, 3 * SIZE(AO)
+ fclr c14
+
+ LD b1, 0 * SIZE(B)
+ ldi L, -2(KK)
+ LD b2, 1 * SIZE(B)
+ ldi AO, 2 * SIZE(AO)
+
+ LD b3, 2 * SIZE(B)
+ fclr c01
+ LD b4, 3 * SIZE(B)
+ fclr c05
+
+ ldi BO, 4 * SIZE(B)
+ fclr c02
+ fclr c06
+ ble KK, $L28
+
+ ble L, $L25
+
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 1, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 2, TMP2
+ addl B, TMP2, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c10
+ LD a4, 3 * SIZE(AO)
+ fclr c14
+
+ LD b1, 0 * SIZE(BO)
+ ldi L, -2(TMP1)
+ LD b2, 1 * SIZE(BO)
+ ldi AO, 2 * SIZE(AO)
+
+ LD b3, 2 * SIZE(BO)
+ fclr c01
+ LD b4, 3 * SIZE(BO)
+ fclr c05
+
+ ldi BO, 4 * SIZE(BO)
+ fclr c02
+ fclr c06
+ ble TMP1, $L28
+
+ ble L, $L25
+#endif
+ .align 4
+
+$L22:
+ ADD c09, t1, c09
+ unop
+ MUL a1, b1, t1
+ unop
+
+ ADD c10, t2, c10
+ unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD c13, t3, c13
+ unop
+ MUL a1, b2, t3
+ ldi BO, 8 * SIZE(BO)
+
+ ADD c14, t4, c14
+ unop
+ MUL a2, b2, t4
+ LD b2, -7 * SIZE(BO)
+
+ ADD c01, t1, c01
+ unop
+ MUL a1, b3, t1
+ unop
+
+ ADD c02, t2, c02
+ unop
+ MUL a2, b3, t2
+ LD b3, -6 * SIZE(BO)
+
+ ADD c05, t3, c05
+ unop
+ MUL a1, b4, t3
+ LD a1, 2 * SIZE(AO)
+
+ ADD c06, t4, c06
+ MUL a2, b4, t4
+ LD b5, -5 * SIZE(BO)
+
+ ADD c09, t1, c09
+ unop
+ MUL a3, b1, t1
+ LD a2, 3 * SIZE(AO)
+
+ ADD c10, t2, c10
+ unop
+ MUL a4, b1, t2
+ LD b1, -4 * SIZE(BO)
+
+ ADD c13, t3, c13
+ unop
+ MUL a3, b2, t3
+ ldi AO, 4 * SIZE(AO)
+
+ ADD c14, t4, c14
+ MUL a4, b2, t4
+ LD b2, -3 * SIZE(BO)
+
+ ADD c01, t1, c01
+ ldi L, -2(L)
+ MUL a3, b3, t1
+ LD b4, -1 * SIZE(BO)
+
+ ADD c02, t2, c02
+ unop
+ MUL a4, b3, t2
+ LD b3, -2 * SIZE(BO)
+
+ ADD c05, t3, c05
+ unop
+ MUL a3, b5, t3
+ LD a3, 0 * SIZE(AO)
+
+ ADD c06, t4, c06
+ MUL a4, b5, t4
+ LD a4, 1 * SIZE(AO)
+ bgt L, $L22
+ .align 4
+
+$L25:
+ ADD c09, t1, c09
+ MUL a1, b1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L27
+#else
+ blbs TMP1, $L27
+#endif
+
+ ADD c10, t2, c10
+ unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD c13, t3, c13
+ unop
+ MUL a1, b2, t3
+ unop
+
+ ADD c14, t4, c14
+ unop
+ MUL a2, b2, t4
+ LD b2, 1 * SIZE(BO)
+
+ ADD c01, t1, c01
+ unop
+ MUL a1, b3, t1
+ ldi AO, 2 * SIZE(AO)
+
+ ADD c02, t2, c02
+ unop
+ MUL a2, b3, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD c05, t3, c05
+ unop
+ MUL a1, b4, t3
+ LD a1, -2 * SIZE(AO)
+
+ ADD c06, t4, c06
+ unop
+ MUL a2, b4, t4
+ LD a2, -1 * SIZE(AO)
+
+ ADD c09, t1, c09
+ LD b4, 3 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L27:
+ ADD c10, t2, c10
+ MUL a2, b1, t2
+ ADD c13, t3, c13
+ MUL a1, b2, t3
+
+ ADD c14, t4, c14
+ MUL a2, b2, t4
+ ADD c01, t1, c01
+ MUL a1, b3, t1
+
+ ADD c02, t2, c02
+ MUL a2, b3, t2
+ ADD c05, t3, c05
+ MUL a1, b4, t3
+
+ ADD c06, t4, c06
+ ldi AO, 2 * SIZE(AO)
+ MUL a2, b4, t4
+ ldi BO, 4 * SIZE(BO)
+
+ ADD c09, t1, c09
+ ADD c10, t2, c10
+ ADD c13, t3, c13
+ ADD c14, t4, c14
+ .align 4
+
+$L28:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 2, TMP1
+#else
+ subl KK, 4, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -2 * SIZE(AO)
+ ldi BO, -4 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ LD b1, 4 * SIZE(BO)
+ LD b2, 5 * SIZE(BO)
+ LD b3, 6 * SIZE(BO)
+ LD b4, 7 * SIZE(BO)
+
+ SUB a1, c01, c01
+ SUB a2, c05, c05
+ SUB a3, c09, c09
+ SUB a4, c13, c13
+
+ SUB b1, c02, c02
+ SUB b2, c06, c06
+ SUB b3, c10, c10
+ SUB b4, c14, c14
+
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 4 * SIZE(AO)
+ LD b2, 5 * SIZE(AO)
+ LD b3, 6 * SIZE(AO)
+ LD b4, 7 * SIZE(AO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+ SUB a3, c05, c05
+ SUB a4, c06, c06
+
+ SUB b1, c09, c09
+ SUB b2, c10, c10
+ SUB b3, c13, c13
+ SUB b4, c14, c14
+#endif
+
+#ifdef LN
+ LD a1, 3 * SIZE(AO)
+ LD a2, 2 * SIZE(AO)
+ LD a3, 0 * SIZE(AO)
+
+ MUL a1, c02, c02
+ MUL a1, c06, c06
+ MUL a1, c10, c10
+ MUL a1, c14, c14
+
+ MUL a2, c02, t1
+ MUL a2, c06, t2
+ MUL a2, c10, t3
+ MUL a2, c14, t4
+
+ SUB c01, t1, c01
+ SUB c05, t2, c05
+ SUB c09, t3, c09
+ SUB c13, t4, c13
+
+ MUL a3, c01, c01
+ MUL a3, c05, c05
+ MUL a3, c09, c09
+ MUL a3, c13, c13
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 3 * SIZE(AO)
+
+ MUL a1, c01, c01
+ MUL a1, c05, c05
+ MUL a1, c09, c09
+ MUL a1, c13, c13
+
+ MUL a2, c01, t1
+ MUL a2, c05, t2
+ MUL a2, c09, t3
+ MUL a2, c13, t4
+
+ SUB c02, t1, c02
+ SUB c06, t2, c06
+ SUB c10, t3, c10
+ SUB c14, t4, c14
+
+ MUL a3, c02, c02
+ MUL a3, c06, c06
+ MUL a3, c10, c10
+ MUL a3, c14, c14
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+
+ MUL a2, c01, t1
+ MUL a2, c02, t2
+
+ SUB c05, t1, c05
+ SUB c06, t2, c06
+
+ MUL a3, c01, t1
+ MUL a3, c02, t2
+
+ SUB c09, t1, c09
+ SUB c10, t2, c10
+
+ MUL a4, c01, t1
+ MUL a4, c02, t2
+
+ SUB c13, t1, c13
+ SUB c14, t2, c14
+
+ LD b1, 5 * SIZE(BO)
+ LD b2, 6 * SIZE(BO)
+ LD b3, 7 * SIZE(BO)
+
+ MUL b1, c05, c05
+ MUL b1, c06, c06
+
+ MUL b2, c05, t1
+ MUL b2, c06, t2
+
+ SUB c09, t1, c09
+ SUB c10, t2, c10
+
+ MUL b3, c05, t1
+ MUL b3, c06, t2
+
+ SUB c13, t1, c13
+ SUB c14, t2, c14
+
+ LD a1, 10 * SIZE(BO)
+ LD a2, 11 * SIZE(BO)
+ LD a3, 15 * SIZE(BO)
+
+ MUL a1, c09, c09
+ MUL a1, c10, c10
+
+ MUL a2, c09, t1
+ MUL a2, c10, t2
+
+ SUB c13, t1, c13
+ SUB c14, t2, c14
+
+ MUL a3, c13, c13
+ MUL a3, c14, c14
+#endif
+
+#ifdef RT
+ LD a1, 15 * SIZE(BO)
+ LD a2, 14 * SIZE(BO)
+ LD a3, 13 * SIZE(BO)
+ LD a4, 12 * SIZE(BO)
+
+ MUL a1, c13, c13
+ MUL a1, c14, c14
+
+ MUL a2, c13, t1
+ MUL a2, c14, t2
+
+ SUB c09, t1, c09
+ SUB c10, t2, c10
+
+ MUL a3, c13, t1
+ MUL a3, c14, t2
+
+ SUB c05, t1, c05
+ SUB c06, t2, c06
+
+ MUL a4, c13, t1
+ MUL a4, c14, t2
+
+ SUB c01, t1, c01
+ SUB c02, t2, c02
+
+ LD b1, 10 * SIZE(BO)
+ LD b2, 9 * SIZE(BO)
+ LD b3, 8 * SIZE(BO)
+
+ MUL b1, c09, c09
+ MUL b1, c10, c10
+
+ MUL b2, c09, t1
+ MUL b2, c10, t2
+
+ SUB c05, t1, c05
+ SUB c06, t2, c06
+
+ MUL b3, c09, t1
+ MUL b3, c10, t2
+
+ SUB c01, t1, c01
+ SUB c02, t2, c02
+
+ LD a1, 5 * SIZE(BO)
+ LD a2, 4 * SIZE(BO)
+ LD a3, 0 * SIZE(BO)
+
+ MUL a1, c05, c05
+ MUL a1, c06, c06
+
+ MUL a2, c05, t1
+ MUL a2, c06, t2
+
+ SUB c01, t1, c01
+ SUB c02, t2, c02
+
+ MUL a3, c01, c01
+ MUL a3, c02, c02
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c05, 1 * SIZE(BO)
+ ST c09, 2 * SIZE(BO)
+ ST c13, 3 * SIZE(BO)
+
+ ST c02, 4 * SIZE(BO)
+ ST c06, 5 * SIZE(BO)
+ ST c10, 6 * SIZE(BO)
+ ST c14, 7 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c05, 2 * SIZE(AO)
+ ST c06, 3 * SIZE(AO)
+
+ ST c09, 4 * SIZE(AO)
+ ST c10, 5 * SIZE(AO)
+ ST c13, 6 * SIZE(AO)
+ ST c14, 7 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -2 * SIZE(C1)
+ ldi C2, -2 * SIZE(C2)
+ ldi C3, -2 * SIZE(C3)
+ ldi C4, -2 * SIZE(C4)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c05, 0 * SIZE(C2)
+ ST c06, 1 * SIZE(C2)
+
+ ST c09, 0 * SIZE(C3)
+ ST c10, 1 * SIZE(C3)
+ ST c13, 0 * SIZE(C4)
+ ST c14, 1 * SIZE(C4)
+
+#ifndef LN
+ ldi C1, 2 * SIZE(C1)
+ ldi C2, 2 * SIZE(C2)
+ ldi C3, 2 * SIZE(C3)
+ ldi C4, 2 * SIZE(C4)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 1 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 2, KK
+#endif
+
+#ifdef LN
+ subl KK, 2, KK
+#endif
+ .align 4
+
+$L30:
+ sra M, 2, I
+ ble I, $L39
+ .align 4
+
+$L11:
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c11
+ LD a2, 1 * SIZE(AO)
+ fclr c12
+
+ LD a3, 2 * SIZE(AO)
+ fclr c16
+ LD a4, 3 * SIZE(AO)
+ fclr c15
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c02
+
+ LD b3, 2 * SIZE(B)
+ fclr c06
+ LD b4, 3 * SIZE(B)
+ fclr c05
+
+ fillcs 4 * SIZE(C1)
+ fclr c03
+ ldi L, -2(KK)
+ fclr c04
+
+ fillcs 7 * SIZE(C2)
+ fclr c08
+ ldi BO, 4 * SIZE(B)
+ fclr c13
+
+ fillcs 4 * SIZE(C3)
+ fclr c09
+ ldi AO, 4 * SIZE(AO)
+ fclr c10
+
+ fillcs 7 * SIZE(C4)
+ fclr c14
+ fclr c07
+ ble KK, $L18
+#else
+
+#ifdef LN
+ sll K, BASE_SHIFT + 2, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 2, TMP1
+ addl AORIG, TMP1, AO
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c11
+ LD a2, 1 * SIZE(AO)
+ fclr c12
+
+ LD a3, 2 * SIZE(AO)
+ fclr c16
+ LD a4, 3 * SIZE(AO)
+ fclr c15
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c02
+
+ LD b3, 2 * SIZE(BO)
+ fclr c06
+ LD b4, 3 * SIZE(BO)
+ fclr c05
+
+ fillcs 4 * SIZE(C1)
+ fclr c03
+ ldi L, -2(TMP1)
+ fclr c04
+
+ fillcs 7 * SIZE(C2)
+ fclr c08
+ ldi BO, 4 * SIZE(BO)
+ fclr c13
+
+ fillcs 4 * SIZE(C3)
+ fclr c09
+ ldi AO, 4 * SIZE(AO)
+ fclr c10
+
+ fillcs 7 * SIZE(C4)
+ fclr c14
+ fclr c07
+ ble TMP1, $L18
+#endif
+
+ ble L, $L15
+ .align 5
+
+$L12:
+/* 1 */
+ ADD c11, t1, c11
+#ifndef EV4
+ fillcs PREFETCHSIZE * SIZE(AO)
+#else
+ unop
+#endif
+ MUL b1, a1, t1
+#ifndef EV4
+ fillcs PREFETCHSIZE * SIZE(BO)
+#else
+ unop
+#endif
+
+ ADD c12, t2, c12
+ unop
+ MUL b1, a2, t2
+ unop
+
+ ADD c16, t3, c16
+ unop
+ MUL b2, a2, t3
+ LD a5, 0 * SIZE(AO)
+
+ ADD c15, t4, c15
+ unop
+ MUL b2, a1, t4
+ LD b5, 0 * SIZE(BO)
+
+/* 2 */
+ ADD c01, t1, c01
+ UNOP
+ MUL b1, a3, t1
+ UNOP
+
+ ADD c02, t2, c02
+ UNOP
+ MUL b1, a4, t2
+ UNOP
+
+ ADD c06, t3, c06
+ unop
+ MUL b2, a4, t3
+ unop
+
+ ADD c05, t4, c05
+ unop
+ MUL b4, a1, t4
+ unop
+
+/* 3 */
+ ADD c03, t1, c03
+ unop
+ MUL b3, a1, t1
+ unop
+
+ ADD c04, t2, c04
+ unop
+ MUL b3, a2, t2
+ unop
+
+ ADD c08, t3, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, 1 * SIZE(AO)
+
+ ADD c13, t4, c13
+ unop
+ MUL b2, a3, t4
+ LD b2, 1 * SIZE(BO)
+
+/* 4 */
+ ADD c09, t1, c09
+ unop
+ MUL b3, a3, t1
+ LD a6, 2 * SIZE(AO)
+
+ ADD c10, t2, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD c14, t3, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, 3 * SIZE(AO)
+
+ ADD c07, t4, c07
+ unop
+ MUL b4, a3, t4
+ LD b4, 3 * SIZE(BO)
+
+/* 5 */
+ ADD c11, t1, c11
+ unop
+ MUL b5, a5, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD c12, t2, c12
+ ldi L, -2(L)
+ MUL b5, a2, t2
+ LD b1, 4 * SIZE(BO)
+
+ ADD c16, t3, c16
+ unop
+ MUL b2, a2, t3
+ unop
+
+ ADD c15, t4, c15
+ unop
+ MUL b2, a5, t4
+ unop
+
+/* 6 */
+ ADD c01, t1, c01
+ unop
+ MUL b5, a6, t1
+ unop
+
+ ADD c02, t2, c02
+ unop
+ MUL b5, a4, t2
+ unop
+
+ ADD c06, t3, c06
+ unop
+ MUL b2, a4, t3
+ unop
+
+ ADD c05, t4, c05
+ unop
+ MUL b4, a5, t4
+ unop
+
+/* 7 */
+ ADD c03, t1, c03
+ ldi AO, 8 * SIZE(AO)
+ MUL b3, a5, t1
+ unop
+
+ ADD c04, t2, c04
+ ldi BO, 8 * SIZE(BO)
+ MUL b3, a2, t2
+ unop
+
+ ADD c08, t3, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, -3 * SIZE(AO)
+
+ ADD c13, t4, c13
+ unop
+ MUL b2, a6, t4
+ LD b2, -3 * SIZE(BO)
+
+/* 8 */
+ ADD c09, t1, c09
+ unop
+ MUL b3, a6, t1
+ LD a3, -2 * SIZE(AO)
+
+ ADD c10, t2, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, -2 * SIZE(BO)
+
+ ADD c14, t3, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, -1 * SIZE(AO)
+
+ ADD c07, t4, c07
+ MUL b4, a6, t4
+ LD b4, -1 * SIZE(BO)
+ bgt L, $L12
+ .align 4
+
+$L15:
+ ADD c11, t1, c11
+ MUL b1, a1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L17
+#else
+ blbs TMP1, $L17
+#endif
+ .align 4
+
+ ADD c12, t2, c12
+ MUL b1, a2, t2
+ ADD c16, t3, c16
+ MUL b2, a2, t3
+
+ ADD c15, t4, c15
+ MUL b2, a1, t4
+ ADD c01, t1, c01
+ MUL b1, a3, t1
+
+ ADD c02, t2, c02
+ unop
+ MUL b1, a4, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD c06, t3, c06
+ MUL b2, a4, t3
+ ADD c05, t4, c05
+ MUL b4, a1, t4
+
+ ADD c03, t1, c03
+ unop
+ MUL b3, a1, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD c04, t2, c04
+ unop
+ MUL b3, a2, t2
+ unop
+
+ ADD c08, t3, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, 1 * SIZE(AO)
+
+ ADD c13, t4, c13
+ unop
+ MUL b2, a3, t4
+ LD b2, 1 * SIZE(BO)
+
+ ADD c09, t1, c09
+ unop
+ MUL b3, a3, t1
+ ldi AO, 4 * SIZE(AO)
+
+ ADD c10, t2, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD c14, t3, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, -1 * SIZE(AO)
+
+ ADD c07, t4, c07
+ unop
+ MUL b4, a3, t4
+ LD a3, -2 * SIZE(AO)
+
+ ADD c11, t1, c11
+ LD b4, 3 * SIZE(BO)
+ MUL b1, a1, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L17:
+ ADD c12, t2, c12
+ MUL b1, a2, t2
+ ADD c16, t3, c16
+ MUL b2, a2, t3
+
+ ADD c15, t4, c15
+ MUL b2, a1, t4
+ ADD c01, t1, c01
+ MUL b1, a3, t1
+
+ ADD c02, t2, c02
+ MUL b1, a4, t2
+ ADD c06, t3, c06
+ MUL b2, a4, t3
+
+ ADD c05, t4, c05
+ MUL b4, a1, t4
+ ADD c03, t1, c03
+ MUL b3, a1, t1
+
+ ADD c04, t2, c04
+ MUL b3, a2, t2
+ ADD c08, t3, c08
+ MUL b4, a2, t3
+
+ ADD c13, t4, c13
+ MUL b2, a3, t4
+ ADD c09, t1, c09
+ MUL b3, a3, t1
+
+ ADD c10, t2, c10
+ MUL b3, a4, t2
+ ADD c14, t3, c14
+ MUL b4, a4, t3
+
+ ADD c07, t4, c07
+ ldi AO, 4 * SIZE(AO)
+ MUL b4, a3, t4
+ ldi BO, 4 * SIZE(BO)
+
+ ADD c11, t1, c11
+ ADD c12, t2, c12
+ ADD c16, t3, c16
+ ADD c15, t4, c15
+ .align 4
+
+$L18:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 4, TMP1
+#else
+ subl KK, 4, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -4 * SIZE(AO)
+ ldi BO, -4 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ LD b1, 4 * SIZE(BO)
+ LD b2, 5 * SIZE(BO)
+ LD b3, 6 * SIZE(BO)
+ LD b4, 7 * SIZE(BO)
+
+ SUB a1, c01, c01
+ SUB a2, c05, c05
+ SUB a3, c09, c09
+ SUB a4, c13, c13
+
+ SUB b1, c02, c02
+ SUB b2, c06, c06
+ SUB b3, c10, c10
+ SUB b4, c14, c14
+
+ LD a1, 8 * SIZE(BO)
+ LD a2, 9 * SIZE(BO)
+ LD a3, 10 * SIZE(BO)
+ LD a4, 11 * SIZE(BO)
+
+ LD b1, 12 * SIZE(BO)
+ LD b2, 13 * SIZE(BO)
+ LD b3, 14 * SIZE(BO)
+ LD b4, 15 * SIZE(BO)
+
+ SUB a1, c03, c03
+ SUB a2, c07, c07
+ SUB a3, c11, c11
+ SUB a4, c15, c15
+
+ SUB b1, c04, c04
+ SUB b2, c08, c08
+ SUB b3, c12, c12
+ SUB b4, c16, c16
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 4 * SIZE(AO)
+ LD b2, 5 * SIZE(AO)
+ LD b3, 6 * SIZE(AO)
+ LD b4, 7 * SIZE(AO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+ SUB a3, c03, c03
+ SUB a4, c04, c04
+
+ SUB b1, c05, c05
+ SUB b2, c06, c06
+ SUB b3, c07, c07
+ SUB b4, c08, c08
+
+ LD a1, 8 * SIZE(AO)
+ LD a2, 9 * SIZE(AO)
+ LD a3, 10 * SIZE(AO)
+ LD a4, 11 * SIZE(AO)
+
+ LD b1, 12 * SIZE(AO)
+ LD b2, 13 * SIZE(AO)
+ LD b3, 14 * SIZE(AO)
+ LD b4, 15 * SIZE(AO)
+
+ SUB a1, c09, c09
+ SUB a2, c10, c10
+ SUB a3, c11, c11
+ SUB a4, c12, c12
+
+ SUB b1, c13, c13
+ SUB b2, c14, c14
+ SUB b3, c15, c15
+ SUB b4, c16, c16
+#endif
+
+#ifdef LN
+ LD a1, 15 * SIZE(AO)
+ LD a2, 14 * SIZE(AO)
+ LD a3, 13 * SIZE(AO)
+ LD a4, 12 * SIZE(AO)
+
+ MUL a1, c04, c04
+ MUL a1, c08, c08
+ MUL a1, c12, c12
+ MUL a1, c16, c16
+
+ MUL a2, c04, t1
+ MUL a2, c08, t2
+ MUL a2, c12, t3
+ MUL a2, c16, t4
+
+ SUB c03, t1, c03
+ SUB c07, t2, c07
+ SUB c11, t3, c11
+ SUB c15, t4, c15
+
+ MUL a3, c04, t1
+ MUL a3, c08, t2
+ MUL a3, c12, t3
+ MUL a3, c16, t4
+
+ SUB c02, t1, c02
+ SUB c06, t2, c06
+ SUB c10, t3, c10
+ SUB c14, t4, c14
+
+ MUL a4, c04, t1
+ MUL a4, c08, t2
+ MUL a4, c12, t3
+ MUL a4, c16, t4
+
+ SUB c01, t1, c01
+ SUB c05, t2, c05
+ SUB c09, t3, c09
+ SUB c13, t4, c13
+
+ LD b1, 10 * SIZE(AO)
+ LD b2, 9 * SIZE(AO)
+ LD b3, 8 * SIZE(AO)
+
+ MUL b1, c03, c03
+ MUL b1, c07, c07
+ MUL b1, c11, c11
+ MUL b1, c15, c15
+
+ MUL b2, c03, t1
+ MUL b2, c07, t2
+ MUL b2, c11, t3
+ MUL b2, c15, t4
+
+ SUB c02, t1, c02
+ SUB c06, t2, c06
+ SUB c10, t3, c10
+ SUB c14, t4, c14
+
+ MUL b3, c03, t1
+ MUL b3, c07, t2
+ MUL b3, c11, t3
+ MUL b3, c15, t4
+
+ SUB c01, t1, c01
+ SUB c05, t2, c05
+ SUB c09, t3, c09
+ SUB c13, t4, c13
+
+ LD a1, 5 * SIZE(AO)
+ LD a2, 4 * SIZE(AO)
+ LD a3, 0 * SIZE(AO)
+
+ MUL a1, c02, c02
+ MUL a1, c06, c06
+ MUL a1, c10, c10
+ MUL a1, c14, c14
+
+ MUL a2, c02, t1
+ MUL a2, c06, t2
+ MUL a2, c10, t3
+ MUL a2, c14, t4
+
+ SUB c01, t1, c01
+ SUB c05, t2, c05
+ SUB c09, t3, c09
+ SUB c13, t4, c13
+
+ MUL a3, c01, c01
+ MUL a3, c05, c05
+ MUL a3, c09, c09
+ MUL a3, c13, c13
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ MUL a1, c01, c01
+ MUL a1, c05, c05
+ MUL a1, c09, c09
+ MUL a1, c13, c13
+
+ MUL a2, c01, t1
+ MUL a2, c05, t2
+ MUL a2, c09, t3
+ MUL a2, c13, t4
+
+ SUB c02, t1, c02
+ SUB c06, t2, c06
+ SUB c10, t3, c10
+ SUB c14, t4, c14
+
+ MUL a3, c01, t1
+ MUL a3, c05, t2
+ MUL a3, c09, t3
+ MUL a3, c13, t4
+
+ SUB c03, t1, c03
+ SUB c07, t2, c07
+ SUB c11, t3, c11
+ SUB c15, t4, c15
+
+ MUL a4, c01, t1
+ MUL a4, c05, t2
+ MUL a4, c09, t3
+ MUL a4, c13, t4
+
+ SUB c04, t1, c04
+ SUB c08, t2, c08
+ SUB c12, t3, c12
+ SUB c16, t4, c16
+
+ LD b1, 5 * SIZE(AO)
+ LD b2, 6 * SIZE(AO)
+ LD b3, 7 * SIZE(AO)
+
+ MUL b1, c02, c02
+ MUL b1, c06, c06
+ MUL b1, c10, c10
+ MUL b1, c14, c14
+
+ MUL b2, c02, t1
+ MUL b2, c06, t2
+ MUL b2, c10, t3
+ MUL b2, c14, t4
+
+ SUB c03, t1, c03
+ SUB c07, t2, c07
+ SUB c11, t3, c11
+ SUB c15, t4, c15
+
+ MUL b3, c02, t1
+ MUL b3, c06, t2
+ MUL b3, c10, t3
+ MUL b3, c14, t4
+
+ SUB c04, t1, c04
+ SUB c08, t2, c08
+ SUB c12, t3, c12
+ SUB c16, t4, c16
+
+ LD a1, 10 * SIZE(AO)
+ LD a2, 11 * SIZE(AO)
+ LD a3, 15 * SIZE(AO)
+
+ MUL a1, c03, c03
+ MUL a1, c07, c07
+ MUL a1, c11, c11
+ MUL a1, c15, c15
+
+ MUL a2, c03, t1
+ MUL a2, c07, t2
+ MUL a2, c11, t3
+ MUL a2, c15, t4
+
+ SUB c04, t1, c04
+ SUB c08, t2, c08
+ SUB c12, t3, c12
+ SUB c16, t4, c16
+
+ MUL a3, c04, c04
+ MUL a3, c08, c08
+ MUL a3, c12, c12
+ MUL a3, c16, c16
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+ MUL a1, c03, c03
+ MUL a1, c04, c04
+
+ MUL a2, c01, t1
+ MUL a2, c02, t2
+ MUL a2, c03, t3
+ MUL a2, c04, t4
+
+ SUB c05, t1, c05
+ SUB c06, t2, c06
+ SUB c07, t3, c07
+ SUB c08, t4, c08
+
+ MUL a3, c01, t1
+ MUL a3, c02, t2
+ MUL a3, c03, t3
+ MUL a3, c04, t4
+
+ SUB c09, t1, c09
+ SUB c10, t2, c10
+ SUB c11, t3, c11
+ SUB c12, t4, c12
+
+ MUL a4, c01, t1
+ MUL a4, c02, t2
+ MUL a4, c03, t3
+ MUL a4, c04, t4
+
+ SUB c13, t1, c13
+ SUB c14, t2, c14
+ SUB c15, t3, c15
+ SUB c16, t4, c16
+
+ LD b1, 5 * SIZE(BO)
+ LD b2, 6 * SIZE(BO)
+ LD b3, 7 * SIZE(BO)
+
+ MUL b1, c05, c05
+ MUL b1, c06, c06
+ MUL b1, c07, c07
+ MUL b1, c08, c08
+
+ MUL b2, c05, t1
+ MUL b2, c06, t2
+ MUL b2, c07, t3
+ MUL b2, c08, t4
+
+ SUB c09, t1, c09
+ SUB c10, t2, c10
+ SUB c11, t3, c11
+ SUB c12, t4, c12
+
+ MUL b3, c05, t1
+ MUL b3, c06, t2
+ MUL b3, c07, t3
+ MUL b3, c08, t4
+
+ SUB c13, t1, c13
+ SUB c14, t2, c14
+ SUB c15, t3, c15
+ SUB c16, t4, c16
+
+ LD a1, 10 * SIZE(BO)
+ LD a2, 11 * SIZE(BO)
+ LD a3, 15 * SIZE(BO)
+
+ MUL a1, c09, c09
+ MUL a1, c10, c10
+ MUL a1, c11, c11
+ MUL a1, c12, c12
+
+ MUL a2, c09, t1
+ MUL a2, c10, t2
+ MUL a2, c11, t3
+ MUL a2, c12, t4
+
+ SUB c13, t1, c13
+ SUB c14, t2, c14
+ SUB c15, t3, c15
+ SUB c16, t4, c16
+
+ MUL a3, c13, c13
+ MUL a3, c14, c14
+ MUL a3, c15, c15
+ MUL a3, c16, c16
+#endif
+
+#ifdef RT
+ LD a1, 15 * SIZE(BO)
+ LD a2, 14 * SIZE(BO)
+ LD a3, 13 * SIZE(BO)
+ LD a4, 12 * SIZE(BO)
+
+ MUL a1, c13, c13
+ MUL a1, c14, c14
+ MUL a1, c15, c15
+ MUL a1, c16, c16
+
+ MUL a2, c13, t1
+ MUL a2, c14, t2
+ MUL a2, c15, t3
+ MUL a2, c16, t4
+
+ SUB c09, t1, c09
+ SUB c10, t2, c10
+ SUB c11, t3, c11
+ SUB c12, t4, c12
+
+ MUL a3, c13, t1
+ MUL a3, c14, t2
+ MUL a3, c15, t3
+ MUL a3, c16, t4
+
+ SUB c05, t1, c05
+ SUB c06, t2, c06
+ SUB c07, t3, c07
+ SUB c08, t4, c08
+
+ MUL a4, c13, t1
+ MUL a4, c14, t2
+ MUL a4, c15, t3
+ MUL a4, c16, t4
+
+ SUB c01, t1, c01
+ SUB c02, t2, c02
+ SUB c03, t3, c03
+ SUB c04, t4, c04
+
+ LD b1, 10 * SIZE(BO)
+ LD b2, 9 * SIZE(BO)
+ LD b3, 8 * SIZE(BO)
+
+ MUL b1, c09, c09
+ MUL b1, c10, c10
+ MUL b1, c11, c11
+ MUL b1, c12, c12
+
+ MUL b2, c09, t1
+ MUL b2, c10, t2
+ MUL b2, c11, t3
+ MUL b2, c12, t4
+
+ SUB c05, t1, c05
+ SUB c06, t2, c06
+ SUB c07, t3, c07
+ SUB c08, t4, c08
+
+ MUL b3, c09, t1
+ MUL b3, c10, t2
+ MUL b3, c11, t3
+ MUL b3, c12, t4
+
+ SUB c01, t1, c01
+ SUB c02, t2, c02
+ SUB c03, t3, c03
+ SUB c04, t4, c04
+
+ LD a1, 5 * SIZE(BO)
+ LD a2, 4 * SIZE(BO)
+ LD a3, 0 * SIZE(BO)
+
+ MUL a1, c05, c05
+ MUL a1, c06, c06
+ MUL a1, c07, c07
+ MUL a1, c08, c08
+
+ MUL a2, c05, t1
+ MUL a2, c06, t2
+ MUL a2, c07, t3
+ MUL a2, c08, t4
+
+ SUB c01, t1, c01
+ SUB c02, t2, c02
+ SUB c03, t3, c03
+ SUB c04, t4, c04
+
+ MUL a3, c01, c01
+ MUL a3, c02, c02
+ MUL a3, c03, c03
+ MUL a3, c04, c04
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c05, 1 * SIZE(BO)
+ ST c09, 2 * SIZE(BO)
+ ST c13, 3 * SIZE(BO)
+
+ ST c02, 4 * SIZE(BO)
+ ST c06, 5 * SIZE(BO)
+ ST c10, 6 * SIZE(BO)
+ ST c14, 7 * SIZE(BO)
+
+ ST c03, 8 * SIZE(BO)
+ ST c07, 9 * SIZE(BO)
+ ST c11, 10 * SIZE(BO)
+ ST c15, 11 * SIZE(BO)
+
+ ST c04, 12 * SIZE(BO)
+ ST c08, 13 * SIZE(BO)
+ ST c12, 14 * SIZE(BO)
+ ST c16, 15 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c03, 2 * SIZE(AO)
+ ST c04, 3 * SIZE(AO)
+
+ ST c05, 4 * SIZE(AO)
+ ST c06, 5 * SIZE(AO)
+ ST c07, 6 * SIZE(AO)
+ ST c08, 7 * SIZE(AO)
+
+ ST c09, 8 * SIZE(AO)
+ ST c10, 9 * SIZE(AO)
+ ST c11, 10 * SIZE(AO)
+ ST c12, 11 * SIZE(AO)
+
+ ST c13, 12 * SIZE(AO)
+ ST c14, 13 * SIZE(AO)
+ ST c15, 14 * SIZE(AO)
+ ST c16, 15 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -4 * SIZE(C1)
+ ldi C2, -4 * SIZE(C2)
+ ldi C3, -4 * SIZE(C3)
+ ldi C4, -4 * SIZE(C4)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c03, 2 * SIZE(C1)
+ ST c04, 3 * SIZE(C1)
+
+ ST c05, 0 * SIZE(C2)
+ ST c06, 1 * SIZE(C2)
+ ST c07, 2 * SIZE(C2)
+ ST c08, 3 * SIZE(C2)
+
+ ST c09, 0 * SIZE(C3)
+ ST c10, 1 * SIZE(C3)
+ ST c11, 2 * SIZE(C3)
+ ST c12, 3 * SIZE(C3)
+
+ ST c13, 0 * SIZE(C4)
+ ST c14, 1 * SIZE(C4)
+ ST c15, 2 * SIZE(C4)
+ ST c16, 3 * SIZE(C4)
+
+#ifndef LN
+ ldi C1, 4 * SIZE(C1)
+ ldi C2, 4 * SIZE(C2)
+ ldi C3, 4 * SIZE(C3)
+ ldi C4, 4 * SIZE(C4)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 2 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 2, TMP1
+ addl AO, TMP1, AO
+ addl BO, TMP1, BO
+#endif
+
+#ifdef LT
+ addl KK, 4, KK
+#endif
+
+#ifdef LN
+ subl KK, 4, KK
+#endif
+
+ ldi I, -1(I)
+
+ bgt I, $L11
+ .align 4
+
+$L39:
+#ifdef LN
+ sll K, 2 + BASE_SHIFT, TMP1
+ addl B, TMP1, B
+#endif
+
+#if defined(LT) || defined(RN)
+ mov BO, B
+#endif
+
+#ifdef RN
+ addl KK, 4, KK
+#endif
+
+#ifdef RT
+ subl KK, 4, KK
+#endif
+ ldi J, -1(J)
+ bgt J, $L01
+ .align 4
+
+$L40:
+ and N, 2, J
+ ble J, $L80
+
+#ifdef RT
+ sll K, 1 + BASE_SHIFT, TMP1
+ subl B, TMP1, B
+
+ addl LDC, LDC, TMP1
+ subl C, TMP1, C
+#endif
+
+ mov C, C1
+ addl C, LDC, C2
+ fclr t1
+#ifndef RT
+ addl C2, LDC, C
+#endif
+ fclr t2
+
+#ifdef LN
+ addl M, OFFSET, KK
+#endif
+
+#ifdef LT
+ mov OFFSET, KK
+#endif
+
+#if defined(LN) || defined(RT)
+ mov A, AORIG
+#else
+ mov A, AO
+#endif
+
+ fclr t3
+ fclr t4
+
+ and M, 1, I
+ ble I, $L60
+
+#if defined(LT) || defined(RN)
+
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+
+ LD b1, 0 * SIZE(B)
+ fclr c02
+ LD b2, 1 * SIZE(B)
+ fclr c06
+
+ ldi L, -2(KK)
+
+ LD b3, 2 * SIZE(B)
+ ldi AO, 1 * SIZE(AO)
+ LD b4, 3 * SIZE(B)
+ ldi BO, 2 * SIZE(B)
+
+ ble KK, $L78
+
+ ble L, $L75
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 0, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+
+ LD b1, 0 * SIZE(BO)
+ fclr c02
+ LD b2, 1 * SIZE(BO)
+ fclr c06
+
+ ldi L, -2(TMP1)
+
+ LD b3, 2 * SIZE(BO)
+ ldi AO, 1 * SIZE(AO)
+ LD b4, 3 * SIZE(BO)
+ ldi BO, 2 * SIZE(BO)
+
+ ble TMP1, $L78
+
+ ble L, $L75
+#endif
+ .align 4
+
+$L72:
+ ADD c01, t1, c01
+ ldi L, -2(L)
+ MUL a1, b1, t1
+ LD b1, 2 * SIZE(BO)
+
+ ADD c05, t2, c05
+ MUL a1, b2, t2
+ LD a1, 1 * SIZE(AO)
+ LD b2, 3 * SIZE(BO)
+
+ ADD c02, t3, c02
+ ldi AO, 2 * SIZE(AO)
+ MUL a2, b3, t3
+ LD b3, 4 * SIZE(BO)
+
+ ADD c06, t4, c06
+ MUL a2, b4, t4
+ LD a2, 0 * SIZE(AO)
+ LD b4, 5 * SIZE(BO)
+
+ ldi BO, 4 * SIZE(BO)
+ unop
+ unop
+ bgt L, $L72
+ .align 4
+
+$L75:
+ ADD c01, t1, c01
+ MUL a1, b1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L77
+#else
+ blbs TMP1, $L77
+#endif
+ .align 4
+
+ ADD c05, t2, c05
+ MUL a1, b2, t2
+ LD a1, 0 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+
+ ADD c01, t1, c01
+ LD b2, 1 * SIZE(BO)
+ ldi AO, 1 * SIZE(AO)
+ MUL a1, b1, t1
+ ldi BO, 2 * SIZE(BO)
+ .align 4
+
+$L77:
+ ADD c05, t2, c05
+ MUL a1, b2, t2
+ ADD c02, t3, c02
+ ADD c06, t4, c06
+
+ ADD c01, c02, c01
+ ldi AO, 1 * SIZE(AO)
+ ADD c05, c06, c05
+ ldi BO, 2 * SIZE(BO)
+
+ ADD c01, t1, c01
+ ADD c05, t2, c05
+
+ .align 4
+
+$L78:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 1, TMP1
+#else
+ subl KK, 2, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -1 * SIZE(AO)
+ ldi BO, -2 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+
+ SUB a1, c01, c01
+ SUB a2, c05, c05
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ SUB a1, c01, c01
+ SUB a2, c05, c05
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(AO)
+
+ MUL a1, c01, c01
+ MUL a1, c05, c05
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 3 * SIZE(BO)
+
+ MUL a1, c01, c01
+ MUL a2, c01, t1
+ SUB c05, t1, c05
+ MUL a3, c05, c05
+#endif
+
+#ifdef RT
+ LD a1, 3 * SIZE(BO)
+ LD a2, 2 * SIZE(BO)
+ LD a3, 0 * SIZE(BO)
+
+ MUL a1, c05, c05
+ MUL a2, c05, t1
+ SUB c01, t1, c01
+ MUL a3, c01, c01
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c05, 1 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c05, 1 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -1 * SIZE(C1)
+ ldi C2, -1 * SIZE(C2)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c05, 0 * SIZE(C2)
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 0 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 1, KK
+#endif
+
+#ifdef LN
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L60:
+ and M, 2, I
+ ble I, $L70
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(B)
+ ldi L, -2(KK)
+ LD b2, 1 * SIZE(B)
+ ldi AO, 2 * SIZE(AO)
+
+ LD b3, 2 * SIZE(B)
+ LD b4, 3 * SIZE(B)
+ ldi BO, 2 * SIZE(B)
+
+ ble KK, $L68
+
+ ble L, $L65
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 1, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(BO)
+ ldi L, -2(TMP1)
+ LD b2, 1 * SIZE(BO)
+ ldi AO, 2 * SIZE(AO)
+
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+ ldi BO, 2 * SIZE(BO)
+
+ ble TMP1, $L68
+
+ ble L, $L65
+#endif
+ .align 4
+
+$L62:
+ ADD c01, t1, c01
+ unop
+ MUL a1, b1, t1
+ unop
+
+ ADD c02, t2, c02
+ ldi AO, 4 * SIZE(AO)
+ MUL a2, b1, t2
+ LD b1, 2 * SIZE(BO)
+
+ ADD c05, t3, c05
+ ldi L, -2(L)
+ MUL a1, b2, t3
+ LD a1, -2 * SIZE(AO)
+
+ ADD c06, t4, c06
+ unop
+ MUL a2, b2, t4
+ LD a2, -1 * SIZE(AO)
+
+ ADD c01, t1, c01
+ LD b2, 3 * SIZE(BO)
+ MUL a3, b3, t1
+ ldi BO, 4 * SIZE(BO)
+
+ ADD c02, t2, c02
+ unop
+ MUL a4, b3, t2
+ LD b3, 0 * SIZE(BO)
+
+ ADD c05, t3, c05
+ unop
+ MUL a3, b4, t3
+ LD a3, 0 * SIZE(AO)
+
+ ADD c06, t4, c06
+ MUL a4, b4, t4
+ LD b4, 1 * SIZE(BO)
+ unop
+
+ LD a4, 1 * SIZE(AO)
+ unop
+ unop
+ bgt L, $L62
+ .align 4
+
+$L65:
+ ADD c01, t1, c01
+ MUL a1, b1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L67
+#else
+ blbs TMP1, $L67
+#endif
+ .align 4
+
+ ADD c02, t2, c02
+ unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD c05, t3, c05
+ ldi BO, 2 * SIZE(BO)
+ MUL a1, b2, t3
+ LD a1, 0 * SIZE(AO)
+
+ ADD c06, t4, c06
+ unop
+ MUL a2, b2, t4
+ LD a2, 1 * SIZE(AO)
+
+ ADD c01, t1, c01
+ LD b2, -1 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi AO, 2 * SIZE(AO)
+ .align 4
+
+$L67:
+ ADD c02, t2, c02
+ MUL a2, b1, t2
+ ADD c05, t3, c05
+ MUL a1, b2, t3
+
+ ADD c06, t4, c06
+ ldi AO, 2 * SIZE(AO)
+ MUL a2, b2, t4
+ ldi BO, 2 * SIZE(BO)
+
+ ADD c01, t1, c01
+ ADD c02, t2, c02
+ ADD c05, t3, c05
+ ADD c06, t4, c06
+ .align 4
+
+$L68:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 2, TMP1
+#else
+ subl KK, 2, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -2 * SIZE(AO)
+ ldi BO, -2 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ SUB a1, c01, c01
+ SUB a2, c05, c05
+ SUB a3, c02, c02
+ SUB a4, c06, c06
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+ SUB a3, c05, c05
+ SUB a4, c06, c06
+#endif
+
+#ifdef LN
+ LD a1, 3 * SIZE(AO)
+ LD a2, 2 * SIZE(AO)
+ LD a3, 0 * SIZE(AO)
+
+ MUL a1, c02, c02
+ MUL a1, c06, c06
+
+ MUL a2, c02, t1
+ MUL a2, c06, t2
+
+ SUB c01, t1, c01
+ SUB c05, t2, c05
+
+ MUL a3, c01, c01
+ MUL a3, c05, c05
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 3 * SIZE(AO)
+
+ MUL a1, c01, c01
+ MUL a1, c05, c05
+
+ MUL a2, c01, t1
+ MUL a2, c05, t2
+
+ SUB c02, t1, c02
+ SUB c06, t2, c06
+
+ MUL a3, c02, c02
+ MUL a3, c06, c06
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 3 * SIZE(BO)
+
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+
+ MUL a2, c01, t1
+ MUL a2, c02, t2
+
+ SUB c05, t1, c05
+ SUB c06, t2, c06
+
+ MUL a3, c05, c05
+ MUL a3, c06, c06
+#endif
+
+#ifdef RT
+ LD a1, 3 * SIZE(BO)
+ LD a2, 2 * SIZE(BO)
+ LD a3, 0 * SIZE(BO)
+
+ MUL a1, c05, c05
+ MUL a1, c06, c06
+
+ MUL a2, c05, t1
+ MUL a2, c06, t2
+
+ SUB c01, t1, c01
+ SUB c02, t2, c02
+
+ MUL a3, c01, c01
+ MUL a3, c02, c02
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c05, 1 * SIZE(BO)
+ ST c02, 2 * SIZE(BO)
+ ST c06, 3 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c05, 2 * SIZE(AO)
+ ST c06, 3 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -2 * SIZE(C1)
+ ldi C2, -2 * SIZE(C2)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c05, 0 * SIZE(C2)
+ ST c06, 1 * SIZE(C2)
+
+#ifndef LN
+ ldi C1, 2 * SIZE(C1)
+ ldi C2, 2 * SIZE(C2)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 1 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 2, KK
+#endif
+
+#ifdef LN
+ subl KK, 2, KK
+#endif
+ .align 4
+
+$L70:
+ sra M, 2, I
+ ble I, $L79
+ .align 4
+
+$L51:
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c03
+ LD a2, 1 * SIZE(AO)
+ fclr c07
+ LD a3, 2 * SIZE(AO)
+ fclr c04
+ LD a4, 3 * SIZE(AO)
+ fclr c08
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c05
+ LD b3, 2 * SIZE(B)
+ fclr c02
+ LD b4, 3 * SIZE(B)
+ fclr c06
+
+ ldi L, -2(KK)
+
+ ldi BO, 2 * SIZE(B)
+ ldi AO, 4 * SIZE(AO)
+
+ ble KK, $L58
+
+ ble L, $L55
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 2, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 2, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c03
+ LD a2, 1 * SIZE(AO)
+ fclr c07
+ LD a3, 2 * SIZE(AO)
+ fclr c04
+ LD a4, 3 * SIZE(AO)
+ fclr c08
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c05
+ LD b3, 2 * SIZE(BO)
+ fclr c02
+ LD b4, 3 * SIZE(BO)
+ fclr c06
+
+ ldi L, -2(TMP1)
+ ldi BO, 2 * SIZE(BO)
+ ldi AO, 4 * SIZE(AO)
+
+ ble TMP1, $L58
+
+ ble L, $L55
+#endif
+ .align 4
+
+$L52:
+ ADD c05, t1, c05
+ unop
+ MUL a1, b1, t1
+ unop
+
+ ADD c06, t2, c06
+ ldi L, -2(L)
+ MUL a2, b1, t2
+ unop
+
+ ADD c07, t3, c07
+ unop
+ MUL a3, b1, t3
+ unop
+
+ ADD c08, t4, c08
+ unop
+ MUL a4, b1, t4
+ LD b1, 2 * SIZE(BO)
+
+ ADD c01, t1, c01
+ unop
+ MUL a1, b2, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD c02, t2, c02
+ ldi BO, 4 * SIZE(BO)
+ MUL a2, b2, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD c03, t3, c03
+ unop
+ MUL a3, b2, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD c04, t4, c04
+ unop
+ MUL a4, b2, t4
+ LD a5, 3 * SIZE(AO)
+
+ ADD c05, t1, c05
+ unop
+ MUL a1, b3, t1
+ LD b2, -1 * SIZE(BO)
+
+ ADD c06, t2, c06
+ unop
+ MUL a2, b3, t2
+ unop
+
+ ADD c07, t3, c07
+ unop
+ MUL a3, b3, t3
+ ldi AO, 8 * SIZE(AO)
+
+ ADD c08, t4, c08
+ unop
+ MUL a5, b3, t4
+ LD b3, 0 * SIZE(BO)
+
+ ADD c01, t1, c01
+ unop
+ MUL a1, b4, t1
+ LD a1, -4 * SIZE(AO)
+
+ ADD c02, t2, c02
+ unop
+ MUL a2, b4, t2
+ LD a2, -3 * SIZE(AO)
+
+ ADD c03, t3, c03
+ LD a4, -1 * SIZE(AO)
+ MUL a3, b4, t3
+ LD a3, -2 * SIZE(AO)
+
+ ADD c04, t4, c04
+ MUL a5, b4, t4
+ LD b4, 1 * SIZE(BO)
+ bgt L, $L52
+ .align 4
+
+$L55:
+ ADD c05, t1, c05
+ MUL a1, b1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L57
+#else
+ blbs TMP1, $L57
+#endif
+ .align 4
+
+ ADD c06, t2, c06
+ MUL a2, b1, t2
+ ADD c07, t3, c07
+ MUL a3, b1, t3
+
+ ADD c08, t4, c08
+ unop
+ MUL a4, b1, t4
+ LD b1, 0 * SIZE(BO)
+
+ ADD c01, t1, c01
+ unop
+ MUL a1, b2, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD c02, t2, c02
+ unop
+ MUL a2, b2, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD c03, t3, c03
+ unop
+ MUL a3, b2, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD c04, t4, c04
+ MUL a4, b2, t4
+ LD a4, 3 * SIZE(AO)
+ ldi AO, 4 * SIZE(AO)
+
+ ADD c05, t1, c05
+ LD b2, 1 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi BO, 2 * SIZE(BO)
+ .align 4
+
+$L57:
+ ADD c06, t2, c06
+ MUL a2, b1, t2
+ ADD c07, t3, c07
+ MUL a3, b1, t3
+
+ ADD c08, t4, c08
+ MUL a4, b1, t4
+ ADD c01, t1, c01
+ MUL a1, b2, t1
+
+ ADD c02, t2, c02
+ MUL a2, b2, t2
+ ADD c03, t3, c03
+ MUL a3, b2, t3
+
+ ADD c04, t4, c04
+ ldi AO, 4 * SIZE(AO)
+ MUL a4, b2, t4
+ ldi BO, 2 * SIZE(BO)
+
+ ADD c05, t1, c05
+ ADD c06, t2, c06
+ ADD c07, t3, c07
+ ADD c08, t4, c08
+ .align 4
+
+$L58:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 4, TMP1
+#else
+ subl KK, 2, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -4 * SIZE(AO)
+ ldi BO, -2 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ LD b1, 4 * SIZE(BO)
+ LD b2, 5 * SIZE(BO)
+ LD b3, 6 * SIZE(BO)
+ LD b4, 7 * SIZE(BO)
+
+ SUB a1, c01, c01
+ SUB a2, c05, c05
+ SUB a3, c02, c02
+ SUB a4, c06, c06
+
+ SUB b1, c03, c03
+ SUB b2, c07, c07
+ SUB b3, c04, c04
+ SUB b4, c08, c08
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 4 * SIZE(AO)
+ LD b2, 5 * SIZE(AO)
+ LD b3, 6 * SIZE(AO)
+ LD b4, 7 * SIZE(AO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+ SUB a3, c03, c03
+ SUB a4, c04, c04
+
+ SUB b1, c05, c05
+ SUB b2, c06, c06
+ SUB b3, c07, c07
+ SUB b4, c08, c08
+#endif
+
+#ifdef LN
+ LD a1, 15 * SIZE(AO)
+ LD a2, 14 * SIZE(AO)
+ LD a3, 13 * SIZE(AO)
+ LD a4, 12 * SIZE(AO)
+
+ MUL a1, c04, c04
+ MUL a1, c08, c08
+
+ MUL a2, c04, t1
+ MUL a2, c08, t2
+
+ SUB c03, t1, c03
+ SUB c07, t2, c07
+
+ MUL a3, c04, t1
+ MUL a3, c08, t2
+
+ SUB c02, t1, c02
+ SUB c06, t2, c06
+
+ MUL a4, c04, t1
+ MUL a4, c08, t2
+
+ SUB c01, t1, c01
+ SUB c05, t2, c05
+
+ LD b1, 10 * SIZE(AO)
+ LD b2, 9 * SIZE(AO)
+ LD b3, 8 * SIZE(AO)
+
+ MUL b1, c03, c03
+ MUL b1, c07, c07
+
+ MUL b2, c03, t1
+ MUL b2, c07, t2
+
+ SUB c02, t1, c02
+ SUB c06, t2, c06
+
+ MUL b3, c03, t1
+ MUL b3, c07, t2
+
+ SUB c01, t1, c01
+ SUB c05, t2, c05
+
+ LD a1, 5 * SIZE(AO)
+ LD a2, 4 * SIZE(AO)
+ LD a3, 0 * SIZE(AO)
+
+ MUL a1, c02, c02
+ MUL a1, c06, c06
+
+ MUL a2, c02, t1
+ MUL a2, c06, t2
+
+ SUB c01, t1, c01
+ SUB c05, t2, c05
+
+ MUL a3, c01, c01
+ MUL a3, c05, c05
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ MUL a1, c01, c01
+ MUL a1, c05, c05
+
+ MUL a2, c01, t1
+ MUL a2, c05, t2
+
+ SUB c02, t1, c02
+ SUB c06, t2, c06
+
+ MUL a3, c01, t1
+ MUL a3, c05, t2
+
+ SUB c03, t1, c03
+ SUB c07, t2, c07
+
+ MUL a4, c01, t1
+ MUL a4, c05, t2
+
+ SUB c04, t1, c04
+ SUB c08, t2, c08
+
+ LD b1, 5 * SIZE(AO)
+ LD b2, 6 * SIZE(AO)
+ LD b3, 7 * SIZE(AO)
+
+ MUL b1, c02, c02
+ MUL b1, c06, c06
+
+ MUL b2, c02, t1
+ MUL b2, c06, t2
+
+ SUB c03, t1, c03
+ SUB c07, t2, c07
+
+ MUL b3, c02, t1
+ MUL b3, c06, t2
+
+ SUB c04, t1, c04
+ SUB c08, t2, c08
+
+ LD a1, 10 * SIZE(AO)
+ LD a2, 11 * SIZE(AO)
+ LD a3, 15 * SIZE(AO)
+
+ MUL a1, c03, c03
+ MUL a1, c07, c07
+
+ MUL a2, c03, t1
+ MUL a2, c07, t2
+
+ SUB c04, t1, c04
+ SUB c08, t2, c08
+
+ MUL a3, c04, c04
+ MUL a3, c08, c08
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 3 * SIZE(BO)
+
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+ MUL a1, c03, c03
+ MUL a1, c04, c04
+
+ MUL a2, c01, t1
+ MUL a2, c02, t2
+ MUL a2, c03, t3
+ MUL a2, c04, t4
+
+ SUB c05, t1, c05
+ SUB c06, t2, c06
+ SUB c07, t3, c07
+ SUB c08, t4, c08
+
+ MUL a3, c05, c05
+ MUL a3, c06, c06
+ MUL a3, c07, c07
+ MUL a3, c08, c08
+#endif
+
+#ifdef RT
+ LD a1, 3 * SIZE(BO)
+ LD a2, 2 * SIZE(BO)
+ LD a3, 0 * SIZE(BO)
+
+ MUL a1, c05, c05
+ MUL a1, c06, c06
+ MUL a1, c07, c07
+ MUL a1, c08, c08
+
+ MUL a2, c05, t1
+ MUL a2, c06, t2
+ MUL a2, c07, t3
+ MUL a2, c08, t4
+
+ SUB c01, t1, c01
+ SUB c02, t2, c02
+ SUB c03, t3, c03
+ SUB c04, t4, c04
+
+ MUL a3, c01, c01
+ MUL a3, c02, c02
+ MUL a3, c03, c03
+ MUL a3, c04, c04
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c05, 1 * SIZE(BO)
+ ST c02, 2 * SIZE(BO)
+ ST c06, 3 * SIZE(BO)
+
+ ST c03, 4 * SIZE(BO)
+ ST c07, 5 * SIZE(BO)
+ ST c04, 6 * SIZE(BO)
+ ST c08, 7 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c03, 2 * SIZE(AO)
+ ST c04, 3 * SIZE(AO)
+
+ ST c05, 4 * SIZE(AO)
+ ST c06, 5 * SIZE(AO)
+ ST c07, 6 * SIZE(AO)
+ ST c08, 7 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -4 * SIZE(C1)
+ ldi C2, -4 * SIZE(C2)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c03, 2 * SIZE(C1)
+ ST c04, 3 * SIZE(C1)
+
+ ST c05, 0 * SIZE(C2)
+ ST c06, 1 * SIZE(C2)
+ ST c07, 2 * SIZE(C2)
+ ST c08, 3 * SIZE(C2)
+
+#ifndef LN
+ ldi C1, 4 * SIZE(C1)
+ ldi C2, 4 * SIZE(C2)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 2 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 4, KK
+#endif
+
+#ifdef LN
+ subl KK, 4, KK
+#endif
+
+ ldi I, -1(I)
+
+ bgt I, $L51
+ .align 4
+
+$L79:
+#ifdef LN
+ sll K, 1 + BASE_SHIFT, TMP1
+ addl B, TMP1, B
+#endif
+
+#if defined(LT) || defined(RN)
+ mov BO, B
+#endif
+
+#ifdef RN
+ addl KK, 2, KK
+#endif
+
+#ifdef RT
+ subl KK, 2, KK
+#endif
+ .align 4
+
+$L80:
+ and N, 1, J
+ ble J, $L999
+
+#ifdef RT
+ sll K, BASE_SHIFT, TMP1
+ subl B, TMP1, B
+
+ subl C, LDC, C
+#endif
+
+ mov C, C1
+#ifndef RT
+ addl C, LDC, C
+#endif
+
+#ifdef LN
+ addl M, OFFSET, KK
+#endif
+
+#ifdef LT
+ mov OFFSET, KK
+#endif
+
+#if defined(LN) || defined(RT)
+ mov A, AORIG
+#else
+ mov A, AO
+#endif
+
+ and M, 1, I
+ ble I, $L100
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c02
+ LD b3, 2 * SIZE(B)
+ fclr c03
+ LD b4, 3 * SIZE(B)
+ fclr c04
+
+ sra KK, 2, L
+ mov B, BO
+ unop
+ ble L, $L115
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 0, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c02
+ LD b3, 2 * SIZE(BO)
+ fclr c03
+ LD b4, 3 * SIZE(BO)
+ fclr c04
+
+ sra TMP1, 2, L
+ unop
+ ble L, $L115
+#endif
+ .align 4
+
+$L112:
+ ADD c01, t1, c01
+ MUL a1, b1, t1
+ LD a1, 4 * SIZE(AO)
+ LD b1, 4 * SIZE(BO)
+
+ ADD c02, t2, c02
+ MUL a2, b2, t2
+ LD a2, 5 * SIZE(AO)
+ LD b2, 5 * SIZE(BO)
+
+ ADD c03, t3, c03
+ MUL a3, b3, t3
+ LD a3, 6 * SIZE(AO)
+ LD b3, 6 * SIZE(BO)
+
+ ADD c04, t4, c04
+ MUL a4, b4, t4
+ LD a4, 7 * SIZE(AO)
+ LD b4, 7 * SIZE(BO)
+
+ ldi L, -1(L)
+ ldi AO, 4 * SIZE(AO)
+ ldi BO, 4 * SIZE(BO)
+ bgt L, $L112
+ .align 4
+
+$L115:
+#if defined(LT) || defined(RN)
+ and KK, 3, L
+#else
+ and TMP1, 3, L
+#endif
+ ble L, $L118
+ .align 4
+
+$L116:
+ ADD c01, t1, c01
+ MUL a1, b1, t1
+ LD a1, 1 * SIZE(AO)
+ LD b1, 1 * SIZE(BO)
+
+ ldi L, -1(L)
+ ldi AO, 1 * SIZE(AO)
+ ldi BO, 1 * SIZE(BO)
+ bgt L, $L116
+ .align 4
+
+$L118:
+ ADD c01, t1, c01
+ ADD c02, t2, c02
+ ADD c03, t3, c03
+ ADD c04, t4, c04
+
+ ADD c01, c02, c01
+ ADD c03, c04, c03
+ ADD c01, c03, c01
+
+#if defined(LN) || defined(RT)
+ subl KK, 1, TMP1
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AORIG, TMP2, AO
+ addl B, TMP2, BO
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+
+ SUB a1, c01, c01
+#else
+ LD a1, 0 * SIZE(AO)
+
+ SUB a1, c01, c01
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(AO)
+
+ MUL a1, c01, c01
+#endif
+
+#if defined(RN) || defined(RT)
+ LD a1, 0 * SIZE(BO)
+
+ MUL a1, c01, c01
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -1 * SIZE(C1)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+
+#ifndef LN
+ ldi C1, 1 * SIZE(C1)
+#endif
+
+#ifdef RT
+ SXADDQ K, AORIG, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AO, TMP2, AO
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 1, KK
+#endif
+
+#ifdef LN
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L100:
+ and M, 2, I
+ ble I, $L110
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c02
+ LD b3, 2 * SIZE(B)
+ fclr c03
+ LD b4, 3 * SIZE(B)
+ fclr c04
+
+ sra KK, 2, L
+ mov B, BO
+ ble L, $L105
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 1, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c02
+ LD b3, 2 * SIZE(BO)
+ fclr c03
+ LD b4, 3 * SIZE(BO)
+ fclr c04
+
+ sra TMP1, 2, L
+ ble L, $L105
+#endif
+ .align 5
+
+$L102:
+ ADD c01, t1, c01
+ ldi L, -1(L)
+ MUL a1, b1, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD c02, t2, c02
+ MUL a2, b1, t2
+ LD a2, 5 * SIZE(AO)
+ LD b1, 4 * SIZE(BO)
+
+ ADD c03, t3, c03
+ ldi BO, 4 * SIZE(BO)
+ MUL a3, b2, t3
+ LD a3, 6 * SIZE(AO)
+
+ ADD c04, t4, c04
+ MUL a4, b2, t4
+ LD a5, 7 * SIZE(AO)
+ LD b2, 1 * SIZE(BO)
+
+ ADD c01, t1, c01
+ MUL a1, b3, t1
+ LD a1, 8 * SIZE(AO)
+ ldi AO, 8 * SIZE(AO)
+
+ ADD c02, t2, c02
+ MUL a2, b3, t2
+ LD b3, 2 * SIZE(BO)
+ LD a2, 1 * SIZE(AO)
+
+ ADD c03, t3, c03
+ LD a4, 3 * SIZE(AO)
+ MUL a3, b4, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD c04, t4, c04
+ MUL a5, b4, t4
+ LD b4, 3 * SIZE(BO)
+ bgt L, $L102
+ .align 4
+
+$L105:
+#if defined(LT) || defined(RN)
+ and KK, 3, L
+#else
+ and TMP1, 3, L
+#endif
+ ble L, $L108
+ .align 4
+
+$L106:
+ ADD c01, t1, c01
+ ldi L, -1(L)
+ MUL a1, b1, t1
+ LD a1, 2 * SIZE(AO)
+
+ ADD c02, t2, c02
+ MUL a2, b1, t2
+ LD a2, 3 * SIZE(AO)
+ LD b1, 1 * SIZE(BO)
+
+ ldi AO, 2 * SIZE(AO)
+ unop
+ ldi BO, 1 * SIZE(BO)
+ bgt L, $L106
+ .align 4
+
+$L108:
+ ADD c01, t1, c01
+ ADD c02, t2, c02
+ ADD c03, t3, c03
+ ADD c04, t4, c04
+
+ ADD c01, c03, c01
+ ADD c02, c04, c02
+
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 2, TMP1
+#else
+ subl KK, 1, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl B, TMP2, BO
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+#endif
+
+#ifdef LN
+ LD a1, 3 * SIZE(AO)
+ LD a2, 2 * SIZE(AO)
+ LD a3, 0 * SIZE(AO)
+
+ MUL a1, c02, c02
+ MUL a2, c02, t1
+ SUB c01, t1, c01
+ MUL a3, c01, c01
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 3 * SIZE(AO)
+
+ MUL a1, c01, c01
+ MUL a2, c01, t1
+ SUB c02, t1, c02
+ MUL a3, c02, c02
+#endif
+
+#if defined(RN) || defined(RT)
+ LD a1, 0 * SIZE(BO)
+
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c02, 1 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -2 * SIZE(C1)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+
+#ifndef LN
+ ldi C1, 2 * SIZE(C1)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 1 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 2, KK
+#endif
+
+#ifdef LN
+ subl KK, 2, KK
+#endif
+ .align 4
+
+$L110:
+ sra M, 2, I
+ ble I, $L119
+ .align 4
+
+$L91:
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c02
+ LD b3, 2 * SIZE(B)
+ fclr c03
+ LD b4, 3 * SIZE(B)
+ fclr c04
+
+ sra KK, 2, L
+ mov B, BO
+ ble L, $L95
+
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 2, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 2, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c02
+ LD b3, 2 * SIZE(BO)
+ fclr c03
+ LD b4, 3 * SIZE(BO)
+ fclr c04
+
+ sra TMP1, 2, L
+ unop
+ ble L, $L95
+#endif
+ .align 5
+
+$L92:
+ ADD c01, t1, c01
+ unop
+ MUL a1, b1, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD c02, t2, c02
+ ldi L, -1(L)
+ MUL a2, b1, t2
+ LD a2, 5 * SIZE(AO)
+
+ ADD c03, t3, c03
+ unop
+ MUL a3, b1, t3
+ LD a3, 6 * SIZE(AO)
+
+ ADD c04, t4, c04
+ MUL a4, b1, t4
+ LD a4, 7 * SIZE(AO)
+ LD b1, 4 * SIZE(BO)
+
+ ADD c01, t1, c01
+ unop
+ MUL a1, b2, t1
+ LD a1, 8 * SIZE(AO)
+
+ ADD c02, t2, c02
+ unop
+ MUL a2, b2, t2
+ LD a2, 9 * SIZE(AO)
+
+ ADD c03, t3, c03
+ unop
+ MUL a3, b2, t3
+ LD a3, 10 * SIZE(AO)
+
+ ADD c04, t4, c04
+ MUL a4, b2, t4
+ LD a4, 11 * SIZE(AO)
+ LD b2, 5 * SIZE(BO)
+
+ ADD c01, t1, c01
+ unop
+ MUL a1, b3, t1
+ LD a1, 12 * SIZE(AO)
+
+ ADD c02, t2, c02
+ unop
+ MUL a2, b3, t2
+ LD a2, 13 * SIZE(AO)
+
+ ADD c03, t3, c03
+ unop
+ MUL a3, b3, t3
+ LD a3, 14 * SIZE(AO)
+
+ ADD c04, t4, c04
+ MUL a4, b3, t4
+ LD a5, 15 * SIZE(AO)
+ LD b3, 6 * SIZE(BO)
+
+ ADD c01, t1, c01
+ MUL a1, b4, t1
+ LD a1, 16 * SIZE(AO)
+ ldi AO, 16 * SIZE(AO)
+
+ ADD c02, t2, c02
+ ldi BO, 4 * SIZE(BO)
+ MUL a2, b4, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD c03, t3, c03
+ LD a4, 3 * SIZE(AO)
+ MUL a3, b4, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD c04, t4, c04
+ MUL a5, b4, t4
+ LD b4, 3 * SIZE(BO)
+ bgt L, $L92
+ .align 4
+
+$L95:
+#if defined(LT) || defined(RN)
+ and KK, 3, L
+#else
+ and TMP1, 3, L
+#endif
+ unop
+ ble L, $L98
+ .align 4
+
+$L96:
+ ADD c01, t1, c01
+ ldi L, -1(L)
+ MUL a1, b1, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD c02, t2, c02
+ ldi BO, 1 * SIZE(BO)
+ MUL a2, b1, t2
+ LD a2, 5 * SIZE(AO)
+
+ ADD c03, t3, c03
+ unop
+ MUL a3, b1, t3
+ LD a3, 6 * SIZE(AO)
+
+ ADD c04, t4, c04
+ MUL a4, b1, t4
+ LD a4, 7 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+
+ ldi AO, 4 * SIZE(AO)
+ bgt L, $L96
+ .align 4
+
+$L98:
+ ADD c01, t1, c01
+ ADD c02, t2, c02
+ ADD c03, t3, c03
+ ADD c04, t4, c04
+
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 4, TMP1
+#else
+ subl KK, 1, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl B, TMP2, BO
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+ SUB a3, c03, c03
+ SUB a4, c04, c04
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+ SUB a3, c03, c03
+ SUB a4, c04, c04
+#endif
+
+#ifdef LN
+ LD a1, 15 * SIZE(AO)
+ LD a2, 14 * SIZE(AO)
+ LD a3, 13 * SIZE(AO)
+ LD a4, 12 * SIZE(AO)
+
+ MUL a1, c04, c04
+ MUL a2, c04, t1
+ SUB c03, t1, c03
+ MUL a3, c04, t1
+ SUB c02, t1, c02
+ MUL a4, c04, t1
+ SUB c01, t1, c01
+
+ LD b1, 10 * SIZE(AO)
+ LD b2, 9 * SIZE(AO)
+ LD b3, 8 * SIZE(AO)
+
+ MUL b1, c03, c03
+ MUL b2, c03, t1
+ SUB c02, t1, c02
+ MUL b3, c03, t1
+ SUB c01, t1, c01
+
+ LD a1, 5 * SIZE(AO)
+ LD a2, 4 * SIZE(AO)
+ LD a3, 0 * SIZE(AO)
+
+ MUL a1, c02, c02
+ MUL a2, c02, t1
+ SUB c01, t1, c01
+ MUL a3, c01, c01
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ MUL a1, c01, c01
+ MUL a2, c01, t1
+ SUB c02, t1, c02
+ MUL a3, c01, t1
+ SUB c03, t1, c03
+ MUL a4, c01, t1
+ SUB c04, t1, c04
+
+ LD b1, 5 * SIZE(AO)
+ LD b2, 6 * SIZE(AO)
+ LD b3, 7 * SIZE(AO)
+
+ MUL b1, c02, c02
+ MUL b2, c02, t1
+ SUB c03, t1, c03
+ MUL b3, c02, t1
+ SUB c04, t1, c04
+
+ LD a1, 10 * SIZE(AO)
+ LD a2, 11 * SIZE(AO)
+ LD a3, 15 * SIZE(AO)
+
+ MUL a1, c03, c03
+ MUL a2, c03, t1
+ SUB c04, t1, c04
+ MUL a3, c04, c04
+#endif
+
+#if defined(RN) || defined(RT)
+ LD a1, 0 * SIZE(BO)
+
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+ MUL a1, c03, c03
+ MUL a1, c04, c04
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c02, 1 * SIZE(BO)
+ ST c03, 2 * SIZE(BO)
+ ST c04, 3 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c03, 2 * SIZE(AO)
+ ST c04, 3 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -4 * SIZE(C1)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c03, 2 * SIZE(C1)
+ ST c04, 3 * SIZE(C1)
+
+#ifndef LN
+ ldi C1, 4 * SIZE(C1)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 2 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 4, KK
+#endif
+
+#ifdef LN
+ subl KK, 4, KK
+#endif
+
+ ldi I, -1(I)
+ bgt I, $L91
+ .align 4
+
+$L119:
+#ifdef LN
+ SXADDQ K, B, B
+#endif
+
+#if defined(LT) || defined(RN)
+ mov BO, B
+#endif
+
+#ifdef RN
+ addl KK, 1, KK
+#endif
+
+#ifdef RT
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L999:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+ clr $0
+ ldi $sp, STACKSIZE($sp)
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/trsm_kernel_4x4_LT.S b/kernel/sw_64/trsm_kernel_4x4_LT.S
new file mode 100644
index 0000000..54f8a51
--- /dev/null
+++ b/kernel/sw_64/trsm_kernel_4x4_LT.S
@@ -0,0 +1,5145 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#if !defined(EV4) && !defined(EV5) && !defined(SW6)
+#error "Architecture is not specified."
+#endif
+
+#ifdef SW6
+#define PREFETCHSIZE 56
+#define UNOP unop
+#endif
+
+#ifdef EV5
+#define PREFETCHSIZE 56
+#define UNOP
+#endif
+
+#ifdef EV4
+#define UNOP
+#endif
+
+#define STACKSIZE 88
+
+#define M $16
+#define N $17
+#define K $18
+#define A $20
+#define B $21
+#define C $22
+#define LDC $23
+
+#define C1 $19
+#define C2 $24
+#define C3 $25
+#define C4 $27
+
+#define AO $at
+#define BO $5
+#define I $6
+#define J $7
+#define L $8
+
+#define a1 $f16
+#define a2 $f17
+#define a3 $f18
+#define a4 $f19
+
+#define b1 $f20
+#define b2 $f21
+#define b3 $f22
+#define b4 $f23
+
+#define t1 $f24
+#define t2 $f25
+#define t3 $f26
+#define t4 $f27
+
+#define a5 $f28
+#define a6 $f30
+#define b5 $f29
+
+#define tmp $9
+
+#define alpha $f30
+
+#define c01 $f0
+#define c02 $f1
+#define c03 $f2
+#define c04 $f3
+
+#define c05 $f4
+#define c06 $f5
+#define c07 $f6
+#define c08 $f7
+
+#define c09 $f8
+#define c10 $f9
+#define c11 $f10
+#define c12 $f11
+
+#define c13 $f12
+#define c14 $f13
+#define c15 $f14
+#define c16 $f15
+
+#define TMP1 $0
+#define TMP2 $1
+#define KK $2
+#define AORIG $3
+#define OFFSET $4
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, STACKSIZE, $26, 0
+
+ ldi $sp, -STACKSIZE($sp)
+
+ ldl C, 0 + STACKSIZE($sp)
+ ldl LDC, 8 + STACKSIZE($sp)
+ ldl OFFSET, 16 + STACKSIZE($sp)
+
+ SXADDQ LDC, 0, LDC
+
+ fstd $f2, 0($sp)
+ fstd $f3, 8($sp)
+ fstd $f4, 16($sp)
+ fstd $f5, 24($sp)
+ fstd $f6, 32($sp)
+ fstd $f7, 40($sp)
+ fstd $f8, 48($sp)
+ fstd $f9, 56($sp)
+ stl $9, 64($sp)
+
+ cmple M, 0, $0
+ cmple N, 0, $1
+ cmple K, 0, $2
+
+ or $0, $1, $0
+ or $0, $2, $0
+ bne $0, $L999
+
+#ifdef LN
+ mulq M, K, TMP1
+ SXADDQ TMP1, A, A
+ SXADDQ M, C, C
+#endif
+
+#ifdef RN
+ negl OFFSET, KK
+#endif
+
+#ifdef RT
+ mulq N, K, TMP1
+ SXADDQ TMP1, B, B
+
+ mulq N, LDC, TMP1
+ addl TMP1, C, C
+
+ subl N, OFFSET, KK
+#endif
+
+ sra N, 2, J
+ ble J, $L40
+ .align 4
+
+$L01:
+#ifdef RT
+ sll K, 2 + BASE_SHIFT, TMP1
+ subl B, TMP1, B
+
+ s4addl LDC, 0, TMP1
+ subl C, TMP1, C
+#endif
+
+ mov C, C1
+ addl C, LDC, C2
+ addl C2, LDC, C3
+#ifndef RT
+ s4addl LDC, C, C
+#endif
+
+ fclr t1
+ addl C3, LDC, C4
+ fclr t2
+
+#ifdef LN
+ addl M, OFFSET, KK
+#endif
+
+#ifdef LT
+ mov OFFSET, KK
+#endif
+
+#if defined(LN) || defined(RT)
+ mov A, AORIG
+#else
+ mov A, AO
+#endif
+
+ sra M, 2, I
+ fclr t3
+ fclr t4
+ ble I, $L20
+ .align 4
+
+$L11:
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c11
+ LD a2, 1 * SIZE(AO)
+ fclr c12
+
+ LD a3, 2 * SIZE(AO)
+ fclr c16
+ LD a4, 3 * SIZE(AO)
+ fclr c15
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c02
+
+ LD b3, 2 * SIZE(B)
+ fclr c06
+ LD b4, 3 * SIZE(B)
+ fclr c05
+
+ fillcs 4 * SIZE(C1)
+ fclr c03
+ ldi L, -2(KK)
+ fclr c04
+
+ fillcs 7 * SIZE(C2)
+ fclr c08
+ ldi BO, 4 * SIZE(B)
+ fclr c13
+
+ fillcs 4 * SIZE(C3)
+ flds $f31, 4 * SIZE(C3)
+ fclr c09
+ ldi AO, 4 * SIZE(AO)
+ fclr c10
+
+ fillcs 7 * SIZE(C4)
+ fclr c14
+ fclr c07
+ ble KK, $L18
+#else
+
+#ifdef LN
+ sll K, BASE_SHIFT + 2, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 2, TMP1
+ addl AORIG, TMP1, AO
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c11
+ LD a2, 1 * SIZE(AO)
+ fclr c12
+
+ LD a3, 2 * SIZE(AO)
+ fclr c16
+ LD a4, 3 * SIZE(AO)
+ fclr c15
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c02
+
+ LD b3, 2 * SIZE(BO)
+ fclr c06
+ LD b4, 3 * SIZE(BO)
+ fclr c05
+
+ fillcs 4 * SIZE(C1)
+ fclr c03
+ ldi L, -2(TMP1)
+ fclr c04
+
+ fillcs 7 * SIZE(C2)
+ fclr c08
+ ldi BO, 4 * SIZE(BO)
+ fclr c13
+
+ fillcs 4 * SIZE(C3)
+ fclr c09
+ ldi AO, 4 * SIZE(AO)
+ fclr c10
+
+ fillcs 7 * SIZE(C4)
+ fclr c14
+ fclr c07
+ ble TMP1, $L18
+#endif
+
+ ble L, $L15
+ .align 5
+
+$L12:
+/* 1 */
+ ADD c11, t1, b5
+ fmov b5, c11
+#ifndef EV4
+ fillcs PREFETCHSIZE * SIZE(AO)
+#else
+ unop
+#endif
+ MUL b1, a1, t1
+#ifndef EV4
+ fillcs PREFETCHSIZE * SIZE(BO)
+#else
+ unop
+#endif
+
+ ADD c12, t2, b5
+ fmov b5, c12
+ unop
+ MUL b1, a2, t2
+ unop
+
+ ADD c16, t3, b5
+ fmov b5, c16
+ unop
+ MUL b2, a2, t3
+ LD a5, 0 * SIZE(AO)
+
+ ADD c15, t4, b5
+ fmov b5, c15
+ unop
+ MUL b2, a1, t4
+ LD b5, 0 * SIZE(BO)
+ FIMOVD b5, tmp
+/* 2 */
+ ADD c01, t1, b5
+ fmov b5, c01
+ UNOP
+ MUL b1, a3, t1
+ UNOP
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ UNOP
+ MUL b1, a4, t2
+ UNOP
+
+ ADD c06, t3, b5
+ fmov b5, c06
+ unop
+ MUL b2, a4, t3
+ unop
+
+ ADD c05, t4, b5
+ fmov b5, c05
+ unop
+ MUL b4, a1, t4
+ unop
+
+/* 3 */
+ ADD c03, t1, b5
+ fmov b5, c03
+ unop
+ MUL b3, a1, t1
+ unop
+
+ ADD c04, t2, b5
+ fmov b5, c04
+ unop
+ MUL b3, a2, t2
+ unop
+
+ ADD c08, t3, b5
+ fmov b5, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, 1 * SIZE(AO)
+
+ ADD c13, t4, b5
+ fmov b5, c13
+ unop
+ MUL b2, a3, t4
+ LD b2, 1 * SIZE(BO)
+
+/* 4 */
+ ADD c09, t1, b5
+ fmov b5, c09
+ unop
+ MUL b3, a3, t1
+ LD a6, 2 * SIZE(AO)
+
+ ADD c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD c14, t3, b5
+ fmov b5, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, 3 * SIZE(AO)
+
+ ADD c07, t4, b5
+ fmov b5, c07
+ unop
+ MUL b4, a3, t4
+ LD b4, 3 * SIZE(BO)
+
+/* 5 */
+ ADD c11, t1, b5
+ fmov b5, c11
+ unop
+ IFMOVD tmp, b5
+ MUL b5, a5, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD c12, t2, b5
+ fmov b5, c12
+ ldi L, -2(L)
+ IFMOVD tmp, b5
+ MUL b5, a2, t2
+ LD b1, 4 * SIZE(BO)
+
+ ADD c16, t3, b5
+ fmov b5, c16
+ unop
+ MUL b2, a2, t3
+ unop
+
+ ADD c15, t4, b5
+ fmov b5, c15
+ unop
+ MUL b2, a5, t4
+ unop
+
+/* 6 */
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ IFMOVD tmp, b5
+ MUL b5, a6, t1
+ unop
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ IFMOVD tmp, b5
+ MUL b5, a4, t2
+ unop
+
+ ADD c06, t3, b5
+ fmov b5, c06
+ unop
+ MUL b2, a4, t3
+ unop
+
+ ADD c05, t4, b5
+ fmov b5, c05
+ unop
+ MUL b4, a5, t4
+ unop
+
+/* 7 */
+ ADD c03, t1, b5
+ fmov b5, c03
+ ldi AO, 8 * SIZE(AO)
+ MUL b3, a5, t1
+ unop
+
+ ADD c04, t2, b5
+ fmov b5, c04
+ ldi BO, 8 * SIZE(BO)
+ MUL b3, a2, t2
+ unop
+
+ ADD c08, t3, b5
+ fmov b5, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, -3 * SIZE(AO)
+
+ ADD c13, t4, b5
+ fmov b5, c13
+ unop
+ MUL b2, a6, t4
+ LD b2, -3 * SIZE(BO)
+
+/* 8 */
+ ADD c09, t1, b5
+ fmov b5, c09
+ unop
+ MUL b3, a6, t1
+ LD a3, -2 * SIZE(AO)
+
+ ADD c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, -2 * SIZE(BO)
+
+ ADD c14, t3, b5
+ fmov b5, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, -1 * SIZE(AO)
+
+ ADD c07, t4, b5
+ fmov b5, c07
+ MUL b4, a6, t4
+ LD b4, -1 * SIZE(BO)
+ bgt L, $L12
+ .align 4
+
+$L15:
+ ADD c11, t1, b5
+ fmov b5, c11
+ MUL b1, a1, b5
+ fmov b5, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L17
+#else
+ blbs TMP1, $L17
+#endif
+ .align 4
+
+ ADD c12, t2, b5
+ fmov b5, c12
+ MUL b1, a2, b5
+ fmov b5, t2
+ ADD c16, t3, b5
+ fmov b5, c16
+ MUL b2, a2, b5
+ fmov b5, t3
+
+ ADD c15, t4, b5
+ fmov b5, c15
+ MUL b2, a1, b5
+ fmov b5, t4
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL b1, a3, b5
+ fmov b5, t1
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL b1, a4, b5
+ fmov b5, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD c06, t3, b5
+ fmov b5, c06
+ MUL b2, a4, b5
+ fmov b5, t3
+ ADD c05, t4, b5
+ fmov b5, c05
+ MUL b4, a1, b5
+ fmov b5, t4
+
+ ADD c03, t1, b5
+ fmov b5, c03
+ unop
+ MUL b3, a1, b5
+ fmov b5, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD c04, t2, b5
+ fmov b5, c04
+ unop
+ MUL b3, a2, b5
+ fmov b5, t2
+ unop
+
+ ADD c08, t3, b5
+ fmov b5, c08
+ unop
+ MUL b4, a2, b5
+ fmov b5, t3
+ LD a2, 1 * SIZE(AO)
+
+ ADD c13, t4, b5
+ fmov b5, c13
+ unop
+ MUL b2, a3, b5
+ fmov b5, t4
+ LD b2, 1 * SIZE(BO)
+
+ ADD c09, t1, b5
+ fmov b5, c09
+ unop
+ MUL b3, a3, b5
+ fmov b5, t1
+ ldi AO, 4 * SIZE(AO)
+
+ ADD c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL b3, a4, b5
+ fmov b5, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD c14, t3, b5
+ fmov b5, c14
+ unop
+ MUL b4, a4, b5
+ fmov b5, t3
+ LD a4, -1 * SIZE(AO)
+
+ ADD c07, t4, b5
+ fmov b5, c07
+ unop
+ MUL b4, a3, b5
+ fmov b5, t4
+ LD a3, -2 * SIZE(AO)
+
+ ADD c11, t1, b5
+ fmov b5, c11
+ LD b4, 3 * SIZE(BO)
+ MUL b1, a1, b5
+ fmov b5, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L17:
+ ADD c12, t2, b5
+ fmov b5, c12
+ MUL b1, a2, b5
+ fmov b5, t2
+ ADD c16, t3, b5
+ fmov b5, c16
+ MUL b2, a2, b5
+ fmov b5, t3
+
+ ADD c15, t4, b5
+ fmov b5, c15
+ MUL b2, a1, b5
+ fmov b5, t4
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL b1, a3, b5
+ fmov b5, t1
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ MUL b1, a4, b5
+ fmov b5, t2
+ ADD c06, t3, b5
+ fmov b5, c06
+ MUL b2, a4, b5
+ fmov b5, t3
+
+ ADD c05, t4, b5
+ fmov b5, c05
+ MUL b4, a1, b5
+ fmov b5, t4
+ ADD c03, t1, b5
+ fmov b5, c03
+ MUL b3, a1, b5
+ fmov b5, t1
+
+ ADD c04, t2, b5
+ fmov b5, c04
+ MUL b3, a2, b5
+ fmov b5, t2
+ ADD c08, t3, b5
+ fmov b5, c08
+ MUL b4, a2, b5
+ fmov b5, t3
+
+ ADD c13, t4, b5
+ fmov b5, c13
+ MUL b2, a3, b5
+ fmov b5, t4
+ ADD c09, t1, b5
+ fmov b5, c09
+ MUL b3, a3, b5
+ fmov b5, t1
+
+ ADD c10, t2, b5
+ fmov b5, c10
+ MUL b3, a4, b5
+ fmov b5, t2
+ ADD c14, t3, b5
+ fmov b5, c14
+ MUL b4, a4, b5
+ fmov b5, t3
+
+ ADD c07, t4, b5
+ fmov b5, c07
+ ldi AO, 4 * SIZE(AO)
+ MUL b4, a3, b5
+ fmov b5, t4
+ ldi BO, 4 * SIZE(BO)
+
+ ADD c11, t1, b5
+ fmov b5, c11
+ ADD c12, t2, b5
+ fmov b5, c12
+ ADD c16, t3, b5
+ fmov b5, c16
+ ADD c15, t4, b5
+ fmov b5, c15
+ .align 4
+
+$L18:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 4, TMP1
+#else
+ subl KK, 4, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -4 * SIZE(AO)
+ ldi BO, -4 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ LD b1, 4 * SIZE(BO)
+ LD b2, 5 * SIZE(BO)
+ LD b3, 6 * SIZE(BO)
+ LD b4, 7 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c05, b5
+ fmov b5, c05
+ SUB a3, c09, b5
+ fmov b5, c09
+ SUB a4, c13, b5
+ fmov b5, c13
+
+ SUB b1, c02, b5
+ fmov b5, c02
+ SUB b2, c06, b5
+ fmov b5, c06
+ SUB b3, c10, b5
+ fmov b5, c10
+ SUB b4, c14, b5
+ fmov b5, c14
+
+ LD a1, 8 * SIZE(BO)
+ LD a2, 9 * SIZE(BO)
+ LD a3, 10 * SIZE(BO)
+ LD a4, 11 * SIZE(BO)
+
+ LD b1, 12 * SIZE(BO)
+ LD b2, 13 * SIZE(BO)
+ LD b3, 14 * SIZE(BO)
+ LD b4, 15 * SIZE(BO)
+
+ SUB a1, c03, b5
+ fmov b5, c03
+ SUB a2, c07, b5
+ fmov b5, c07
+ SUB a3, c11, b5
+ fmov b5, c11
+ SUB a4, c15, b5
+ fmov b5, c15
+
+ SUB b1, c04, b5
+ fmov b5, c04
+ SUB b2, c08, b5
+ fmov b5, c08
+ SUB b3, c12, b5
+ fmov b5, c12
+ SUB b4, c16, b5
+ fmov b5, c16
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 4 * SIZE(AO)
+ LD b2, 5 * SIZE(AO)
+ LD b3, 6 * SIZE(AO)
+ LD b4, 7 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+ SUB a3, c03, b5
+ fmov b5, c03
+ SUB a4, c04, b5
+ fmov b5, c04
+
+ SUB b1, c05, b5
+ fmov b5, c05
+ SUB b2, c06, b5
+ fmov b5, c06
+ SUB b3, c07, b5
+ fmov b5, c07
+ SUB b4, c08, b5
+ fmov b5, c08
+
+ LD a1, 8 * SIZE(AO)
+ LD a2, 9 * SIZE(AO)
+ LD a3, 10 * SIZE(AO)
+ LD a4, 11 * SIZE(AO)
+
+ LD b1, 12 * SIZE(AO)
+ LD b2, 13 * SIZE(AO)
+ LD b3, 14 * SIZE(AO)
+ LD b4, 15 * SIZE(AO)
+
+ SUB a1, c09, b5
+ fmov b5, c09
+ SUB a2, c10, b5
+ fmov b5, c10
+ SUB a3, c11, b5
+ fmov b5, c11
+ SUB a4, c12, b5
+ fmov b5, c12
+
+ SUB b1, c13, b5
+ fmov b5, c13
+ SUB b2, c14, b5
+ fmov b5, c14
+ SUB b3, c15, b5
+ fmov b5, c15
+ SUB b4, c16, b5
+ fmov b5, c16
+#endif
+
+#ifdef LN
+ LD a1, 15 * SIZE(AO)
+ LD a2, 14 * SIZE(AO)
+ LD a3, 13 * SIZE(AO)
+ LD a4, 12 * SIZE(AO)
+
+ MUL a1, c04, b5
+ fmov b5, c04
+ MUL a1, c08, b5
+ fmov b5, c08
+ MUL a1, c12, b5
+ fmov b5, c12
+ MUL a1, c16, b5
+ fmov b5, c16
+
+ MUL a2, c04, b5
+ fmov b5, t1
+ MUL a2, c08, b5
+ fmov b5, t2
+ MUL a2, c12, b5
+ fmov b5, t3
+ MUL a2, c16, b5
+ fmov b5, t4
+
+ SUB c03, t1, b5
+ fmov b5, c03
+ SUB c07, t2, b5
+ fmov b5, c07
+ SUB c11, t3, b5
+ fmov b5, c11
+ SUB c15, t4, b5
+ fmov b5, c15
+
+ MUL a3, c04, b5
+ fmov b5, t1
+ MUL a3, c08, b5
+ fmov b5, t2
+ MUL a3, c12, b5
+ fmov b5, t3
+ MUL a3, c16, b5
+ fmov b5, t4
+
+ SUB c02, t1, b5
+ fmov b5, c02
+ SUB c06, t2, b5
+ fmov b5, c06
+ SUB c10, t3, b5
+ fmov b5, c10
+ SUB c14, t4, b5
+ fmov b5, c14
+
+ MUL a4, c04, b5
+ fmov b5, t1
+ MUL a4, c08, b5
+ fmov b5, t2
+ MUL a4, c12, b5
+ fmov b5, t3
+ MUL a4, c16, b5
+ fmov b5, t4
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c05, t2, b5
+ fmov b5, c05
+ SUB c09, t3, b5
+ fmov b5, c09
+ SUB c13, t4, b5
+ fmov b5, c13
+
+ LD b1, 10 * SIZE(AO)
+ LD b2, 9 * SIZE(AO)
+ LD b3, 8 * SIZE(AO)
+
+ MUL b1, c03, b5
+ fmov b5, c03
+ MUL b1, c07, b5
+ fmov b5, c07
+ MUL b1, c11, b5
+ fmov b5, c11
+ MUL b1, c15, b5
+ fmov b5, c15
+
+ MUL b2, c03, b5
+ fmov b5, t1
+ MUL b2, c07, b5
+ fmov b5, t2
+ MUL b2, c11, b5
+ fmov b5, t3
+ MUL b2, c15, b5
+ fmov b5, t4
+
+ SUB c02, t1, b5
+ fmov b5, c02
+ SUB c06, t2, b5
+ fmov b5, c06
+ SUB c10, t3, b5
+ fmov b5, c10
+ SUB c14, t4, b5
+ fmov b5, c14
+
+ MUL b3, c03, b5
+ fmov b5, t1
+ MUL b3, c07, b5
+ fmov b5, t2
+ MUL b3, c11, b5
+ fmov b5, t3
+ MUL b3, c15, b5
+ fmov b5, t4
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c05, t2, b5
+ fmov b5, c05
+ SUB c09, t3, b5
+ fmov b5, c09
+ SUB c13, t4, b5
+ fmov b5, c13
+
+ LD a1, 5 * SIZE(AO)
+ LD a2, 4 * SIZE(AO)
+ LD a3, 0 * SIZE(AO)
+
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a1, c06, b5
+ fmov b5, c06
+ MUL a1, c10, b5
+ fmov b5, c10
+ MUL a1, c14, b5
+ fmov b5, c14
+
+ MUL a2, c02, b5
+ fmov b5, t1
+ MUL a2, c06, b5
+ fmov b5, t2
+ MUL a2, c10, b5
+ fmov b5, t3
+ MUL a2, c14, b5
+ fmov b5, t4
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c05, t2, b5
+ fmov b5, c05
+ SUB c09, t3, b5
+ fmov b5, c09
+ SUB c13, t4, b5
+ fmov b5, c13
+
+ MUL a3, c01, b5
+ fmov b5, c01
+ MUL a3, c05, b5
+ fmov b5, c05
+ MUL a3, c09, b5
+ fmov b5, c09
+ MUL a3, c13, b5
+ fmov b5, c13
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c05, b5
+ fmov b5, c05
+ MUL a1, c09, b5
+ fmov b5, c09
+ MUL a1, c13, b5
+ fmov b5, c13
+
+ MUL a2, c01, b5
+ fmov b5, t1
+ MUL a2, c05, b5
+ fmov b5, t2
+ MUL a2, c09, b5
+ fmov b5, t3
+ MUL a2, c13, b5
+ fmov b5, t4
+
+ SUB c02, t1, b5
+ fmov b5, c02
+ SUB c06, t2, b5
+ fmov b5, c06
+ SUB c10, t3, b5
+ fmov b5, c10
+ SUB c14, t4, b5
+ fmov b5, c14
+
+ MUL a3, c01, b5
+ fmov b5, t1
+ MUL a3, c05, b5
+ fmov b5, t2
+ MUL a3, c09, b5
+ fmov b5, t3
+ MUL a3, c13, b5
+ fmov b5, t4
+
+ SUB c03, t1, b5
+ fmov b5, c03
+ SUB c07, t2, b5
+ fmov b5, c07
+ SUB c11, t3, b5
+ fmov b5, c11
+ SUB c15, t4, b5
+ fmov b5, c15
+
+ MUL a4, c01, b5
+ fmov b5, t1
+ MUL a4, c05, b5
+ fmov b5, t2
+ MUL a4, c09, b5
+ fmov b5, t3
+ MUL a4, c13, b5
+ fmov b5, t4
+
+ SUB c04, t1, b5
+ fmov b5, c04
+ SUB c08, t2, b5
+ fmov b5, c08
+ SUB c12, t3, b5
+ fmov b5, c12
+ SUB c16, t4, b5
+ fmov b5, c16
+
+ LD b1, 5 * SIZE(AO)
+ LD b2, 6 * SIZE(AO)
+ LD b3, 7 * SIZE(AO)
+
+ MUL b1, c02, b5
+ fmov b5, c02
+ MUL b1, c06, b5
+ fmov b5, c06
+ MUL b1, c10, b5
+ fmov b5, c10
+ MUL b1, c14, b5
+ fmov b5, c14
+
+ MUL b2, c02, b5
+ fmov b5, t1
+ MUL b2, c06, b5
+ fmov b5, t2
+ MUL b2, c10, b5
+ fmov b5, t3
+ MUL b2, c14, b5
+ fmov b5, t4
+
+ SUB c03, t1, b5
+ fmov b5, c03
+ SUB c07, t2, b5
+ fmov b5, c07
+ SUB c11, t3, b5
+ fmov b5, c11
+ SUB c15, t4, b5
+ fmov b5, c15
+
+ MUL b3, c02, b5
+ fmov b5, t1
+ MUL b3, c06, b5
+ fmov b5, t2
+ MUL b3, c10, b5
+ fmov b5, t3
+ MUL b3, c14, b5
+ fmov b5, t4
+
+ SUB c04, t1, b5
+ fmov b5, c04
+ SUB c08, t2, b5
+ fmov b5, c08
+ SUB c12, t3, b5
+ fmov b5, c12
+ SUB c16, t4, b5
+ fmov b5, c16
+
+ LD a1, 10 * SIZE(AO)
+ LD a2, 11 * SIZE(AO)
+ LD a3, 15 * SIZE(AO)
+
+ MUL a1, c03, b5
+ fmov b5, c03
+ MUL a1, c07, b5
+ fmov b5, c07
+ MUL a1, c11, b5
+ fmov b5, c11
+ MUL a1, c15, b5
+ fmov b5, c15
+
+ MUL a2, c03, b5
+ fmov b5, t1
+ MUL a2, c07, b5
+ fmov b5, t2
+ MUL a2, c11, b5
+ fmov b5, t3
+ MUL a2, c15, b5
+ fmov b5, t4
+
+ SUB c04, t1, b5
+ fmov b5, c04
+ SUB c08, t2, b5
+ fmov b5, c08
+ SUB c12, t3, b5
+ fmov b5, c12
+ SUB c16, t4, b5
+ fmov b5, c16
+
+ MUL a3, c04, b5
+ fmov b5, c04
+ MUL a3, c08, b5
+ fmov b5, c08
+ MUL a3, c12, b5
+ fmov b5, c12
+ MUL a3, c16, b5
+ fmov b5, c16
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a1, c03, b5
+ fmov b5, c03
+ MUL a1, c04, b5
+ fmov b5, c04
+
+ MUL a2, c01, b5
+ fmov b5, t1
+ MUL a2, c02, b5
+ fmov b5, t2
+ MUL a2, c03, b5
+ fmov b5, t3
+ MUL a2, c04, b5
+ fmov b5, t4
+
+ SUB c05, t1, b5
+ fmov b5, c05
+ SUB c06, t2, b5
+ fmov b5, c06
+ SUB c07, t3, b5
+ fmov b5, c07
+ SUB c08, t4, b5
+ fmov b5, c08
+
+ MUL a3, c01, b5
+ fmov b5, t1
+ MUL a3, c02, b5
+ fmov b5, t2
+ MUL a3, c03, b5
+ fmov b5, t3
+ MUL a3, c04, b5
+ fmov b5, t4
+
+ SUB c09, t1, b5
+ fmov b5, c09
+ SUB c10, t2, b5
+ fmov b5, c10
+ SUB c11, t3, b5
+ fmov b5, c11
+ SUB c12, t4, b5
+ fmov b5, c12
+
+ MUL a4, c01, b5
+ fmov b5, t1
+ MUL a4, c02, b5
+ fmov b5, t2
+ MUL a4, c03, b5
+ fmov b5, t3
+ MUL a4, c04, b5
+ fmov b5, t4
+
+ SUB c13, t1, b5
+ fmov b5, c13
+ SUB c14, t2, b5
+ fmov b5, c14
+ SUB c15, t3, b5
+ fmov b5, c15
+ SUB c16, t4, b5
+ fmov b5, c16
+
+ LD b1, 5 * SIZE(BO)
+ LD b2, 6 * SIZE(BO)
+ LD b3, 7 * SIZE(BO)
+
+ MUL b1, c05, b5
+ fmov b5, c05
+ MUL b1, c06, b5
+ fmov b5, c06
+ MUL b1, c07, b5
+ fmov b5, c07
+ MUL b1, c08, b5
+ fmov b5, c08
+
+ MUL b2, c05, b5
+ fmov b5, t1
+ MUL b2, c06, b5
+ fmov b5, t2
+ MUL b2, c07, b5
+ fmov b5, t3
+ MUL b2, c08, b5
+ fmov b5, t4
+
+ SUB c09, t1, b5
+ fmov b5, c09
+ SUB c10, t2, b5
+ fmov b5, c10
+ SUB c11, t3, b5
+ fmov b5, c11
+ SUB c12, t4, b5
+ fmov b5, c12
+
+ MUL b3, c05, b5
+ fmov b5, t1
+ MUL b3, c06, b5
+ fmov b5, t2
+ MUL b3, c07, b5
+ fmov b5, t3
+ MUL b3, c08, b5
+ fmov b5, t4
+
+ SUB c13, t1, b5
+ fmov b5, c13
+ SUB c14, t2, b5
+ fmov b5, c14
+ SUB c15, t3, b5
+ fmov b5, c15
+ SUB c16, t4, b5
+ fmov b5, c16
+
+ LD a1, 10 * SIZE(BO)
+ LD a2, 11 * SIZE(BO)
+ LD a3, 15 * SIZE(BO)
+
+ MUL a1, c09, b5
+ fmov b5, c09
+ MUL a1, c10, b5
+ fmov b5, c10
+ MUL a1, c11, b5
+ fmov b5, c11
+ MUL a1, c12, b5
+ fmov b5, c12
+
+ MUL a2, c09, b5
+ fmov b5, t1
+ MUL a2, c10, b5
+ fmov b5, t2
+ MUL a2, c11, b5
+ fmov b5, t3
+ MUL a2, c12, b5
+ fmov b5, t4
+
+ SUB c13, t1, b5
+ fmov b5, c13
+ SUB c14, t2, b5
+ fmov b5, c14
+ SUB c15, t3, b5
+ fmov b5, c15
+ SUB c16, t4, b5
+ fmov b5, c16
+
+ MUL a3, c13, b5
+ fmov b5, c13
+ MUL a3, c14, b5
+ fmov b5, c14
+ MUL a3, c15, b5
+ fmov b5, c15
+ MUL a3, c16, b5
+ fmov b5, c16
+#endif
+
+#ifdef RT
+ LD a1, 15 * SIZE(BO)
+ LD a2, 14 * SIZE(BO)
+ LD a3, 13 * SIZE(BO)
+ LD a4, 12 * SIZE(BO)
+
+ MUL a1, c13, b5
+ fmov b5, c13
+ MUL a1, c14, b5
+ fmov b5, c14
+ MUL a1, c15, b5
+ fmov b5, c15
+ MUL a1, c16, b5
+ fmov b5, c16
+
+ MUL a2, c13, b5
+ fmov b5, t1
+ MUL a2, c14, b5
+ fmov b5, t2
+ MUL a2, c15, b5
+ fmov b5, t3
+ MUL a2, c16, b5
+ fmov b5, t4
+
+ SUB c09, t1, b5
+ fmov b5, c09
+ SUB c10, t2, b5
+ fmov b5, c10
+ SUB c11, t3, b5
+ fmov b5, c11
+ SUB c12, t4, b5
+ fmov b5, c12
+
+ MUL a3, c13, b5
+ fmov b5, t1
+ MUL a3, c14, b5
+ fmov b5, t2
+ MUL a3, c15, b5
+ fmov b5, t3
+ MUL a3, c16, b5
+ fmov b5, t4
+
+ SUB c05, t1, b5
+ fmov b5, c05
+ SUB c06, t2, b5
+ fmov b5, c06
+ SUB c07, t3, b5
+ fmov b5, c07
+ SUB c08, t4, b5
+ fmov b5, c08
+
+ MUL a4, c13, b5
+ fmov b5, t1
+ MUL a4, c14, b5
+ fmov b5, t2
+ MUL a4, c15, b5
+ fmov b5, t3
+ MUL a4, c16, b5
+ fmov b5, t4
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c02, t2, b5
+ fmov b5, c02
+ SUB c03, t3, b5
+ fmov b5, c03
+ SUB c04, t4, b5
+ fmov b5, c04
+
+ LD b1, 10 * SIZE(BO)
+ LD b2, 9 * SIZE(BO)
+ LD b3, 8 * SIZE(BO)
+
+ MUL b1, c09, b5
+ fmov b5, c09
+ MUL b1, c10, b5
+ fmov b5, c10
+ MUL b1, c11, b5
+ fmov b5, c11
+ MUL b1, c12, b5
+ fmov b5, c12
+
+ MUL b2, c09, b5
+ fmov b5, t1
+ MUL b2, c10, b5
+ fmov b5, t2
+ MUL b2, c11, b5
+ fmov b5, t3
+ MUL b2, c12, b5
+ fmov b5, t4
+
+ SUB c05, t1, b5
+ fmov b5, c05
+ SUB c06, t2, b5
+ fmov b5, c06
+ SUB c07, t3, b5
+ fmov b5, c07
+ SUB c08, t4, b5
+ fmov b5, c08
+
+ MUL b3, c09, b5
+ fmov b5, t1
+ MUL b3, c10, b5
+ fmov b5, t2
+ MUL b3, c11, b5
+ fmov b5, t3
+ MUL b3, c12, b5
+ fmov b5, t4
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c02, t2, b5
+ fmov b5, c02
+ SUB c03, t3, b5
+ fmov b5, c03
+ SUB c04, t4, b5
+ fmov b5, c04
+
+ LD a1, 5 * SIZE(BO)
+ LD a2, 4 * SIZE(BO)
+ LD a3, 0 * SIZE(BO)
+
+ MUL a1, c05, b5
+ fmov b5, c05
+ MUL a1, c06, b5
+ fmov b5, c06
+ MUL a1, c07, b5
+ fmov b5, c07
+ MUL a1, c08, b5
+ fmov b5, c08
+
+ MUL a2, c05, b5
+ fmov b5, t1
+ MUL a2, c06, b5
+ fmov b5, t2
+ MUL a2, c07, b5
+ fmov b5, t3
+ MUL a2, c08, b5
+ fmov b5, t4
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c02, t2, b5
+ fmov b5, c02
+ SUB c03, t3, b5
+ fmov b5, c03
+ SUB c04, t4, b5
+ fmov b5, c04
+
+ MUL a3, c01, b5
+ fmov b5, c01
+ MUL a3, c02, b5
+ fmov b5, c02
+ MUL a3, c03, b5
+ fmov b5, c03
+ MUL a3, c04, b5
+ fmov b5, c04
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c05, 1 * SIZE(BO)
+ ST c09, 2 * SIZE(BO)
+ ST c13, 3 * SIZE(BO)
+
+ ST c02, 4 * SIZE(BO)
+ ST c06, 5 * SIZE(BO)
+ ST c10, 6 * SIZE(BO)
+ ST c14, 7 * SIZE(BO)
+
+ ST c03, 8 * SIZE(BO)
+ ST c07, 9 * SIZE(BO)
+ ST c11, 10 * SIZE(BO)
+ ST c15, 11 * SIZE(BO)
+
+ ST c04, 12 * SIZE(BO)
+ ST c08, 13 * SIZE(BO)
+ ST c12, 14 * SIZE(BO)
+ ST c16, 15 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c03, 2 * SIZE(AO)
+ ST c04, 3 * SIZE(AO)
+
+ ST c05, 4 * SIZE(AO)
+ ST c06, 5 * SIZE(AO)
+ ST c07, 6 * SIZE(AO)
+ ST c08, 7 * SIZE(AO)
+
+ ST c09, 8 * SIZE(AO)
+ ST c10, 9 * SIZE(AO)
+ ST c11, 10 * SIZE(AO)
+ ST c12, 11 * SIZE(AO)
+
+ ST c13, 12 * SIZE(AO)
+ ST c14, 13 * SIZE(AO)
+ ST c15, 14 * SIZE(AO)
+ ST c16, 15 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -4 * SIZE(C1)
+ ldi C2, -4 * SIZE(C2)
+ ldi C3, -4 * SIZE(C3)
+ ldi C4, -4 * SIZE(C4)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c03, 2 * SIZE(C1)
+ ST c04, 3 * SIZE(C1)
+
+ ST c05, 0 * SIZE(C2)
+ ST c06, 1 * SIZE(C2)
+ ST c07, 2 * SIZE(C2)
+ ST c08, 3 * SIZE(C2)
+
+ ST c09, 0 * SIZE(C3)
+ ST c10, 1 * SIZE(C3)
+ ST c11, 2 * SIZE(C3)
+ ST c12, 3 * SIZE(C3)
+
+ ST c13, 0 * SIZE(C4)
+ ST c14, 1 * SIZE(C4)
+ ST c15, 2 * SIZE(C4)
+ ST c16, 3 * SIZE(C4)
+
+#ifndef LN
+ ldi C1, 4 * SIZE(C1)
+ ldi C2, 4 * SIZE(C2)
+ ldi C3, 4 * SIZE(C3)
+ ldi C4, 4 * SIZE(C4)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 2 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 2, TMP1
+ addl AO, TMP1, AO
+ addl BO, TMP1, BO
+#endif
+
+#ifdef LT
+ addl KK, 4, KK
+#endif
+
+#ifdef LN
+ subl KK, 4, KK
+#endif
+
+ ldi I, -1(I)
+
+ bgt I, $L11
+ .align 4
+
+$L20:
+ and M, 2, I
+ ble I, $L30
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c10
+ LD a4, 3 * SIZE(AO)
+ fclr c14
+
+ LD b1, 0 * SIZE(B)
+ ldi L, -2(KK)
+ LD b2, 1 * SIZE(B)
+ ldi AO, 2 * SIZE(AO)
+
+ LD b3, 2 * SIZE(B)
+ fclr c01
+ LD b4, 3 * SIZE(B)
+ fclr c05
+
+ ldi BO, 4 * SIZE(B)
+ fclr c02
+ fclr c06
+ ble KK, $L28
+
+ ble L, $L25
+
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 1, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 2, TMP2
+ addl B, TMP2, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c10
+ LD a4, 3 * SIZE(AO)
+ fclr c14
+
+ LD b1, 0 * SIZE(BO)
+ ldi L, -2(TMP1)
+ LD b2, 1 * SIZE(BO)
+ ldi AO, 2 * SIZE(AO)
+
+ LD b3, 2 * SIZE(BO)
+ fclr c01
+ LD b4, 3 * SIZE(BO)
+ fclr c05
+
+ ldi BO, 4 * SIZE(BO)
+ fclr c02
+ fclr c06
+ ble TMP1, $L28
+
+ ble L, $L25
+#endif
+ .align 4
+
+$L22:
+ ADD c09, t1, b5
+ fmov b5, c09
+ unop
+ MUL a1, b1, t1
+ unop
+
+ ADD c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD c13, t3, b5
+ fmov b5, c13
+ unop
+ MUL a1, b2, t3
+ ldi BO, 8 * SIZE(BO)
+
+ ADD c14, t4, b5
+ fmov b5, c14
+ unop
+ MUL a2, b2, t4
+ LD b2, -7 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b3, t1
+ unop
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b3, t2
+ LD b3, -6 * SIZE(BO)
+
+ ADD c05, t3, b5
+ fmov b5, c05
+ unop
+ MUL a1, b4, t3
+ LD a1, 2 * SIZE(AO)
+
+ ADD c06, t4, b5
+ fmov b5, c06
+ MUL a2, b4, t4
+ LD b5, -5 * SIZE(BO)
+ FIMOVD b5, tmp
+
+ ADD c09, t1, b5
+ fmov b5, c09
+ unop
+ MUL a3, b1, t1
+ LD a2, 3 * SIZE(AO)
+
+ ADD c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL a4, b1, t2
+ LD b1, -4 * SIZE(BO)
+
+ ADD c13, t3, b5
+ fmov b5, c13
+ unop
+ MUL a3, b2, t3
+ ldi AO, 4 * SIZE(AO)
+
+ ADD c14, t4, b5
+ fmov b5, c14
+ MUL a4, b2, t4
+ LD b2, -3 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ ldi L, -2(L)
+ MUL a3, b3, t1
+ LD b4, -1 * SIZE(BO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a4, b3, t2
+ LD b3, -2 * SIZE(BO)
+
+ ADD c05, t3, b5
+ fmov b5, c05
+ unop
+ IFMOVD tmp, b5
+ MUL a3, b5, t3
+ LD a3, 0 * SIZE(AO)
+
+ ADD c06, t4, b5
+ fmov b5, c06
+ IFMOVD tmp, b5
+ MUL a4, b5, t4
+ LD a4, 1 * SIZE(AO)
+ bgt L, $L22
+ .align 4
+
+$L25:
+ ADD c09, t1, b5
+ fmov b5, c09
+ MUL a1, b1, b5
+ fmov b5, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L27
+#else
+ blbs TMP1, $L27
+#endif
+
+ ADD c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL a2, b1, b5
+ fmov b5, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD c13, t3, b5
+ fmov b5, c13
+ unop
+ MUL a1, b2, b5
+ fmov b5, t3
+ unop
+
+ ADD c14, t4, b5
+ fmov b5, c14
+ unop
+ MUL a2, b2, b5
+ fmov b5, t4
+ LD b2, 1 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b3, b5
+ fmov b5, t1
+ ldi AO, 2 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b3, b5
+ fmov b5, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD c05, t3, b5
+ fmov b5, c05
+ unop
+ MUL a1, b4, b5
+ fmov b5, t3
+ LD a1, -2 * SIZE(AO)
+
+ ADD c06, t4, b5
+ fmov b5, c06
+ unop
+ MUL a2, b4, b5
+ fmov b5, t4
+ LD a2, -1 * SIZE(AO)
+
+ ADD c09, t1, b5
+ fmov b5, c09
+ LD b4, 3 * SIZE(BO)
+ MUL a1, b1, b5
+ fmov b5, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L27:
+ ADD c10, t2, b5
+ fmov b5, c10
+ MUL a2, b1, b5
+ fmov b5, t2
+ ADD c13, t3, b5
+ fmov b5, c13
+ MUL a1, b2, b5
+ fmov b5, t3
+
+ ADD c14, t4, b5
+ fmov b5, c14
+ MUL a2, b2, b5
+ fmov b5, t4
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL a1, b3, b5
+ fmov b5, t1
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ MUL a2, b3, b5
+ fmov b5, t2
+ ADD c05, t3, b5
+ fmov b5, c05
+ MUL a1, b4, b5
+ fmov b5, t3
+
+ ADD c06, t4, b5
+ fmov b5, c06
+ ldi AO, 2 * SIZE(AO)
+ MUL a2, b4, b5
+ fmov b5, t4
+ ldi BO, 4 * SIZE(BO)
+
+ ADD c09, t1, b5
+ fmov b5, c09
+ ADD c10, t2, b5
+ fmov b5, c10
+ ADD c13, t3, b5
+ fmov b5, c13
+ ADD c14, t4, b5
+ fmov b5, c14
+ .align 4
+
+$L28:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 2, TMP1
+#else
+ subl KK, 4, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -2 * SIZE(AO)
+ ldi BO, -4 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ LD b1, 4 * SIZE(BO)
+ LD b2, 5 * SIZE(BO)
+ LD b3, 6 * SIZE(BO)
+ LD b4, 7 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c05, b5
+ fmov b5, c05
+ SUB a3, c09, b5
+ fmov b5, c09
+ SUB a4, c13, b5
+ fmov b5, c13
+
+ SUB b1, c02, b5
+ fmov b5, c02
+ SUB b2, c06, b5
+ fmov b5, c06
+ SUB b3, c10, b5
+ fmov b5, c10
+ SUB b4, c14, b5
+ fmov b5, c14
+
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 4 * SIZE(AO)
+ LD b2, 5 * SIZE(AO)
+ LD b3, 6 * SIZE(AO)
+ LD b4, 7 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+ SUB a3, c05, b5
+ fmov b5, c05
+ SUB a4, c06, b5
+ fmov b5, c06
+
+ SUB b1, c09, b5
+ fmov b5, c09
+ SUB b2, c10, b5
+ fmov b5, c10
+ SUB b3, c13, b5
+ fmov b5, c13
+ SUB b4, c14, b5
+ fmov b5, c14
+#endif
+
+#ifdef LN
+ LD a1, 3 * SIZE(AO)
+ LD a2, 2 * SIZE(AO)
+ LD a3, 0 * SIZE(AO)
+
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a1, c06, b5
+ fmov b5, c06
+ MUL a1, c10, b5
+ fmov b5, c10
+ MUL a1, c14, b5
+ fmov b5, c14
+
+ MUL a2, c02, b5
+ fmov b5, t1
+ MUL a2, c06, b5
+ fmov b5, t2
+ MUL a2, c10, b5
+ fmov b5, t3
+ MUL a2, c14, b5
+ fmov b5, t4
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c05, t2, b5
+ fmov b5, c05
+ SUB c09, t3, b5
+ fmov b5, c09
+ SUB c13, t4, b5
+ fmov b5, c13
+
+ MUL a3, c01, b5
+ fmov b5, c01
+ MUL a3, c05, b5
+ fmov b5, c05
+ MUL a3, c09, b5
+ fmov b5, c09
+ MUL a3, c13, b5
+ fmov b5, c13
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 3 * SIZE(AO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c05, b5
+ fmov b5, c05
+ MUL a1, c09, b5
+ fmov b5, c09
+ MUL a1, c13, b5
+ fmov b5, c13
+
+ MUL a2, c01, b5
+ fmov b5, t1
+ MUL a2, c05, b5
+ fmov b5, t2
+ MUL a2, c09, b5
+ fmov b5, t3
+ MUL a2, c13, b5
+ fmov b5, t4
+
+ SUB c02, t1, b5
+ fmov b5, c02
+ SUB c06, t2, b5
+ fmov b5, c06
+ SUB c10, t3, b5
+ fmov b5, c10
+ SUB c14, t4, b5
+ fmov b5, c14
+
+ MUL a3, c02, b5
+ fmov b5, c02
+ MUL a3, c06, b5
+ fmov b5, c06
+ MUL a3, c10, b5
+ fmov b5, c10
+ MUL a3, c14, b5
+ fmov b5, c14
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+
+ MUL a2, c01, b5
+ fmov b5, t1
+ MUL a2, c02, b5
+ fmov b5, t2
+
+ SUB c05, t1, b5
+ fmov b5, c05
+ SUB c06, t2, b5
+ fmov b5, c06
+
+ MUL a3, c01, b5
+ fmov b5, t1
+ MUL a3, c02, b5
+ fmov b5, t2
+
+ SUB c09, t1, b5
+ fmov b5, c09
+ SUB c10, t2, b5
+ fmov b5, c10
+
+ MUL a4, c01, b5
+ fmov b5, t1
+ MUL a4, c02, b5
+ fmov b5, t2
+
+ SUB c13, t1, b5
+ fmov b5, c13
+ SUB c14, t2, b5
+ fmov b5, c14
+
+ LD b1, 5 * SIZE(BO)
+ LD b2, 6 * SIZE(BO)
+ LD b3, 7 * SIZE(BO)
+
+ MUL b1, c05, b5
+ fmov b5, c05
+ MUL b1, c06, b5
+ fmov b5, c06
+
+ MUL b2, c05, b5
+ fmov b5, t1
+ MUL b2, c06, b5
+ fmov b5, t2
+
+ SUB c09, t1, b5
+ fmov b5, c09
+ SUB c10, t2, b5
+ fmov b5, c10
+
+ MUL b3, c05, b5
+ fmov b5, t1
+ MUL b3, c06, b5
+ fmov b5, t2
+
+ SUB c13, t1, b5
+ fmov b5, c13
+ SUB c14, t2, b5
+ fmov b5, c14
+
+ LD a1, 10 * SIZE(BO)
+ LD a2, 11 * SIZE(BO)
+ LD a3, 15 * SIZE(BO)
+
+ MUL a1, c09, b5
+ fmov b5, c09
+ MUL a1, c10, b5
+ fmov b5, c10
+
+ MUL a2, c09, b5
+ fmov b5, t1
+ MUL a2, c10, b5
+ fmov b5, t2
+
+ SUB c13, t1, b5
+ fmov b5, c13
+ SUB c14, t2, b5
+ fmov b5, c14
+
+ MUL a3, c13, b5
+ fmov b5, c13
+ MUL a3, c14, b5
+ fmov b5, c14
+#endif
+
+#ifdef RT
+ LD a1, 15 * SIZE(BO)
+ LD a2, 14 * SIZE(BO)
+ LD a3, 13 * SIZE(BO)
+ LD a4, 12 * SIZE(BO)
+
+ MUL a1, c13, b5
+ fmov b5, c13
+ MUL a1, c14, b5
+ fmov b5, c14
+
+ MUL a2, c13, b5
+ fmov b5, t1
+ MUL a2, c14, b5
+ fmov b5, t2
+
+ SUB c09, t1, b5
+ fmov b5, c09
+ SUB c10, t2, b5
+ fmov b5, c10
+
+ MUL a3, c13, b5
+ fmov b5, t1
+ MUL a3, c14, b5
+ fmov b5, t2
+
+ SUB c05, t1, b5
+ fmov b5, c05
+ SUB c06, t2, b5
+ fmov b5, c06
+
+ MUL a4, c13, b5
+ fmov b5, t1
+ MUL a4, c14, b5
+ fmov b5, t2
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c02, t2, b5
+ fmov b5, c02
+
+ LD b1, 10 * SIZE(BO)
+ LD b2, 9 * SIZE(BO)
+ LD b3, 8 * SIZE(BO)
+
+ MUL b1, c09, b5
+ fmov b5, c09
+ MUL b1, c10, b5
+ fmov b5, c10
+
+ MUL b2, c09, b5
+ fmov b5, t1
+ MUL b2, c10, b5
+ fmov b5, t2
+
+ SUB c05, t1, b5
+ fmov b5, c05
+ SUB c06, t2, b5
+ fmov b5, c06
+
+ MUL b3, c09, b5
+ fmov b5, t1
+ MUL b3, c10, b5
+ fmov b5, t2
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c02, t2, b5
+ fmov b5, c02
+
+ LD a1, 5 * SIZE(BO)
+ LD a2, 4 * SIZE(BO)
+ LD a3, 0 * SIZE(BO)
+
+ MUL a1, c05, b5
+ fmov b5, c05
+ MUL a1, c06, b5
+ fmov b5, c06
+
+ MUL a2, c05, b5
+ fmov b5, t1
+ MUL a2, c06, b5
+ fmov b5, t2
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c02, t2, b5
+ fmov b5, c02
+
+ MUL a3, c01, b5
+ fmov b5, c01
+ MUL a3, c02, b5
+ fmov b5, c02
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c05, 1 * SIZE(BO)
+ ST c09, 2 * SIZE(BO)
+ ST c13, 3 * SIZE(BO)
+
+ ST c02, 4 * SIZE(BO)
+ ST c06, 5 * SIZE(BO)
+ ST c10, 6 * SIZE(BO)
+ ST c14, 7 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c05, 2 * SIZE(AO)
+ ST c06, 3 * SIZE(AO)
+
+ ST c09, 4 * SIZE(AO)
+ ST c10, 5 * SIZE(AO)
+ ST c13, 6 * SIZE(AO)
+ ST c14, 7 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -2 * SIZE(C1)
+ ldi C2, -2 * SIZE(C2)
+ ldi C3, -2 * SIZE(C3)
+ ldi C4, -2 * SIZE(C4)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c05, 0 * SIZE(C2)
+ ST c06, 1 * SIZE(C2)
+
+ ST c09, 0 * SIZE(C3)
+ ST c10, 1 * SIZE(C3)
+ ST c13, 0 * SIZE(C4)
+ ST c14, 1 * SIZE(C4)
+
+#ifndef LN
+ ldi C1, 2 * SIZE(C1)
+ ldi C2, 2 * SIZE(C2)
+ ldi C3, 2 * SIZE(C3)
+ ldi C4, 2 * SIZE(C4)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 1 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 2, KK
+#endif
+
+#ifdef LN
+ subl KK, 2, KK
+#endif
+ .align 4
+
+$L30:
+ and M, 1, I
+ ble I, $L39
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+
+ LD b1, 0 * SIZE(B)
+ ldi L, -2(KK)
+ LD b2, 1 * SIZE(B)
+ ldi AO, 1 * SIZE(AO)
+
+ LD b3, 2 * SIZE(B)
+ fclr c09
+ LD b4, 3 * SIZE(B)
+ fclr c13
+
+ ldi BO, 4 * SIZE(B)
+ ble KK, $L38
+
+ ble L, $L35
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 0, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 2, TMP2
+ addl B, TMP2, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+
+ LD b1, 0 * SIZE(BO)
+ ldi L, -2(TMP1)
+ LD b2, 1 * SIZE(BO)
+ ldi AO, 1 * SIZE(AO)
+
+ LD b3, 2 * SIZE(BO)
+ fclr c09
+ LD b4, 3 * SIZE(BO)
+ fclr c13
+
+ ldi BO, 4 * SIZE(BO)
+ ble TMP1, $L38
+
+ ble L, $L35
+#endif
+ .align 4
+
+$L32:
+ ADD c01, t1, b5
+	fmov b5, c01
+ ldi L, -2(L)
+ MUL a1, b1, t1
+ LD b1, 0 * SIZE(BO)
+
+ ADD c05, t2, b5
+ fmov b5, c05
+ ldi AO, 2 * SIZE(AO)
+ MUL a1, b2, t2
+ LD b2, 1 * SIZE(BO)
+
+ ADD c09, t3, b5
+ fmov b5, c09
+ LD b5, 3 * SIZE(BO)
+ FIMOVD b5, tmp
+ MUL a1, b3, t3
+ LD b3, 2 * SIZE(BO)
+
+ ADD c13, t4, b5
+ fmov b5, c13
+ MUL a1, b4, t4
+ LD a1, -1 * SIZE(AO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL a2, b1, t1
+ LD b1, 4 * SIZE(BO)
+ ldi BO, 8 * SIZE(BO)
+
+ ADD c05, t2, b5
+ fmov b5, c05
+ MUL a2, b2, t2
+ LD b2, -3 * SIZE(BO)
+
+ ADD c09, t3, b5
+ fmov b5, c09
+ LD b4, -1 * SIZE(BO)
+ MUL a2, b3, t3
+ LD b3, -2 * SIZE(BO)
+
+ ADD c13, t4, b5
+ fmov b5, c13
+ IFMOVD tmp, b5
+ MUL a2, b5, t4
+ LD a2, 0 * SIZE(AO)
+ bgt L, $L32
+ .align 4
+
+$L35:
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL a1, b1, b5
+ fmov b5, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L37
+#else
+ blbs TMP1, $L37
+#endif
+ .align 4
+
+ ADD c05, t2, b5
+ fmov b5, c05
+ LD b1, 0 * SIZE(BO)
+ MUL a1, b2, b5
+ fmov b5, t2
+ LD b2, 1 * SIZE(BO)
+
+ ADD c09, t3, b5
+ fmov b5, c09
+ MUL a1, b3, b5
+ fmov b5, t3
+ LD b3, 2 * SIZE(BO)
+
+ ADD c13, t4, b5
+ fmov b5, c13
+ MUL a1, b4, b5
+ fmov b5, t4
+ LD a1, 0 * SIZE(AO)
+ ldi AO, 1 * SIZE(AO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ LD b4, 3 * SIZE(BO)
+ MUL a1, b1, b5
+ fmov b5, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L37:
+ ADD c05, t2, b5
+ fmov b5, c05
+ MUL a1, b2, b5
+ fmov b5, t2
+ ADD c09, t3, b5
+ fmov b5, c09
+ MUL a1, b3, b5
+ fmov b5, t3
+
+ ADD c13, t4, b5
+ fmov b5, c13
+ ldi AO, 1 * SIZE(AO)
+ MUL a1, b4, b5
+ fmov b5, t4
+ ldi BO, 4 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ ADD c05, t2, b5
+ fmov b5, c05
+ ADD c09, t3, b5
+ fmov b5, c09
+ ADD c13, t4, b5
+ fmov b5, c13
+
+$L38:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 1, TMP1
+#else
+ subl KK, 4, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -1 * SIZE(AO)
+ ldi BO, -4 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c05, b5
+ fmov b5, c05
+ SUB a3, c09, b5
+ fmov b5, c09
+ SUB a4, c13, b5
+ fmov b5, c13
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c05, b5
+ fmov b5, c05
+ SUB a3, c09, b5
+ fmov b5, c09
+ SUB a4, c13, b5
+ fmov b5, c13
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(AO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c05, b5
+ fmov b5, c05
+ MUL a1, c09, b5
+ fmov b5, c09
+ MUL a1, c13, b5
+ fmov b5, c13
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a2, c01, b5
+ fmov b5, t1
+ SUB c05, t1, b5
+ fmov b5, c05
+ MUL a3, c01, b5
+ fmov b5, t1
+ SUB c09, t1, b5
+ fmov b5, c09
+ MUL a4, c01, b5
+ fmov b5, t1
+ SUB c13, t1, b5
+ fmov b5, c13
+
+ LD b1, 5 * SIZE(BO)
+ LD b2, 6 * SIZE(BO)
+ LD b3, 7 * SIZE(BO)
+
+ MUL b1, c05, b5
+ fmov b5, c05
+ MUL b2, c05, b5
+ fmov b5, t1
+ SUB c09, t1, b5
+ fmov b5, c09
+ MUL b3, c05, b5
+ fmov b5, t1
+ SUB c13, t1, b5
+ fmov b5, c13
+
+ LD a1, 10 * SIZE(BO)
+ LD a2, 11 * SIZE(BO)
+ LD a3, 15 * SIZE(BO)
+
+ MUL a1, c09, b5
+ fmov b5, c09
+ MUL a2, c09, b5
+ fmov b5, t1
+ SUB c13, t1, b5
+ fmov b5, c13
+ MUL a3, c13, b5
+ fmov b5, c13
+#endif
+
+#ifdef RT
+ LD a1, 15 * SIZE(BO)
+ LD a2, 14 * SIZE(BO)
+ LD a3, 13 * SIZE(BO)
+ LD a4, 12 * SIZE(BO)
+
+ MUL a1, c13, b5
+ fmov b5, c13
+ MUL a2, c13, b5
+ fmov b5, t1
+ SUB c09, t1, b5
+ fmov b5, c09
+ MUL a3, c13, b5
+ fmov b5, t1
+ SUB c05, t1, b5
+ fmov b5, c05
+ MUL a4, c13, b5
+ fmov b5, t1
+ SUB c01, t1, b5
+ fmov b5, c01
+
+ LD b1, 10 * SIZE(BO)
+ LD b2, 9 * SIZE(BO)
+ LD b3, 8 * SIZE(BO)
+
+ MUL b1, c09, b5
+ fmov b5, c09
+ MUL b2, c09, b5
+ fmov b5, t1
+ SUB c05, t1, b5
+ fmov b5, c05
+ MUL b3, c09, b5
+ fmov b5, t1
+ SUB c01, t1, b5
+ fmov b5, c01
+
+ LD a1, 5 * SIZE(BO)
+ LD a2, 4 * SIZE(BO)
+ LD a3, 0 * SIZE(BO)
+
+ MUL a1, c05, b5
+ fmov b5, c05
+ MUL a2, c05, b5
+ fmov b5, t1
+ SUB c01, t1, b5
+ fmov b5, c01
+ MUL a3, c01, b5
+ fmov b5, c01
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c05, 1 * SIZE(BO)
+ ST c09, 2 * SIZE(BO)
+ ST c13, 3 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c05, 1 * SIZE(AO)
+ ST c09, 2 * SIZE(AO)
+ ST c13, 3 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -1 * SIZE(C1)
+ ldi C2, -1 * SIZE(C2)
+ ldi C3, -1 * SIZE(C3)
+ ldi C4, -1 * SIZE(C4)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c05, 0 * SIZE(C2)
+ ST c09, 0 * SIZE(C3)
+ ST c13, 0 * SIZE(C4)
+
+#ifdef RT
+ sll K, 0 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 1, KK
+#endif
+
+#ifdef LN
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L39:
+#ifdef LN
+ sll K, 2 + BASE_SHIFT, TMP1
+ addl B, TMP1, B
+#endif
+
+#if defined(LT) || defined(RN)
+ mov BO, B
+#endif
+
+#ifdef RN
+ addl KK, 4, KK
+#endif
+
+#ifdef RT
+ subl KK, 4, KK
+#endif
+ ldi J, -1(J)
+ bgt J, $L01
+ .align 4
+
+$L40:
+ and N, 2, J
+ ble J, $L80
+
+#ifdef RT
+ sll K, 1 + BASE_SHIFT, TMP1
+ subl B, TMP1, B
+
+ addl LDC, LDC, TMP1
+ subl C, TMP1, C
+#endif
+
+ mov C, C1
+ addl C, LDC, C2
+ fclr t1
+#ifndef RT
+ addl C2, LDC, C
+#endif
+ fclr t2
+
+#ifdef LN
+ addl M, OFFSET, KK
+#endif
+
+#ifdef LT
+ mov OFFSET, KK
+#endif
+
+#if defined(LN) || defined(RT)
+ mov A, AORIG
+#else
+ mov A, AO
+#endif
+
+ sra M, 2, I
+ fclr t3
+ fclr t4
+ ble I, $L60
+ .align 4
+
+$L51:
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c03
+ LD a2, 1 * SIZE(AO)
+ fclr c07
+ LD a3, 2 * SIZE(AO)
+ fclr c04
+ LD a4, 3 * SIZE(AO)
+ fclr c08
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c05
+ LD b3, 2 * SIZE(B)
+ fclr c02
+ LD b4, 3 * SIZE(B)
+ fclr c06
+
+ ldi L, -2(KK)
+
+ ldi BO, 2 * SIZE(B)
+ ldi AO, 4 * SIZE(AO)
+
+ ble KK, $L58
+
+ ble L, $L55
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 2, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 2, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c03
+ LD a2, 1 * SIZE(AO)
+ fclr c07
+ LD a3, 2 * SIZE(AO)
+ fclr c04
+ LD a4, 3 * SIZE(AO)
+ fclr c08
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c05
+ LD b3, 2 * SIZE(BO)
+ fclr c02
+ LD b4, 3 * SIZE(BO)
+ fclr c06
+
+ ldi L, -2(TMP1)
+ ldi BO, 2 * SIZE(BO)
+ ldi AO, 4 * SIZE(AO)
+
+ ble TMP1, $L58
+
+ ble L, $L55
+#endif
+ .align 4
+
+$L52:
+ ADD c05, t1, b5
+ fmov b5, c05
+ unop
+ MUL a1, b1, b5
+ fmov b5, t1
+ unop
+
+ ADD c06, t2, b5
+ fmov b5, c06
+ ldi L, -2(L)
+ MUL a2, b1, b5
+ fmov b5, t2
+ unop
+
+ ADD c07, t3, b5
+ fmov b5, c07
+ unop
+ MUL a3, b1, b5
+ fmov b5, t3
+ unop
+
+ ADD c08, t4, b5
+ fmov b5, c08
+ unop
+ MUL a4, b1, b5
+ fmov b5, t4
+ LD b1, 2 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b2, b5
+ fmov b5, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ ldi BO, 4 * SIZE(BO)
+ MUL a2, b2, b5
+ fmov b5, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ unop
+ MUL a3, b2, b5
+ fmov b5, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ unop
+ MUL a4, b2, b5
+ fmov b5, t4
+ LD a5, 3 * SIZE(AO)
+
+ ADD c05, t1, b5
+ fmov b5, c05
+ unop
+ MUL a1, b3, b5
+ fmov b5, t1
+ LD b2, -1 * SIZE(BO)
+
+ ADD c06, t2, b5
+ fmov b5, c06
+ unop
+ MUL a2, b3, b5
+ fmov b5, t2
+ unop
+
+ ADD c07, t3, b5
+ fmov b5, c07
+ unop
+ MUL a3, b3, b5
+ fmov b5, t3
+ ldi AO, 8 * SIZE(AO)
+
+ ADD c08, t4, b5
+ fmov b5, c08
+ unop
+ MUL a5, b3, b5
+ fmov b5, t4
+ LD b3, 0 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b4, b5
+ fmov b5, t1
+ LD a1, -4 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b4, b5
+ fmov b5, t2
+ LD a2, -3 * SIZE(AO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ LD a4, -1 * SIZE(AO)
+ MUL a3, b4, b5
+ fmov b5, t3
+ LD a3, -2 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a5, b4, b5
+ fmov b5, t4
+ LD b4, 1 * SIZE(BO)
+ bgt L, $L52
+ .align 4
+
+$L55:
+ ADD c05, t1, b5
+ fmov b5, c05
+ MUL a1, b1, b5
+ fmov b5, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L57
+#else
+ blbs TMP1, $L57
+#endif
+ .align 4
+
+ ADD c06, t2, b5
+ fmov b5, c06
+ MUL a2, b1, b5
+ fmov b5, t2
+ ADD c07, t3, b5
+ fmov b5, c07
+ MUL a3, b1, b5
+ fmov b5, t3
+
+ ADD c08, t4, b5
+ fmov b5, c08
+ unop
+ MUL a4, b1, b5
+ fmov b5, t4
+ LD b1, 0 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b2, b5
+ fmov b5, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b2, b5
+ fmov b5, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ unop
+ MUL a3, b2, b5
+ fmov b5, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a4, b2, b5
+ fmov b5, t4
+ LD a4, 3 * SIZE(AO)
+ ldi AO, 4 * SIZE(AO)
+
+ ADD c05, t1, b5
+ fmov b5, c05
+ LD b2, 1 * SIZE(BO)
+ MUL a1, b1, b5
+ fmov b5, t1
+ ldi BO, 2 * SIZE(BO)
+ .align 4
+
+$L57:
+ ADD c06, t2, b5
+ fmov b5, c06
+ MUL a2, b1, b5
+ fmov b5, t2
+ ADD c07, t3, b5
+ fmov b5, c07
+ MUL a3, b1, b5
+ fmov b5, t3
+
+ ADD c08, t4, b5
+ fmov b5, c08
+ MUL a4, b1, b5
+ fmov b5, t4
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL a1, b2, b5
+ fmov b5, t1
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ MUL a2, b2, b5
+ fmov b5, t2
+ ADD c03, t3, b5
+ fmov b5, c03
+ MUL a3, b2, b5
+ fmov b5, t3
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ ldi AO, 4 * SIZE(AO)
+ MUL a4, b2, b5
+ fmov b5, t4
+ ldi BO, 2 * SIZE(BO)
+
+ ADD c05, t1, b5
+ fmov b5, c05
+ ADD c06, t2, b5
+ fmov b5, c06
+ ADD c07, t3, b5
+ fmov b5, c07
+ ADD c08, t4, b5
+ fmov b5, c08
+ .align 4
+
+$L58:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 4, TMP1
+#else
+ subl KK, 2, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -4 * SIZE(AO)
+ ldi BO, -2 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ LD b1, 4 * SIZE(BO)
+ LD b2, 5 * SIZE(BO)
+ LD b3, 6 * SIZE(BO)
+ LD b4, 7 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c05, b5
+ fmov b5, c05
+ SUB a3, c02, b5
+ fmov b5, c02
+ SUB a4, c06, b5
+ fmov b5, c06
+
+ SUB b1, c03, b5
+ fmov b5, c03
+ SUB b2, c07, b5
+ fmov b5, c07
+ SUB b3, c04, b5
+ fmov b5, c04
+ SUB b4, c08, b5
+ fmov b5, c08
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 4 * SIZE(AO)
+ LD b2, 5 * SIZE(AO)
+ LD b3, 6 * SIZE(AO)
+ LD b4, 7 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+ SUB a3, c03, b5
+ fmov b5, c03
+ SUB a4, c04, b5
+ fmov b5, c04
+
+ SUB b1, c05, b5
+ fmov b5, c05
+ SUB b2, c06, b5
+ fmov b5, c06
+ SUB b3, c07, b5
+ fmov b5, c07
+ SUB b4, c08, b5
+ fmov b5, c08
+#endif
+
+#ifdef LN
+ LD a1, 15 * SIZE(AO)
+ LD a2, 14 * SIZE(AO)
+ LD a3, 13 * SIZE(AO)
+ LD a4, 12 * SIZE(AO)
+
+ MUL a1, c04, b5
+ fmov b5, c04
+ MUL a1, c08, b5
+ fmov b5, c08
+
+ MUL a2, c04, b5
+ fmov b5, t1
+ MUL a2, c08, b5
+ fmov b5, t2
+
+ SUB c03, t1, b5
+ fmov b5, c03
+ SUB c07, t2, b5
+ fmov b5, c07
+
+ MUL a3, c04, b5
+ fmov b5, t1
+ MUL a3, c08, b5
+ fmov b5, t2
+
+ SUB c02, t1, b5
+ fmov b5, c02
+ SUB c06, t2, b5
+ fmov b5, c06
+
+ MUL a4, c04, b5
+ fmov b5, t1
+ MUL a4, c08, b5
+ fmov b5, t2
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c05, t2, b5
+ fmov b5, c05
+
+ LD b1, 10 * SIZE(AO)
+ LD b2, 9 * SIZE(AO)
+ LD b3, 8 * SIZE(AO)
+
+ MUL b1, c03, b5
+ fmov b5, c03
+ MUL b1, c07, b5
+ fmov b5, c07
+
+ MUL b2, c03, b5
+ fmov b5, t1
+ MUL b2, c07, b5
+ fmov b5, t2
+
+ SUB c02, t1, b5
+ fmov b5, c02
+ SUB c06, t2, b5
+ fmov b5, c06
+
+ MUL b3, c03, b5
+ fmov b5, t1
+ MUL b3, c07, b5
+ fmov b5, t2
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c05, t2, b5
+ fmov b5, c05
+
+ LD a1, 5 * SIZE(AO)
+ LD a2, 4 * SIZE(AO)
+ LD a3, 0 * SIZE(AO)
+
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a1, c06, b5
+ fmov b5, c06
+
+ MUL a2, c02, b5
+ fmov b5, t1
+ MUL a2, c06, b5
+ fmov b5, t2
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c05, t2, b5
+ fmov b5, c05
+
+ MUL a3, c01, b5
+ fmov b5, c01
+ MUL a3, c05, b5
+ fmov b5, c05
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c05, b5
+ fmov b5, c05
+
+ MUL a2, c01, b5
+ fmov b5, t1
+ MUL a2, c05, b5
+ fmov b5, t2
+
+ SUB c02, t1, b5
+ fmov b5, c02
+ SUB c06, t2, b5
+ fmov b5, c06
+
+ MUL a3, c01, b5
+ fmov b5, t1
+ MUL a3, c05, b5
+ fmov b5, t2
+
+ SUB c03, t1, b5
+ fmov b5, c03
+ SUB c07, t2, b5
+ fmov b5, c07
+
+ MUL a4, c01, b5
+ fmov b5, t1
+ MUL a4, c05, b5
+ fmov b5, t2
+
+ SUB c04, t1, b5
+ fmov b5, c04
+ SUB c08, t2, b5
+ fmov b5, c08
+
+ LD b1, 5 * SIZE(AO)
+ LD b2, 6 * SIZE(AO)
+ LD b3, 7 * SIZE(AO)
+
+ MUL b1, c02, b5
+ fmov b5, c02
+ MUL b1, c06, b5
+ fmov b5, c06
+
+ MUL b2, c02, b5
+ fmov b5, t1
+ MUL b2, c06, b5
+ fmov b5, t2
+
+ SUB c03, t1, b5
+ fmov b5, c03
+ SUB c07, t2, b5
+ fmov b5, c07
+
+ MUL b3, c02, b5
+ fmov b5, t1
+ MUL b3, c06, b5
+ fmov b5, t2
+
+ SUB c04, t1, b5
+ fmov b5, c04
+ SUB c08, t2, b5
+ fmov b5, c08
+
+ LD a1, 10 * SIZE(AO)
+ LD a2, 11 * SIZE(AO)
+ LD a3, 15 * SIZE(AO)
+
+ MUL a1, c03, b5
+ fmov b5, c03
+ MUL a1, c07, b5
+ fmov b5, c07
+
+ MUL a2, c03, b5
+ fmov b5, t1
+ MUL a2, c07, b5
+ fmov b5, t2
+
+ SUB c04, t1, b5
+ fmov b5, c04
+ SUB c08, t2, b5
+ fmov b5, c08
+
+ MUL a3, c04, b5
+ fmov b5, c04
+ MUL a3, c08, b5
+ fmov b5, c08
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 3 * SIZE(BO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a1, c03, b5
+ fmov b5, c03
+ MUL a1, c04, b5
+ fmov b5, c04
+
+ MUL a2, c01, b5
+ fmov b5, t1
+ MUL a2, c02, b5
+ fmov b5, t2
+ MUL a2, c03, b5
+ fmov b5, t3
+ MUL a2, c04, b5
+ fmov b5, t4
+
+ SUB c05, t1, b5
+ fmov b5, c05
+ SUB c06, t2, b5
+ fmov b5, c06
+ SUB c07, t3, b5
+ fmov b5, c07
+ SUB c08, t4, b5
+ fmov b5, c08
+
+ MUL a3, c05, b5
+ fmov b5, c05
+ MUL a3, c06, b5
+ fmov b5, c06
+ MUL a3, c07, b5
+ fmov b5, c07
+ MUL a3, c08, b5
+ fmov b5, c08
+#endif
+
+#ifdef RT
+ LD a1, 3 * SIZE(BO)
+ LD a2, 2 * SIZE(BO)
+ LD a3, 0 * SIZE(BO)
+
+ MUL a1, c05, b5
+ fmov b5, c05
+ MUL a1, c06, b5
+ fmov b5, c06
+ MUL a1, c07, b5
+ fmov b5, c07
+ MUL a1, c08, b5
+ fmov b5, c08
+
+ MUL a2, c05, b5
+ fmov b5, t1
+ MUL a2, c06, b5
+ fmov b5, t2
+ MUL a2, c07, b5
+ fmov b5, t3
+ MUL a2, c08, b5
+ fmov b5, t4
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c02, t2, b5
+ fmov b5, c02
+ SUB c03, t3, b5
+ fmov b5, c03
+ SUB c04, t4, b5
+ fmov b5, c04
+
+ MUL a3, c01, b5
+ fmov b5, c01
+ MUL a3, c02, b5
+ fmov b5, c02
+ MUL a3, c03, b5
+ fmov b5, c03
+ MUL a3, c04, b5
+ fmov b5, c04
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c05, 1 * SIZE(BO)
+ ST c02, 2 * SIZE(BO)
+ ST c06, 3 * SIZE(BO)
+
+ ST c03, 4 * SIZE(BO)
+ ST c07, 5 * SIZE(BO)
+ ST c04, 6 * SIZE(BO)
+ ST c08, 7 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c03, 2 * SIZE(AO)
+ ST c04, 3 * SIZE(AO)
+
+ ST c05, 4 * SIZE(AO)
+ ST c06, 5 * SIZE(AO)
+ ST c07, 6 * SIZE(AO)
+ ST c08, 7 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -4 * SIZE(C1)
+ ldi C2, -4 * SIZE(C2)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c03, 2 * SIZE(C1)
+ ST c04, 3 * SIZE(C1)
+
+ ST c05, 0 * SIZE(C2)
+ ST c06, 1 * SIZE(C2)
+ ST c07, 2 * SIZE(C2)
+ ST c08, 3 * SIZE(C2)
+
+#ifndef LN
+ ldi C1, 4 * SIZE(C1)
+ ldi C2, 4 * SIZE(C2)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 2 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 4, KK
+#endif
+
+#ifdef LN
+ subl KK, 4, KK
+#endif
+
+ ldi I, -1(I)
+
+ bgt I, $L51
+ .align 4
+
+$L60:
+ and M, 2, I
+ ble I, $L70
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(B)
+ ldi L, -2(KK)
+ LD b2, 1 * SIZE(B)
+ ldi AO, 2 * SIZE(AO)
+
+ LD b3, 2 * SIZE(B)
+ LD b4, 3 * SIZE(B)
+ ldi BO, 2 * SIZE(B)
+
+ ble KK, $L68
+
+ ble L, $L65
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 1, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(BO)
+ ldi L, -2(TMP1)
+ LD b2, 1 * SIZE(BO)
+ ldi AO, 2 * SIZE(AO)
+
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+ ldi BO, 2 * SIZE(BO)
+
+ ble TMP1, $L68
+
+ ble L, $L65
+#endif
+ .align 4
+
+$L62:
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b1, b5
+ fmov b5, t1
+ unop
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ ldi AO, 4 * SIZE(AO)
+ MUL a2, b1, b5
+ fmov b5, t2
+ LD b1, 2 * SIZE(BO)
+
+ ADD c05, t3, b5
+ fmov b5, c05
+ ldi L, -2(L)
+ MUL a1, b2, b5
+ fmov b5, t3
+ LD a1, -2 * SIZE(AO)
+
+ ADD c06, t4, b5
+ fmov b5, c06
+ unop
+ MUL a2, b2, b5
+ fmov b5, t4
+ LD a2, -1 * SIZE(AO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ LD b2, 3 * SIZE(BO)
+ MUL a3, b3, b5
+ fmov b5, t1
+ ldi BO, 4 * SIZE(BO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a4, b3, b5
+ fmov b5, t2
+ LD b3, 0 * SIZE(BO)
+
+ ADD c05, t3, b5
+ fmov b5, c05
+ unop
+ MUL a3, b4, b5
+ fmov b5, t3
+ LD a3, 0 * SIZE(AO)
+
+ ADD c06, t4, b5
+ fmov b5, c06
+ MUL a4, b4, b5
+ fmov b5, t4
+ LD b4, 1 * SIZE(BO)
+ unop
+
+ LD a4, 1 * SIZE(AO)
+ unop
+ unop
+ bgt L, $L62
+ .align 4
+
+$L65:
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL a1, b1, b5
+ fmov b5, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L67
+#else
+ blbs TMP1, $L67
+#endif
+ .align 4
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b1, b5
+ fmov b5, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD c05, t3, b5
+ fmov b5, c05
+ ldi BO, 2 * SIZE(BO)
+ MUL a1, b2, b5
+ fmov b5, t3
+ LD a1, 0 * SIZE(AO)
+
+ ADD c06, t4, b5
+ fmov b5, c06
+ unop
+ MUL a2, b2, b5
+ fmov b5, t4
+ LD a2, 1 * SIZE(AO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ LD b2, -1 * SIZE(BO)
+ MUL a1, b1, b5
+ fmov b5, t1
+ ldi AO, 2 * SIZE(AO)
+ .align 4
+
+$L67:
+ ADD c02, t2, b5
+ fmov b5, c02
+ MUL a2, b1, b5
+ fmov b5, t2
+ ADD c05, t3, b5
+ fmov b5, c05
+ MUL a1, b2, b5
+ fmov b5, t3
+
+ ADD c06, t4, b5
+ fmov b5, c06
+ ldi AO, 2 * SIZE(AO)
+ MUL a2, b2, b5
+ fmov b5, t4
+ ldi BO, 2 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ ADD c02, t2, b5
+ fmov b5, c02
+ ADD c05, t3, b5
+ fmov b5, c05
+ ADD c06, t4, b5
+ fmov b5, c06
+ .align 4
+
+$L68:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 2, TMP1
+#else
+ subl KK, 2, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -2 * SIZE(AO)
+ ldi BO, -2 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c05, b5
+ fmov b5, c05
+ SUB a3, c02, b5
+ fmov b5, c02
+ SUB a4, c06, b5
+ fmov b5, c06
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+ SUB a3, c05, b5
+ fmov b5, c05
+ SUB a4, c06, b5
+ fmov b5, c06
+#endif
+
+#ifdef LN
+ LD a1, 3 * SIZE(AO)
+ LD a2, 2 * SIZE(AO)
+ LD a3, 0 * SIZE(AO)
+
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a1, c06, b5
+ fmov b5, c06
+
+ MUL a2, c02, b5
+ fmov b5, t1
+ MUL a2, c06, b5
+ fmov b5, t2
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c05, t2, b5
+ fmov b5, c05
+
+ MUL a3, c01, b5
+ fmov b5, c01
+ MUL a3, c05, b5
+ fmov b5, c05
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 3 * SIZE(AO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c05, b5
+ fmov b5, c05
+
+ MUL a2, c01, b5
+ fmov b5, t1
+ MUL a2, c05, b5
+ fmov b5, t2
+
+ SUB c02, t1, b5
+ fmov b5, c02
+ SUB c06, t2, b5
+ fmov b5, c06
+
+ MUL a3, c02, b5
+ fmov b5, c02
+ MUL a3, c06, b5
+ fmov b5, c06
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 3 * SIZE(BO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+
+ MUL a2, c01, b5
+ fmov b5, t1
+ MUL a2, c02, b5
+ fmov b5, t2
+
+ SUB c05, t1, b5
+ fmov b5, c05
+ SUB c06, t2, b5
+ fmov b5, c06
+
+ MUL a3, c05, b5
+ fmov b5, c05
+ MUL a3, c06, b5
+ fmov b5, c06
+#endif
+
+#ifdef RT
+ LD a1, 3 * SIZE(BO)
+ LD a2, 2 * SIZE(BO)
+ LD a3, 0 * SIZE(BO)
+
+ MUL a1, c05, b5
+ fmov b5, c05
+ MUL a1, c06, b5
+ fmov b5, c06
+
+ MUL a2, c05, b5
+ fmov b5, t1
+ MUL a2, c06, b5
+ fmov b5, t2
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c02, t2, b5
+ fmov b5, c02
+
+ MUL a3, c01, b5
+ fmov b5, c01
+ MUL a3, c02, b5
+ fmov b5, c02
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c05, 1 * SIZE(BO)
+ ST c02, 2 * SIZE(BO)
+ ST c06, 3 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c05, 2 * SIZE(AO)
+ ST c06, 3 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -2 * SIZE(C1)
+ ldi C2, -2 * SIZE(C2)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c05, 0 * SIZE(C2)
+ ST c06, 1 * SIZE(C2)
+
+#ifndef LN
+ ldi C1, 2 * SIZE(C1)
+ ldi C2, 2 * SIZE(C2)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 1 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 2, KK
+#endif
+
+#ifdef LN
+ subl KK, 2, KK
+#endif
+ .align 4
+
+$L70:
+ and M, 1, I
+ ble I, $L79
+
+#if defined(LT) || defined(RN)
+
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+
+ LD b1, 0 * SIZE(B)
+ fclr c02
+ LD b2, 1 * SIZE(B)
+ fclr c06
+
+ ldi L, -2(KK)
+
+ LD b3, 2 * SIZE(B)
+ ldi AO, 1 * SIZE(AO)
+ LD b4, 3 * SIZE(B)
+ ldi BO, 2 * SIZE(B)
+
+ ble KK, $L78
+
+ ble L, $L75
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 0, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+
+ LD b1, 0 * SIZE(BO)
+ fclr c02
+ LD b2, 1 * SIZE(BO)
+ fclr c06
+
+ ldi L, -2(TMP1)
+
+ LD b3, 2 * SIZE(BO)
+ ldi AO, 1 * SIZE(AO)
+ LD b4, 3 * SIZE(BO)
+ ldi BO, 2 * SIZE(BO)
+
+ ble TMP1, $L78
+
+ ble L, $L75
+#endif
+ .align 4
+
+$L72:
+ ADD c01, t1, b5
+ fmov b5, c01
+ ldi L, -2(L)
+ MUL a1, b1, b5
+ fmov b5, t1
+ LD b1, 2 * SIZE(BO)
+
+ ADD c05, t2, b5
+ fmov b5, c05
+ MUL a1, b2, b5
+ fmov b5, t2
+ LD a1, 1 * SIZE(AO)
+ LD b2, 3 * SIZE(BO)
+
+ ADD c02, t3, b5
+ fmov b5, c02
+ ldi AO, 2 * SIZE(AO)
+ MUL a2, b3, b5
+ fmov b5, t3
+ LD b3, 4 * SIZE(BO)
+
+ ADD c06, t4, b5
+ fmov b5, c06
+ MUL a2, b4, b5
+ fmov b5, t4
+ LD a2, 0 * SIZE(AO)
+ LD b4, 5 * SIZE(BO)
+
+ ldi BO, 4 * SIZE(BO)
+ unop
+ unop
+ bgt L, $L72
+ .align 4
+
+$L75:
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL a1, b1, b5
+ fmov b5, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L77
+#else
+ blbs TMP1, $L77
+#endif
+ .align 4
+
+ ADD c05, t2, b5
+ fmov b5, c05
+ MUL a1, b2, b5
+ fmov b5, t2
+ LD a1, 0 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ LD b2, 1 * SIZE(BO)
+ ldi AO, 1 * SIZE(AO)
+ MUL a1, b1, b5
+ fmov b5, t1
+ ldi BO, 2 * SIZE(BO)
+ .align 4
+
+$L77:
+ ADD c05, t2, b5
+ fmov b5, c05
+ MUL a1, b2, b5
+ fmov b5, t2
+ ADD c02, t3, b5
+ fmov b5, c02
+ ADD c06, t4, b5
+ fmov b5, c06
+
+ ADD c01, c02, b5
+ fmov b5, c01
+ ldi AO, 1 * SIZE(AO)
+ ADD c05, c06, b5
+ fmov b5, c05
+ ldi BO, 2 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ ADD c05, t2, b5
+ fmov b5, c05
+
+ .align 4
+
+$L78:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 1, TMP1
+#else
+ subl KK, 2, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -1 * SIZE(AO)
+ ldi BO, -2 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c05, b5
+ fmov b5, c05
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c05, b5
+ fmov b5, c05
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(AO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c05, b5
+ fmov b5, c05
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 3 * SIZE(BO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a2, c01, b5
+ fmov b5, t1
+ SUB c05, t1, b5
+ fmov b5, c05
+ MUL a3, c05, b5
+ fmov b5, c05
+#endif
+
+#ifdef RT
+ LD a1, 3 * SIZE(BO)
+ LD a2, 2 * SIZE(BO)
+ LD a3, 0 * SIZE(BO)
+
+ MUL a1, c05, b5
+ fmov b5, c05
+ MUL a2, c05, b5
+ fmov b5, t1
+ SUB c01, t1, b5
+ fmov b5, c01
+ MUL a3, c01, b5
+ fmov b5, c01
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c05, 1 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c05, 1 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -1 * SIZE(C1)
+ ldi C2, -1 * SIZE(C2)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c05, 0 * SIZE(C2)
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 0 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 1, KK
+#endif
+
+#ifdef LN
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L79:
+#ifdef LN
+ sll K, 1 + BASE_SHIFT, TMP1
+ addl B, TMP1, B
+#endif
+
+#if defined(LT) || defined(RN)
+ mov BO, B
+#endif
+
+#ifdef RN
+ addl KK, 2, KK
+#endif
+
+#ifdef RT
+ subl KK, 2, KK
+#endif
+ .align 4
+
+$L80:
+ and N, 1, J
+ ble J, $L999
+
+#ifdef RT
+ sll K, BASE_SHIFT, TMP1
+ subl B, TMP1, B
+
+ subl C, LDC, C
+#endif
+
+ mov C, C1
+#ifndef RT
+ addl C, LDC, C
+#endif
+
+#ifdef LN
+ addl M, OFFSET, KK
+#endif
+
+#ifdef LT
+ mov OFFSET, KK
+#endif
+
+#if defined(LN) || defined(RT)
+ mov A, AORIG
+#else
+ mov A, AO
+#endif
+
+ sra M, 2, I
+ ble I, $L100
+ .align 4
+
+$L91:
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c02
+ LD b3, 2 * SIZE(B)
+ fclr c03
+ LD b4, 3 * SIZE(B)
+ fclr c04
+
+ sra KK, 2, L
+ mov B, BO
+ ble L, $L95
+
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 2, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 2, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c02
+ LD b3, 2 * SIZE(BO)
+ fclr c03
+ LD b4, 3 * SIZE(BO)
+ fclr c04
+
+ sra TMP1, 2, L
+ unop
+ ble L, $L95
+#endif
+ .align 5
+
+$L92:
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b1, b5
+ fmov b5, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ ldi L, -1(L)
+ MUL a2, b1, b5
+ fmov b5, t2
+ LD a2, 5 * SIZE(AO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ unop
+ MUL a3, b1, b5
+ fmov b5, t3
+ LD a3, 6 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a4, b1, b5
+ fmov b5, t4
+ LD a4, 7 * SIZE(AO)
+ LD b1, 4 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b2, b5
+ fmov b5, t1
+ LD a1, 8 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b2, b5
+ fmov b5, t2
+ LD a2, 9 * SIZE(AO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ unop
+ MUL a3, b2, b5
+ fmov b5, t3
+ LD a3, 10 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a4, b2, b5
+ fmov b5, t4
+ LD a4, 11 * SIZE(AO)
+ LD b2, 5 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b3, b5
+ fmov b5, t1
+ LD a1, 12 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b3, b5
+ fmov b5, t2
+ LD a2, 13 * SIZE(AO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ unop
+ MUL a3, b3, b5
+ fmov b5, t3
+ LD a3, 14 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a4, b3, b5
+ fmov b5, t4
+ LD a5, 15 * SIZE(AO)
+ LD b3, 6 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL a1, b4, b5
+ fmov b5, t1
+ LD a1, 16 * SIZE(AO)
+ ldi AO, 16 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ ldi BO, 4 * SIZE(BO)
+ MUL a2, b4, b5
+ fmov b5, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ LD a4, 3 * SIZE(AO)
+ MUL a3, b4, b5
+ fmov b5, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a5, b4, b5
+ fmov b5, t4
+ LD b4, 3 * SIZE(BO)
+ bgt L, $L92
+ .align 4
+
+$L95:
+#if defined(LT) || defined(RN)
+ and KK, 3, L
+#else
+ and TMP1, 3, L
+#endif
+ unop
+ ble L, $L98
+ .align 4
+
+$L96:
+ ADD c01, t1, b5
+ fmov b5, c01
+ ldi L, -1(L)
+ MUL a1, b1, b5
+ fmov b5, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ ldi BO, 1 * SIZE(BO)
+ MUL a2, b1, b5
+ fmov b5, t2
+ LD a2, 5 * SIZE(AO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ unop
+ MUL a3, b1, b5
+ fmov b5, t3
+ LD a3, 6 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a4, b1, b5
+ fmov b5, t4
+ LD a4, 7 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+
+ ldi AO, 4 * SIZE(AO)
+ bgt L, $L96
+ .align 4
+
+$L98:
+ ADD c01, t1, b5
+ fmov b5, c01
+ ADD c02, t2, b5
+ fmov b5, c02
+ ADD c03, t3, b5
+ fmov b5, c03
+ ADD c04, t4, b5
+ fmov b5, c04
+
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 4, TMP1
+#else
+ subl KK, 1, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl B, TMP2, BO
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+ SUB a3, c03, b5
+ fmov b5, c03
+ SUB a4, c04, b5
+ fmov b5, c04
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+ SUB a3, c03, b5
+ fmov b5, c03
+ SUB a4, c04, b5
+ fmov b5, c04
+#endif
+
+#ifdef LN
+ LD a1, 15 * SIZE(AO)
+ LD a2, 14 * SIZE(AO)
+ LD a3, 13 * SIZE(AO)
+ LD a4, 12 * SIZE(AO)
+
+ MUL a1, c04, b5
+ fmov b5, c04
+ MUL a2, c04, b5
+ fmov b5, t1
+ SUB c03, t1, b5
+ fmov b5, c03
+ MUL a3, c04, b5
+ fmov b5, t1
+ SUB c02, t1, b5
+ fmov b5, c02
+ MUL a4, c04, b5
+ fmov b5, t1
+ SUB c01, t1, b5
+ fmov b5, c01
+
+ LD b1, 10 * SIZE(AO)
+ LD b2, 9 * SIZE(AO)
+ LD b3, 8 * SIZE(AO)
+
+ MUL b1, c03, b5
+ fmov b5, c03
+ MUL b2, c03, b5
+ fmov b5, t1
+ SUB c02, t1, b5
+ fmov b5, c02
+ MUL b3, c03, b5
+ fmov b5, t1
+ SUB c01, t1, b5
+ fmov b5, c01
+
+ LD a1, 5 * SIZE(AO)
+ LD a2, 4 * SIZE(AO)
+ LD a3, 0 * SIZE(AO)
+
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a2, c02, b5
+ fmov b5, t1
+ SUB c01, t1, b5
+ fmov b5, c01
+ MUL a3, c01, b5
+ fmov b5, c01
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a2, c01, b5
+ fmov b5, t1
+ SUB c02, t1, b5
+ fmov b5, c02
+ MUL a3, c01, b5
+ fmov b5, t1
+ SUB c03, t1, b5
+ fmov b5, c03
+ MUL a4, c01, b5
+ fmov b5, t1
+ SUB c04, t1, b5
+ fmov b5, c04
+
+ LD b1, 5 * SIZE(AO)
+ LD b2, 6 * SIZE(AO)
+ LD b3, 7 * SIZE(AO)
+
+ MUL b1, c02, b5
+ fmov b5, c02
+ MUL b2, c02, b5
+ fmov b5, t1
+ SUB c03, t1, b5
+ fmov b5, c03
+ MUL b3, c02, b5
+ fmov b5, t1
+ SUB c04, t1, b5
+ fmov b5, c04
+
+ LD a1, 10 * SIZE(AO)
+ LD a2, 11 * SIZE(AO)
+ LD a3, 15 * SIZE(AO)
+
+ MUL a1, c03, b5
+ fmov b5, c03
+ MUL a2, c03, b5
+ fmov b5, t1
+ SUB c04, t1, b5
+ fmov b5, c04
+ MUL a3, c04, b5
+ fmov b5, c04
+#endif
+
+#if defined(RN) || defined(RT)
+ LD a1, 0 * SIZE(BO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a1, c03, b5
+ fmov b5, c03
+ MUL a1, c04, b5
+ fmov b5, c04
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c02, 1 * SIZE(BO)
+ ST c03, 2 * SIZE(BO)
+ ST c04, 3 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c03, 2 * SIZE(AO)
+ ST c04, 3 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -4 * SIZE(C1)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c03, 2 * SIZE(C1)
+ ST c04, 3 * SIZE(C1)
+
+#ifndef LN
+ ldi C1, 4 * SIZE(C1)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 2 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 4, KK
+#endif
+
+#ifdef LN
+ subl KK, 4, KK
+#endif
+
+ ldi I, -1(I)
+ bgt I, $L91
+ .align 4
+
+$L100:
+ and M, 2, I
+ ble I, $L110
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c02
+ LD b3, 2 * SIZE(B)
+ fclr c03
+ LD b4, 3 * SIZE(B)
+ fclr c04
+
+ sra KK, 2, L
+ mov B, BO
+ ble L, $L105
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 1, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c02
+ LD b3, 2 * SIZE(BO)
+ fclr c03
+ LD b4, 3 * SIZE(BO)
+ fclr c04
+
+ sra TMP1, 2, L
+ ble L, $L105
+#endif
+ .align 5
+
+$L102:
+ ADD c01, t1, b5
+ fmov b5, c01
+ ldi L, -1(L)
+ MUL a1, b1, b5
+ fmov b5, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ MUL a2, b1, b5
+ fmov b5, t2
+ LD a2, 5 * SIZE(AO)
+ LD b1, 4 * SIZE(BO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ ldi BO, 4 * SIZE(BO)
+ MUL a3, b2, b5
+ fmov b5, t3
+ LD a3, 6 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a4, b2, b5
+ fmov b5, t4
+ LD a5, 7 * SIZE(AO)
+ LD b2, 1 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL a1, b3, b5
+ fmov b5, t1
+ LD a1, 8 * SIZE(AO)
+ ldi AO, 8 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ MUL a2, b3, b5
+ fmov b5, t2
+ LD b3, 2 * SIZE(BO)
+ LD a2, 1 * SIZE(AO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ LD a4, 3 * SIZE(AO)
+ MUL a3, b4, b5
+ fmov b5, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a5, b4, b5
+ fmov b5, t4
+ LD b4, 3 * SIZE(BO)
+ bgt L, $L102
+ .align 4
+
+$L105:
+#if defined(LT) || defined(RN)
+ and KK, 3, L
+#else
+ and TMP1, 3, L
+#endif
+ ble L, $L108
+ .align 4
+
+$L106:
+ ADD c01, t1, b5
+ fmov b5, c01
+ ldi L, -1(L)
+ MUL a1, b1, b5
+ fmov b5, t1
+ LD a1, 2 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ MUL a2, b1, b5
+ fmov b5, t2
+ LD a2, 3 * SIZE(AO)
+ LD b1, 1 * SIZE(BO)
+
+ ldi AO, 2 * SIZE(AO)
+ unop
+ ldi BO, 1 * SIZE(BO)
+ bgt L, $L106
+ .align 4
+
+$L108:
+ ADD c01, t1, b5
+ fmov b5, c01
+ ADD c02, t2, b5
+ fmov b5, c02
+ ADD c03, t3, b5
+ fmov b5, c03
+ ADD c04, t4, b5
+ fmov b5, c04
+
+ ADD c01, c03, b5
+ fmov b5, c01
+ ADD c02, c04, b5
+ fmov b5, c02
+
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 2, TMP1
+#else
+ subl KK, 1, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl B, TMP2, BO
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+#endif
+
+#ifdef LN
+ LD a1, 3 * SIZE(AO)
+ LD a2, 2 * SIZE(AO)
+ LD a3, 0 * SIZE(AO)
+
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a2, c02, b5
+ fmov b5, t1
+ SUB c01, t1, b5
+ fmov b5, c01
+ MUL a3, c01, b5
+ fmov b5, c01
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 3 * SIZE(AO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a2, c01, b5
+ fmov b5, t1
+ SUB c02, t1, b5
+ fmov b5, c02
+ MUL a3, c02, b5
+ fmov b5, c02
+#endif
+
+#if defined(RN) || defined(RT)
+ LD a1, 0 * SIZE(BO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c02, 1 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -2 * SIZE(C1)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+
+#ifndef LN
+ ldi C1, 2 * SIZE(C1)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 1 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 2, KK
+#endif
+
+#ifdef LN
+ subl KK, 2, KK
+#endif
+ .align 4
+
+$L110:
+ and M, 1, I
+ ble I, $L119
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c02
+ LD b3, 2 * SIZE(B)
+ fclr c03
+ LD b4, 3 * SIZE(B)
+ fclr c04
+
+ sra KK, 2, L
+ mov B, BO
+ unop
+ ble L, $L115
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 0, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c02
+ LD b3, 2 * SIZE(BO)
+ fclr c03
+ LD b4, 3 * SIZE(BO)
+ fclr c04
+
+ sra TMP1, 2, L
+ unop
+ ble L, $L115
+#endif
+ .align 4
+
+$L112:
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL a1, b1, b5
+ fmov b5, t1
+ LD a1, 4 * SIZE(AO)
+ LD b1, 4 * SIZE(BO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ MUL a2, b2, b5
+ fmov b5, t2
+ LD a2, 5 * SIZE(AO)
+ LD b2, 5 * SIZE(BO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ MUL a3, b3, b5
+ fmov b5, t3
+ LD a3, 6 * SIZE(AO)
+ LD b3, 6 * SIZE(BO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a4, b4, b5
+ fmov b5, t4
+ LD a4, 7 * SIZE(AO)
+ LD b4, 7 * SIZE(BO)
+
+ ldi L, -1(L)
+ ldi AO, 4 * SIZE(AO)
+ ldi BO, 4 * SIZE(BO)
+ bgt L, $L112
+ .align 4
+
+$L115:
+#if defined(LT) || defined(RN)
+ and KK, 3, L
+#else
+ and TMP1, 3, L
+#endif
+ ble L, $L118
+ .align 4
+
+$L116:
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL a1, b1, b5
+ fmov b5, t1
+ LD a1, 1 * SIZE(AO)
+ LD b1, 1 * SIZE(BO)
+
+ ldi L, -1(L)
+ ldi AO, 1 * SIZE(AO)
+ ldi BO, 1 * SIZE(BO)
+ bgt L, $L116
+ .align 4
+
+$L118:
+ ADD c01, t1, b5
+ fmov b5, c01
+ ADD c02, t2, b5
+ fmov b5, c02
+ ADD c03, t3, b5
+ fmov b5, c03
+ ADD c04, t4, b5
+ fmov b5, c04
+
+ ADD c01, c02, b5
+ fmov b5, c01
+ ADD c03, c04, b5
+ fmov b5, c03
+ ADD c01, c03, b5
+ fmov b5, c01
+
+#if defined(LN) || defined(RT)
+ subl KK, 1, TMP1
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AORIG, TMP2, AO
+ addl B, TMP2, BO
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+#else
+ LD a1, 0 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(AO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+#endif
+
+#if defined(RN) || defined(RT)
+ LD a1, 0 * SIZE(BO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -1 * SIZE(C1)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+
+#ifndef LN
+ ldi C1, 1 * SIZE(C1)
+#endif
+
+#ifdef RT
+ SXADDQ K, AORIG, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AO, TMP2, AO
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 1, KK
+#endif
+
+#ifdef LN
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L119:
+#ifdef LN
+ SXADDQ K, B, B
+#endif
+
+#if defined(LT) || defined(RN)
+ mov BO, B
+#endif
+
+#ifdef RN
+ addl KK, 1, KK
+#endif
+
+#ifdef RT
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L999:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+ ldl $9, 64($sp)
+ clr $0
+ ldi $sp, STACKSIZE($sp)
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/trsm_kernel_4x4_LT.S.bak b/kernel/sw_64/trsm_kernel_4x4_LT.S.bak
new file mode 100644
index 0000000..86136ae
--- /dev/null
+++ b/kernel/sw_64/trsm_kernel_4x4_LT.S.bak
@@ -0,0 +1,4072 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+
+#if !defined(SW2B)
+#error "Architecture is not specified."
+#endif
+
+#ifdef SW2B
+#define PREFETCHSIZE 56
+#define UNOP nop
+#endif
+
+#ifdef EV6
+#define PREFETCHSIZE 56
+#define UNOP unop
+#endif
+
+#ifdef EV5
+#define PREFETCHSIZE 56
+#define UNOP
+#endif
+
+#ifdef EV4
+#define UNOP
+#endif
+
+#define STACKSIZE 80
+
+#define M $16
+#define N $17
+#define K $18
+#define A $20
+#define B $21
+#define C $22
+#define LDC $23
+
+#define C1 $19
+#define C2 $24
+#define C3 $25
+#define C4 $27
+
+#define AO $at
+#define BO $5
+#define I $6
+#define J $7
+#define L $8
+
+#define a1 $f16
+#define a2 $f17
+#define a3 $f18
+#define a4 $f19
+
+#define b1 $f20
+#define b2 $f21
+#define b3 $f22
+#define b4 $f23
+
+#define t1 $f24
+#define t2 $f25
+#define t3 $f26
+#define t4 $f27
+
+#define a5 $f28
+#define a6 $f30
+#define b5 $f29
+
+#define alpha $f30
+
+#define c01 $f0
+#define c02 $f1
+#define c03 $f2
+#define c04 $f3
+
+#define c05 $f4
+#define c06 $f5
+#define c07 $f6
+#define c08 $f7
+
+#define c09 $f8
+#define c10 $f9
+#define c11 $f10
+#define c12 $f11
+
+#define c13 $f12
+#define c14 $f13
+#define c15 $f14
+#define c16 $f15
+
+#define TMP1 $0
+#define TMP2 $1
+#define KK $2
+#define AORIG $3
+#define OFFSET $4
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, STACKSIZE, $26, 0
+
+ ldi $sp, -STACKSIZE($sp)
+
+ ldl C, 0 + STACKSIZE($sp)
+ ldl LDC, 8 + STACKSIZE($sp)
+ ldl OFFSET, 16 + STACKSIZE($sp)
+
+ SXADDQ LDC, 0, LDC
+
+ fstd $f2, 0($sp)
+ fstd $f3, 8($sp)
+ fstd $f4, 16($sp)
+ fstd $f5, 24($sp)
+ fstd $f6, 32($sp)
+ fstd $f7, 40($sp)
+ fstd $f8, 48($sp)
+ fstd $f9, 56($sp)
+
+ cmple M, 0, $0
+ cmple N, 0, $1
+ cmple K, 0, $2
+
+ or $0, $1, $0
+ or $0, $2, $0
+ bne $0, $L999
+
+#ifdef LN
+ mull M, K, TMP1
+ SXADDQ TMP1, A, A
+ SXADDQ M, C, C
+#endif
+
+#ifdef RN
+ negl OFFSET, KK
+#endif
+
+#ifdef RT
+ mull N, K, TMP1
+ SXADDQ TMP1, B, B
+
+ mull N, LDC, TMP1
+ addl TMP1, C, C
+
+ subl N, OFFSET, KK
+#endif
+
+ sra N, 2, J
+ ble J, $L40
+ .align 4
+
+$L01:
+#ifdef RT
+ sll K, 2 + BASE_SHIFT, TMP1
+ subl B, TMP1, B
+
+ s4addl LDC, 0, TMP1
+ subl C, TMP1, C
+#endif
+
+ mov C, C1
+ addl C, LDC, C2
+ addl C2, LDC, C3
+#ifndef RT
+ s4addl LDC, C, C
+#endif
+
+ fclr t1
+ addl C3, LDC, C4
+ fclr t2
+
+#ifdef LN
+ addl M, OFFSET, KK
+#endif
+
+#ifdef LT
+ mov OFFSET, KK
+#endif
+
+#if defined(LN) || defined(RT)
+ mov A, AORIG
+#else
+ mov A, AO
+#endif
+
+ sra M, 2, I
+ fclr t3
+ fclr t4
+ ble I, $L20
+ .align 4
+
+$L11:
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c11
+ LD a2, 1 * SIZE(AO)
+ fclr c12
+
+ LD a3, 2 * SIZE(AO)
+ fclr c16
+ LD a4, 3 * SIZE(AO)
+ fclr c15
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c02
+
+ LD b3, 2 * SIZE(B)
+ fclr c06
+ LD b4, 3 * SIZE(B)
+ fclr c05
+
+ fillcs 4 * SIZE(C1)
+ fclr c03
+ ldi L, -2(KK)
+ fclr c04
+
+ fillcs 7 * SIZE(C2)
+ fclr c08
+ ldi BO, 4 * SIZE(B)
+ fclr c13
+
+ fillcs 4 * SIZE(C3)
+ fclr c09
+ ldi AO, 4 * SIZE(AO)
+ fclr c10
+
+ fillcs 7 * SIZE(C4)
+ fclr c14
+ fclr c07
+ ble KK, $L18
+#else
+
+#ifdef LN
+ sll K, BASE_SHIFT + 2, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 2, TMP1
+ addl AORIG, TMP1, AO
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c11
+ LD a2, 1 * SIZE(AO)
+ fclr c12
+
+ LD a3, 2 * SIZE(AO)
+ fclr c16
+ LD a4, 3 * SIZE(AO)
+ fclr c15
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c02
+
+ LD b3, 2 * SIZE(BO)
+ fclr c06
+ LD b4, 3 * SIZE(BO)
+ fclr c05
+
+ fillcs 4 * SIZE(C1)
+ fclr c03
+ ldi L, -2(TMP1)
+ fclr c04
+
+ fillcs 7 * SIZE(C2)
+ fclr c08
+ ldi BO, 4 * SIZE(BO)
+ fclr c13
+
+ fillcs 4 * SIZE(C3)
+ fclr c09
+ ldi AO, 4 * SIZE(AO)
+ fclr c10
+
+ fillcs 7 * SIZE(C4)
+ fclr c14
+ fclr c07
+ ble TMP1, $L18
+#endif
+
+ ble L, $L15
+ .align 5
+
+$L12:
+/* 1 */
+ ADD c11, t1, c11
+#ifndef EV4
+ fillcs PREFETCHSIZE * SIZE(AO)
+#else
+ unop
+#endif
+ MUL b1, a1, t1
+#ifndef EV4
+ fillcs PREFETCHSIZE * SIZE(BO)
+#else
+ unop
+#endif
+
+ ADD c12, t2, c12
+ unop
+ MUL b1, a2, t2
+ unop
+
+ ADD c16, t3, c16
+ unop
+ MUL b2, a2, t3
+ LD a5, 0 * SIZE(AO)
+
+ ADD c15, t4, c15
+ unop
+ MUL b2, a1, t4
+ LD b5, 0 * SIZE(BO)
+
+/* 2 */
+ ADD c01, t1, c01
+ UNOP
+ MUL b1, a3, t1
+ UNOP
+
+ ADD c02, t2, c02
+ UNOP
+ MUL b1, a4, t2
+ UNOP
+
+ ADD c06, t3, c06
+ unop
+ MUL b2, a4, t3
+ unop
+
+ ADD c05, t4, c05
+ unop
+ MUL b4, a1, t4
+ unop
+
+/* 3 */
+ ADD c03, t1, c03
+ unop
+ MUL b3, a1, t1
+ unop
+
+ ADD c04, t2, c04
+ unop
+ MUL b3, a2, t2
+ unop
+
+ ADD c08, t3, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, 1 * SIZE(AO)
+
+ ADD c13, t4, c13
+ unop
+ MUL b2, a3, t4
+ LD b2, 1 * SIZE(BO)
+
+/* 4 */
+ ADD c09, t1, c09
+ unop
+ MUL b3, a3, t1
+ LD a6, 2 * SIZE(AO)
+
+ ADD c10, t2, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD c14, t3, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, 3 * SIZE(AO)
+
+ ADD c07, t4, c07
+ unop
+ MUL b4, a3, t4
+ LD b4, 3 * SIZE(BO)
+
+/* 5 */
+ ADD c11, t1, c11
+ unop
+ MUL b5, a5, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD c12, t2, c12
+ ldi L, -2(L)
+ MUL b5, a2, t2
+ LD b1, 4 * SIZE(BO)
+
+ ADD c16, t3, c16
+ unop
+ MUL b2, a2, t3
+ unop
+
+ ADD c15, t4, c15
+ unop
+ MUL b2, a5, t4
+ unop
+
+/* 6 */
+ ADD c01, t1, c01
+ unop
+ MUL b5, a6, t1
+ unop
+
+ ADD c02, t2, c02
+ unop
+ MUL b5, a4, t2
+ unop
+
+ ADD c06, t3, c06
+ unop
+ MUL b2, a4, t3
+ unop
+
+ ADD c05, t4, c05
+ unop
+ MUL b4, a5, t4
+ unop
+
+/* 7 */
+ ADD c03, t1, c03
+ ldi AO, 8 * SIZE(AO)
+ MUL b3, a5, t1
+ unop
+
+ ADD c04, t2, c04
+ ldi BO, 8 * SIZE(BO)
+ MUL b3, a2, t2
+ unop
+
+ ADD c08, t3, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, -3 * SIZE(AO)
+
+ ADD c13, t4, c13
+ unop
+ MUL b2, a6, t4
+ LD b2, -3 * SIZE(BO)
+
+/* 8 */
+ ADD c09, t1, c09
+ unop
+ MUL b3, a6, t1
+ LD a3, -2 * SIZE(AO)
+
+ ADD c10, t2, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, -2 * SIZE(BO)
+
+ ADD c14, t3, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, -1 * SIZE(AO)
+
+ ADD c07, t4, c07
+ MUL b4, a6, t4
+ LD b4, -1 * SIZE(BO)
+ bgt L, $L12
+ .align 4
+
+$L15:
+ ADD c11, t1, c11
+ MUL b1, a1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L17
+#else
+ blbs TMP1, $L17
+#endif
+ .align 4
+
+ ADD c12, t2, c12
+ MUL b1, a2, t2
+ ADD c16, t3, c16
+ MUL b2, a2, t3
+
+ ADD c15, t4, c15
+ MUL b2, a1, t4
+ ADD c01, t1, c01
+ MUL b1, a3, t1
+
+ ADD c02, t2, c02
+ unop
+ MUL b1, a4, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD c06, t3, c06
+ MUL b2, a4, t3
+ ADD c05, t4, c05
+ MUL b4, a1, t4
+
+ ADD c03, t1, c03
+ unop
+ MUL b3, a1, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD c04, t2, c04
+ unop
+ MUL b3, a2, t2
+ unop
+
+ ADD c08, t3, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, 1 * SIZE(AO)
+
+ ADD c13, t4, c13
+ unop
+ MUL b2, a3, t4
+ LD b2, 1 * SIZE(BO)
+
+ ADD c09, t1, c09
+ unop
+ MUL b3, a3, t1
+ ldi AO, 4 * SIZE(AO)
+
+ ADD c10, t2, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD c14, t3, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, -1 * SIZE(AO)
+
+ ADD c07, t4, c07
+ unop
+ MUL b4, a3, t4
+ LD a3, -2 * SIZE(AO)
+
+ ADD c11, t1, c11
+ LD b4, 3 * SIZE(BO)
+ MUL b1, a1, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L17:
+ ADD c12, t2, c12
+ MUL b1, a2, t2
+ ADD c16, t3, c16
+ MUL b2, a2, t3
+
+ ADD c15, t4, c15
+ MUL b2, a1, t4
+ ADD c01, t1, c01
+ MUL b1, a3, t1
+
+ ADD c02, t2, c02
+ MUL b1, a4, t2
+ ADD c06, t3, c06
+ MUL b2, a4, t3
+
+ ADD c05, t4, c05
+ MUL b4, a1, t4
+ ADD c03, t1, c03
+ MUL b3, a1, t1
+
+ ADD c04, t2, c04
+ MUL b3, a2, t2
+ ADD c08, t3, c08
+ MUL b4, a2, t3
+
+ ADD c13, t4, c13
+ MUL b2, a3, t4
+ ADD c09, t1, c09
+ MUL b3, a3, t1
+
+ ADD c10, t2, c10
+ MUL b3, a4, t2
+ ADD c14, t3, c14
+ MUL b4, a4, t3
+
+ ADD c07, t4, c07
+ ldi AO, 4 * SIZE(AO)
+ MUL b4, a3, t4
+ ldi BO, 4 * SIZE(BO)
+
+ ADD c11, t1, c11
+ ADD c12, t2, c12
+ ADD c16, t3, c16
+ ADD c15, t4, c15
+ .align 4
+
+$L18:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 4, TMP1
+#else
+ subl KK, 4, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -4 * SIZE(AO)
+ ldi BO, -4 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ LD b1, 4 * SIZE(BO)
+ LD b2, 5 * SIZE(BO)
+ LD b3, 6 * SIZE(BO)
+ LD b4, 7 * SIZE(BO)
+
+ SUB a1, c01, c01
+ SUB a2, c05, c05
+ SUB a3, c09, c09
+ SUB a4, c13, c13
+
+ SUB b1, c02, c02
+ SUB b2, c06, c06
+ SUB b3, c10, c10
+ SUB b4, c14, c14
+
+ LD a1, 8 * SIZE(BO)
+ LD a2, 9 * SIZE(BO)
+ LD a3, 10 * SIZE(BO)
+ LD a4, 11 * SIZE(BO)
+
+ LD b1, 12 * SIZE(BO)
+ LD b2, 13 * SIZE(BO)
+ LD b3, 14 * SIZE(BO)
+ LD b4, 15 * SIZE(BO)
+
+ SUB a1, c03, c03
+ SUB a2, c07, c07
+ SUB a3, c11, c11
+ SUB a4, c15, c15
+
+ SUB b1, c04, c04
+ SUB b2, c08, c08
+ SUB b3, c12, c12
+ SUB b4, c16, c16
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 4 * SIZE(AO)
+ LD b2, 5 * SIZE(AO)
+ LD b3, 6 * SIZE(AO)
+ LD b4, 7 * SIZE(AO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+ SUB a3, c03, c03
+ SUB a4, c04, c04
+
+ SUB b1, c05, c05
+ SUB b2, c06, c06
+ SUB b3, c07, c07
+ SUB b4, c08, c08
+
+ LD a1, 8 * SIZE(AO)
+ LD a2, 9 * SIZE(AO)
+ LD a3, 10 * SIZE(AO)
+ LD a4, 11 * SIZE(AO)
+
+ LD b1, 12 * SIZE(AO)
+ LD b2, 13 * SIZE(AO)
+ LD b3, 14 * SIZE(AO)
+ LD b4, 15 * SIZE(AO)
+
+ SUB a1, c09, c09
+ SUB a2, c10, c10
+ SUB a3, c11, c11
+ SUB a4, c12, c12
+
+ SUB b1, c13, c13
+ SUB b2, c14, c14
+ SUB b3, c15, c15
+ SUB b4, c16, c16
+#endif
+
+#ifdef LN
+ LD a1, 15 * SIZE(AO)
+ LD a2, 14 * SIZE(AO)
+ LD a3, 13 * SIZE(AO)
+ LD a4, 12 * SIZE(AO)
+
+ MUL a1, c04, c04
+ MUL a1, c08, c08
+ MUL a1, c12, c12
+ MUL a1, c16, c16
+
+ MUL a2, c04, t1
+ MUL a2, c08, t2
+ MUL a2, c12, t3
+ MUL a2, c16, t4
+
+ SUB c03, t1, c03
+ SUB c07, t2, c07
+ SUB c11, t3, c11
+ SUB c15, t4, c15
+
+ MUL a3, c04, t1
+ MUL a3, c08, t2
+ MUL a3, c12, t3
+ MUL a3, c16, t4
+
+ SUB c02, t1, c02
+ SUB c06, t2, c06
+ SUB c10, t3, c10
+ SUB c14, t4, c14
+
+ MUL a4, c04, t1
+ MUL a4, c08, t2
+ MUL a4, c12, t3
+ MUL a4, c16, t4
+
+ SUB c01, t1, c01
+ SUB c05, t2, c05
+ SUB c09, t3, c09
+ SUB c13, t4, c13
+
+ LD b1, 10 * SIZE(AO)
+ LD b2, 9 * SIZE(AO)
+ LD b3, 8 * SIZE(AO)
+
+ MUL b1, c03, c03
+ MUL b1, c07, c07
+ MUL b1, c11, c11
+ MUL b1, c15, c15
+
+ MUL b2, c03, t1
+ MUL b2, c07, t2
+ MUL b2, c11, t3
+ MUL b2, c15, t4
+
+ SUB c02, t1, c02
+ SUB c06, t2, c06
+ SUB c10, t3, c10
+ SUB c14, t4, c14
+
+ MUL b3, c03, t1
+ MUL b3, c07, t2
+ MUL b3, c11, t3
+ MUL b3, c15, t4
+
+ SUB c01, t1, c01
+ SUB c05, t2, c05
+ SUB c09, t3, c09
+ SUB c13, t4, c13
+
+ LD a1, 5 * SIZE(AO)
+ LD a2, 4 * SIZE(AO)
+ LD a3, 0 * SIZE(AO)
+
+ MUL a1, c02, c02
+ MUL a1, c06, c06
+ MUL a1, c10, c10
+ MUL a1, c14, c14
+
+ MUL a2, c02, t1
+ MUL a2, c06, t2
+ MUL a2, c10, t3
+ MUL a2, c14, t4
+
+ SUB c01, t1, c01
+ SUB c05, t2, c05
+ SUB c09, t3, c09
+ SUB c13, t4, c13
+
+ MUL a3, c01, c01
+ MUL a3, c05, c05
+ MUL a3, c09, c09
+ MUL a3, c13, c13
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ MUL a1, c01, c01
+ MUL a1, c05, c05
+ MUL a1, c09, c09
+ MUL a1, c13, c13
+
+ MUL a2, c01, t1
+ MUL a2, c05, t2
+ MUL a2, c09, t3
+ MUL a2, c13, t4
+
+ SUB c02, t1, c02
+ SUB c06, t2, c06
+ SUB c10, t3, c10
+ SUB c14, t4, c14
+
+ MUL a3, c01, t1
+ MUL a3, c05, t2
+ MUL a3, c09, t3
+ MUL a3, c13, t4
+
+ SUB c03, t1, c03
+ SUB c07, t2, c07
+ SUB c11, t3, c11
+ SUB c15, t4, c15
+
+ MUL a4, c01, t1
+ MUL a4, c05, t2
+ MUL a4, c09, t3
+ MUL a4, c13, t4
+
+ SUB c04, t1, c04
+ SUB c08, t2, c08
+ SUB c12, t3, c12
+ SUB c16, t4, c16
+
+ LD b1, 5 * SIZE(AO)
+ LD b2, 6 * SIZE(AO)
+ LD b3, 7 * SIZE(AO)
+
+ MUL b1, c02, c02
+ MUL b1, c06, c06
+ MUL b1, c10, c10
+ MUL b1, c14, c14
+
+ MUL b2, c02, t1
+ MUL b2, c06, t2
+ MUL b2, c10, t3
+ MUL b2, c14, t4
+
+ SUB c03, t1, c03
+ SUB c07, t2, c07
+ SUB c11, t3, c11
+ SUB c15, t4, c15
+
+ MUL b3, c02, t1
+ MUL b3, c06, t2
+ MUL b3, c10, t3
+ MUL b3, c14, t4
+
+ SUB c04, t1, c04
+ SUB c08, t2, c08
+ SUB c12, t3, c12
+ SUB c16, t4, c16
+
+ LD a1, 10 * SIZE(AO)
+ LD a2, 11 * SIZE(AO)
+ LD a3, 15 * SIZE(AO)
+
+ MUL a1, c03, c03
+ MUL a1, c07, c07
+ MUL a1, c11, c11
+ MUL a1, c15, c15
+
+ MUL a2, c03, t1
+ MUL a2, c07, t2
+ MUL a2, c11, t3
+ MUL a2, c15, t4
+
+ SUB c04, t1, c04
+ SUB c08, t2, c08
+ SUB c12, t3, c12
+ SUB c16, t4, c16
+
+ MUL a3, c04, c04
+ MUL a3, c08, c08
+ MUL a3, c12, c12
+ MUL a3, c16, c16
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+ MUL a1, c03, c03
+ MUL a1, c04, c04
+
+ MUL a2, c01, t1
+ MUL a2, c02, t2
+ MUL a2, c03, t3
+ MUL a2, c04, t4
+
+ SUB c05, t1, c05
+ SUB c06, t2, c06
+ SUB c07, t3, c07
+ SUB c08, t4, c08
+
+ MUL a3, c01, t1
+ MUL a3, c02, t2
+ MUL a3, c03, t3
+ MUL a3, c04, t4
+
+ SUB c09, t1, c09
+ SUB c10, t2, c10
+ SUB c11, t3, c11
+ SUB c12, t4, c12
+
+ MUL a4, c01, t1
+ MUL a4, c02, t2
+ MUL a4, c03, t3
+ MUL a4, c04, t4
+
+ SUB c13, t1, c13
+ SUB c14, t2, c14
+ SUB c15, t3, c15
+ SUB c16, t4, c16
+
+ LD b1, 5 * SIZE(BO)
+ LD b2, 6 * SIZE(BO)
+ LD b3, 7 * SIZE(BO)
+
+ MUL b1, c05, c05
+ MUL b1, c06, c06
+ MUL b1, c07, c07
+ MUL b1, c08, c08
+
+ MUL b2, c05, t1
+ MUL b2, c06, t2
+ MUL b2, c07, t3
+ MUL b2, c08, t4
+
+ SUB c09, t1, c09
+ SUB c10, t2, c10
+ SUB c11, t3, c11
+ SUB c12, t4, c12
+
+ MUL b3, c05, t1
+ MUL b3, c06, t2
+ MUL b3, c07, t3
+ MUL b3, c08, t4
+
+ SUB c13, t1, c13
+ SUB c14, t2, c14
+ SUB c15, t3, c15
+ SUB c16, t4, c16
+
+ LD a1, 10 * SIZE(BO)
+ LD a2, 11 * SIZE(BO)
+ LD a3, 15 * SIZE(BO)
+
+ MUL a1, c09, c09
+ MUL a1, c10, c10
+ MUL a1, c11, c11
+ MUL a1, c12, c12
+
+ MUL a2, c09, t1
+ MUL a2, c10, t2
+ MUL a2, c11, t3
+ MUL a2, c12, t4
+
+ SUB c13, t1, c13
+ SUB c14, t2, c14
+ SUB c15, t3, c15
+ SUB c16, t4, c16
+
+ MUL a3, c13, c13
+ MUL a3, c14, c14
+ MUL a3, c15, c15
+ MUL a3, c16, c16
+#endif
+
+#ifdef RT
+ LD a1, 15 * SIZE(BO)
+ LD a2, 14 * SIZE(BO)
+ LD a3, 13 * SIZE(BO)
+ LD a4, 12 * SIZE(BO)
+
+ MUL a1, c13, c13
+ MUL a1, c14, c14
+ MUL a1, c15, c15
+ MUL a1, c16, c16
+
+ MUL a2, c13, t1
+ MUL a2, c14, t2
+ MUL a2, c15, t3
+ MUL a2, c16, t4
+
+ SUB c09, t1, c09
+ SUB c10, t2, c10
+ SUB c11, t3, c11
+ SUB c12, t4, c12
+
+ MUL a3, c13, t1
+ MUL a3, c14, t2
+ MUL a3, c15, t3
+ MUL a3, c16, t4
+
+ SUB c05, t1, c05
+ SUB c06, t2, c06
+ SUB c07, t3, c07
+ SUB c08, t4, c08
+
+ MUL a4, c13, t1
+ MUL a4, c14, t2
+ MUL a4, c15, t3
+ MUL a4, c16, t4
+
+ SUB c01, t1, c01
+ SUB c02, t2, c02
+ SUB c03, t3, c03
+ SUB c04, t4, c04
+
+ LD b1, 10 * SIZE(BO)
+ LD b2, 9 * SIZE(BO)
+ LD b3, 8 * SIZE(BO)
+
+ MUL b1, c09, c09
+ MUL b1, c10, c10
+ MUL b1, c11, c11
+ MUL b1, c12, c12
+
+ MUL b2, c09, t1
+ MUL b2, c10, t2
+ MUL b2, c11, t3
+ MUL b2, c12, t4
+
+ SUB c05, t1, c05
+ SUB c06, t2, c06
+ SUB c07, t3, c07
+ SUB c08, t4, c08
+
+ MUL b3, c09, t1
+ MUL b3, c10, t2
+ MUL b3, c11, t3
+ MUL b3, c12, t4
+
+ SUB c01, t1, c01
+ SUB c02, t2, c02
+ SUB c03, t3, c03
+ SUB c04, t4, c04
+
+ LD a1, 5 * SIZE(BO)
+ LD a2, 4 * SIZE(BO)
+ LD a3, 0 * SIZE(BO)
+
+ MUL a1, c05, c05
+ MUL a1, c06, c06
+ MUL a1, c07, c07
+ MUL a1, c08, c08
+
+ MUL a2, c05, t1
+ MUL a2, c06, t2
+ MUL a2, c07, t3
+ MUL a2, c08, t4
+
+ SUB c01, t1, c01
+ SUB c02, t2, c02
+ SUB c03, t3, c03
+ SUB c04, t4, c04
+
+ MUL a3, c01, c01
+ MUL a3, c02, c02
+ MUL a3, c03, c03
+ MUL a3, c04, c04
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c05, 1 * SIZE(BO)
+ ST c09, 2 * SIZE(BO)
+ ST c13, 3 * SIZE(BO)
+
+ ST c02, 4 * SIZE(BO)
+ ST c06, 5 * SIZE(BO)
+ ST c10, 6 * SIZE(BO)
+ ST c14, 7 * SIZE(BO)
+
+ ST c03, 8 * SIZE(BO)
+ ST c07, 9 * SIZE(BO)
+ ST c11, 10 * SIZE(BO)
+ ST c15, 11 * SIZE(BO)
+
+ ST c04, 12 * SIZE(BO)
+ ST c08, 13 * SIZE(BO)
+ ST c12, 14 * SIZE(BO)
+ ST c16, 15 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c03, 2 * SIZE(AO)
+ ST c04, 3 * SIZE(AO)
+
+ ST c05, 4 * SIZE(AO)
+ ST c06, 5 * SIZE(AO)
+ ST c07, 6 * SIZE(AO)
+ ST c08, 7 * SIZE(AO)
+
+ ST c09, 8 * SIZE(AO)
+ ST c10, 9 * SIZE(AO)
+ ST c11, 10 * SIZE(AO)
+ ST c12, 11 * SIZE(AO)
+
+ ST c13, 12 * SIZE(AO)
+ ST c14, 13 * SIZE(AO)
+ ST c15, 14 * SIZE(AO)
+ ST c16, 15 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -4 * SIZE(C1)
+ ldi C2, -4 * SIZE(C2)
+ ldi C3, -4 * SIZE(C3)
+ ldi C4, -4 * SIZE(C4)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c03, 2 * SIZE(C1)
+ ST c04, 3 * SIZE(C1)
+
+ ST c05, 0 * SIZE(C2)
+ ST c06, 1 * SIZE(C2)
+ ST c07, 2 * SIZE(C2)
+ ST c08, 3 * SIZE(C2)
+
+ ST c09, 0 * SIZE(C3)
+ ST c10, 1 * SIZE(C3)
+ ST c11, 2 * SIZE(C3)
+ ST c12, 3 * SIZE(C3)
+
+ ST c13, 0 * SIZE(C4)
+ ST c14, 1 * SIZE(C4)
+ ST c15, 2 * SIZE(C4)
+ ST c16, 3 * SIZE(C4)
+
+#ifndef LN
+ ldi C1, 4 * SIZE(C1)
+ ldi C2, 4 * SIZE(C2)
+ ldi C3, 4 * SIZE(C3)
+ ldi C4, 4 * SIZE(C4)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 2 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 2, TMP1
+ addl AO, TMP1, AO
+ addl BO, TMP1, BO
+#endif
+
+#ifdef LT
+ addl KK, 4, KK
+#endif
+
+#ifdef LN
+ subl KK, 4, KK
+#endif
+
+ ldi I, -1(I)
+
+ bgt I, $L11
+ .align 4
+
+$L20:
+ and M, 2, I
+ ble I, $L30
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c10
+ LD a4, 3 * SIZE(AO)
+ fclr c14
+
+ LD b1, 0 * SIZE(B)
+ ldi L, -2(KK)
+ LD b2, 1 * SIZE(B)
+ ldi AO, 2 * SIZE(AO)
+
+ LD b3, 2 * SIZE(B)
+ fclr c01
+ LD b4, 3 * SIZE(B)
+ fclr c05
+
+ ldi BO, 4 * SIZE(B)
+ fclr c02
+ fclr c06
+ ble KK, $L28
+
+ ble L, $L25
+
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 1, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 2, TMP2
+ addl B, TMP2, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c10
+ LD a4, 3 * SIZE(AO)
+ fclr c14
+
+ LD b1, 0 * SIZE(BO)
+ ldi L, -2(TMP1)
+ LD b2, 1 * SIZE(BO)
+ ldi AO, 2 * SIZE(AO)
+
+ LD b3, 2 * SIZE(BO)
+ fclr c01
+ LD b4, 3 * SIZE(BO)
+ fclr c05
+
+ ldi BO, 4 * SIZE(BO)
+ fclr c02
+ fclr c06
+ ble TMP1, $L28
+
+ ble L, $L25
+#endif
+ .align 4
+
+$L22:
+ ADD c09, t1, c09
+ unop
+ MUL a1, b1, t1
+ unop
+
+ ADD c10, t2, c10
+ unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD c13, t3, c13
+ unop
+ MUL a1, b2, t3
+ ldi BO, 8 * SIZE(BO)
+
+ ADD c14, t4, c14
+ unop
+ MUL a2, b2, t4
+ LD b2, -7 * SIZE(BO)
+
+ ADD c01, t1, c01
+ unop
+ MUL a1, b3, t1
+ unop
+
+ ADD c02, t2, c02
+ unop
+ MUL a2, b3, t2
+ LD b3, -6 * SIZE(BO)
+
+ ADD c05, t3, c05
+ unop
+ MUL a1, b4, t3
+ LD a1, 2 * SIZE(AO)
+
+ ADD c06, t4, c06
+ MUL a2, b4, t4
+ LD b5, -5 * SIZE(BO)
+
+ ADD c09, t1, c09
+ unop
+ MUL a3, b1, t1
+ LD a2, 3 * SIZE(AO)
+
+ ADD c10, t2, c10
+ unop
+ MUL a4, b1, t2
+ LD b1, -4 * SIZE(BO)
+
+ ADD c13, t3, c13
+ unop
+ MUL a3, b2, t3
+ ldi AO, 4 * SIZE(AO)
+
+ ADD c14, t4, c14
+ MUL a4, b2, t4
+ LD b2, -3 * SIZE(BO)
+
+ ADD c01, t1, c01
+ ldi L, -2(L)
+ MUL a3, b3, t1
+ LD b4, -1 * SIZE(BO)
+
+ ADD c02, t2, c02
+ unop
+ MUL a4, b3, t2
+ LD b3, -2 * SIZE(BO)
+
+ ADD c05, t3, c05
+ unop
+ MUL a3, b5, t3
+ LD a3, 0 * SIZE(AO)
+
+ ADD c06, t4, c06
+ MUL a4, b5, t4
+ LD a4, 1 * SIZE(AO)
+ bgt L, $L22
+ .align 4
+
+$L25:
+ ADD c09, t1, c09
+ MUL a1, b1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L27
+#else
+ blbs TMP1, $L27
+#endif
+
+ ADD c10, t2, c10
+ unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD c13, t3, c13
+ unop
+ MUL a1, b2, t3
+ unop
+
+ ADD c14, t4, c14
+ unop
+ MUL a2, b2, t4
+ LD b2, 1 * SIZE(BO)
+
+ ADD c01, t1, c01
+ unop
+ MUL a1, b3, t1
+ ldi AO, 2 * SIZE(AO)
+
+ ADD c02, t2, c02
+ unop
+ MUL a2, b3, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD c05, t3, c05
+ unop
+ MUL a1, b4, t3
+ LD a1, -2 * SIZE(AO)
+
+ ADD c06, t4, c06
+ unop
+ MUL a2, b4, t4
+ LD a2, -1 * SIZE(AO)
+
+ ADD c09, t1, c09
+ LD b4, 3 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L27:
+ ADD c10, t2, c10
+ MUL a2, b1, t2
+ ADD c13, t3, c13
+ MUL a1, b2, t3
+
+ ADD c14, t4, c14
+ MUL a2, b2, t4
+ ADD c01, t1, c01
+ MUL a1, b3, t1
+
+ ADD c02, t2, c02
+ MUL a2, b3, t2
+ ADD c05, t3, c05
+ MUL a1, b4, t3
+
+ ADD c06, t4, c06
+ ldi AO, 2 * SIZE(AO)
+ MUL a2, b4, t4
+ ldi BO, 4 * SIZE(BO)
+
+ ADD c09, t1, c09
+ ADD c10, t2, c10
+ ADD c13, t3, c13
+ ADD c14, t4, c14
+ .align 4
+
+$L28:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 2, TMP1
+#else
+ subl KK, 4, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -2 * SIZE(AO)
+ ldi BO, -4 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ LD b1, 4 * SIZE(BO)
+ LD b2, 5 * SIZE(BO)
+ LD b3, 6 * SIZE(BO)
+ LD b4, 7 * SIZE(BO)
+
+ SUB a1, c01, c01
+ SUB a2, c05, c05
+ SUB a3, c09, c09
+ SUB a4, c13, c13
+
+ SUB b1, c02, c02
+ SUB b2, c06, c06
+ SUB b3, c10, c10
+ SUB b4, c14, c14
+
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 4 * SIZE(AO)
+ LD b2, 5 * SIZE(AO)
+ LD b3, 6 * SIZE(AO)
+ LD b4, 7 * SIZE(AO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+ SUB a3, c05, c05
+ SUB a4, c06, c06
+
+ SUB b1, c09, c09
+ SUB b2, c10, c10
+ SUB b3, c13, c13
+ SUB b4, c14, c14
+#endif
+
+#ifdef LN
+ LD a1, 3 * SIZE(AO)
+ LD a2, 2 * SIZE(AO)
+ LD a3, 0 * SIZE(AO)
+
+ MUL a1, c02, c02
+ MUL a1, c06, c06
+ MUL a1, c10, c10
+ MUL a1, c14, c14
+
+ MUL a2, c02, t1
+ MUL a2, c06, t2
+ MUL a2, c10, t3
+ MUL a2, c14, t4
+
+ SUB c01, t1, c01
+ SUB c05, t2, c05
+ SUB c09, t3, c09
+ SUB c13, t4, c13
+
+ MUL a3, c01, c01
+ MUL a3, c05, c05
+ MUL a3, c09, c09
+ MUL a3, c13, c13
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 3 * SIZE(AO)
+
+ MUL a1, c01, c01
+ MUL a1, c05, c05
+ MUL a1, c09, c09
+ MUL a1, c13, c13
+
+ MUL a2, c01, t1
+ MUL a2, c05, t2
+ MUL a2, c09, t3
+ MUL a2, c13, t4
+
+ SUB c02, t1, c02
+ SUB c06, t2, c06
+ SUB c10, t3, c10
+ SUB c14, t4, c14
+
+ MUL a3, c02, c02
+ MUL a3, c06, c06
+ MUL a3, c10, c10
+ MUL a3, c14, c14
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+
+ MUL a2, c01, t1
+ MUL a2, c02, t2
+
+ SUB c05, t1, c05
+ SUB c06, t2, c06
+
+ MUL a3, c01, t1
+ MUL a3, c02, t2
+
+ SUB c09, t1, c09
+ SUB c10, t2, c10
+
+ MUL a4, c01, t1
+ MUL a4, c02, t2
+
+ SUB c13, t1, c13
+ SUB c14, t2, c14
+
+ LD b1, 5 * SIZE(BO)
+ LD b2, 6 * SIZE(BO)
+ LD b3, 7 * SIZE(BO)
+
+ MUL b1, c05, c05
+ MUL b1, c06, c06
+
+ MUL b2, c05, t1
+ MUL b2, c06, t2
+
+ SUB c09, t1, c09
+ SUB c10, t2, c10
+
+ MUL b3, c05, t1
+ MUL b3, c06, t2
+
+ SUB c13, t1, c13
+ SUB c14, t2, c14
+
+ LD a1, 10 * SIZE(BO)
+ LD a2, 11 * SIZE(BO)
+ LD a3, 15 * SIZE(BO)
+
+ MUL a1, c09, c09
+ MUL a1, c10, c10
+
+ MUL a2, c09, t1
+ MUL a2, c10, t2
+
+ SUB c13, t1, c13
+ SUB c14, t2, c14
+
+ MUL a3, c13, c13
+ MUL a3, c14, c14
+#endif
+
+#ifdef RT
+ LD a1, 15 * SIZE(BO)
+ LD a2, 14 * SIZE(BO)
+ LD a3, 13 * SIZE(BO)
+ LD a4, 12 * SIZE(BO)
+
+ MUL a1, c13, c13
+ MUL a1, c14, c14
+
+ MUL a2, c13, t1
+ MUL a2, c14, t2
+
+ SUB c09, t1, c09
+ SUB c10, t2, c10
+
+ MUL a3, c13, t1
+ MUL a3, c14, t2
+
+ SUB c05, t1, c05
+ SUB c06, t2, c06
+
+ MUL a4, c13, t1
+ MUL a4, c14, t2
+
+ SUB c01, t1, c01
+ SUB c02, t2, c02
+
+ LD b1, 10 * SIZE(BO)
+ LD b2, 9 * SIZE(BO)
+ LD b3, 8 * SIZE(BO)
+
+ MUL b1, c09, c09
+ MUL b1, c10, c10
+
+ MUL b2, c09, t1
+ MUL b2, c10, t2
+
+ SUB c05, t1, c05
+ SUB c06, t2, c06
+
+ MUL b3, c09, t1
+ MUL b3, c10, t2
+
+ SUB c01, t1, c01
+ SUB c02, t2, c02
+
+ LD a1, 5 * SIZE(BO)
+ LD a2, 4 * SIZE(BO)
+ LD a3, 0 * SIZE(BO)
+
+ MUL a1, c05, c05
+ MUL a1, c06, c06
+
+ MUL a2, c05, t1
+ MUL a2, c06, t2
+
+ SUB c01, t1, c01
+ SUB c02, t2, c02
+
+ MUL a3, c01, c01
+ MUL a3, c02, c02
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c05, 1 * SIZE(BO)
+ ST c09, 2 * SIZE(BO)
+ ST c13, 3 * SIZE(BO)
+
+ ST c02, 4 * SIZE(BO)
+ ST c06, 5 * SIZE(BO)
+ ST c10, 6 * SIZE(BO)
+ ST c14, 7 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c05, 2 * SIZE(AO)
+ ST c06, 3 * SIZE(AO)
+
+ ST c09, 4 * SIZE(AO)
+ ST c10, 5 * SIZE(AO)
+ ST c13, 6 * SIZE(AO)
+ ST c14, 7 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -2 * SIZE(C1)
+ ldi C2, -2 * SIZE(C2)
+ ldi C3, -2 * SIZE(C3)
+ ldi C4, -2 * SIZE(C4)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c05, 0 * SIZE(C2)
+ ST c06, 1 * SIZE(C2)
+
+ ST c09, 0 * SIZE(C3)
+ ST c10, 1 * SIZE(C3)
+ ST c13, 0 * SIZE(C4)
+ ST c14, 1 * SIZE(C4)
+
+#ifndef LN
+ ldi C1, 2 * SIZE(C1)
+ ldi C2, 2 * SIZE(C2)
+ ldi C3, 2 * SIZE(C3)
+ ldi C4, 2 * SIZE(C4)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 1 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 2, KK
+#endif
+
+#ifdef LN
+ subl KK, 2, KK
+#endif
+ .align 4
+
+$L30:
+ and M, 1, I
+ ble I, $L39
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+
+ LD b1, 0 * SIZE(B)
+ ldi L, -2(KK)
+ LD b2, 1 * SIZE(B)
+ ldi AO, 1 * SIZE(AO)
+
+ LD b3, 2 * SIZE(B)
+ fclr c09
+ LD b4, 3 * SIZE(B)
+ fclr c13
+
+ ldi BO, 4 * SIZE(B)
+ ble KK, $L38
+
+ ble L, $L35
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 0, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 2, TMP2
+ addl B, TMP2, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+
+ LD b1, 0 * SIZE(BO)
+ ldi L, -2(TMP1)
+ LD b2, 1 * SIZE(BO)
+ ldi AO, 1 * SIZE(AO)
+
+ LD b3, 2 * SIZE(BO)
+ fclr c09
+ LD b4, 3 * SIZE(BO)
+ fclr c13
+
+ ldi BO, 4 * SIZE(BO)
+ ble TMP1, $L38
+
+ ble L, $L35
+#endif
+ .align 4
+
+$L32:
+ ADD c01, t1, c01
+ ldi L, -2(L)
+ MUL a1, b1, t1
+ LD b1, 0 * SIZE(BO)
+
+ ADD c05, t2, c05
+ ldi AO, 2 * SIZE(AO)
+ MUL a1, b2, t2
+ LD b2, 1 * SIZE(BO)
+
+ ADD c09, t3, c09
+ LD b5, 3 * SIZE(BO)
+ MUL a1, b3, t3
+ LD b3, 2 * SIZE(BO)
+
+ ADD c13, t4, c13
+ MUL a1, b4, t4
+ LD a1, -1 * SIZE(AO)
+
+ ADD c01, t1, c01
+ MUL a2, b1, t1
+ LD b1, 4 * SIZE(BO)
+ ldi BO, 8 * SIZE(BO)
+
+ ADD c05, t2, c05
+ MUL a2, b2, t2
+ LD b2, -3 * SIZE(BO)
+
+ ADD c09, t3, c09
+ LD b4, -1 * SIZE(BO)
+ MUL a2, b3, t3
+ LD b3, -2 * SIZE(BO)
+
+ ADD c13, t4, c13
+ MUL a2, b5, t4
+ LD a2, 0 * SIZE(AO)
+ bgt L, $L32
+ .align 4
+
+$L35:
+ ADD c01, t1, c01
+ MUL a1, b1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L37
+#else
+ blbs TMP1, $L37
+#endif
+ .align 4
+
+ ADD c05, t2, c05
+ LD b1, 0 * SIZE(BO)
+ MUL a1, b2, t2
+ LD b2, 1 * SIZE(BO)
+
+ ADD c09, t3, c09
+ MUL a1, b3, t3
+ LD b3, 2 * SIZE(BO)
+
+ ADD c13, t4, c13
+ MUL a1, b4, t4
+ LD a1, 0 * SIZE(AO)
+ ldi AO, 1 * SIZE(AO)
+
+ ADD c01, t1, c01
+ LD b4, 3 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L37:
+ ADD c05, t2, c05
+ MUL a1, b2, t2
+ ADD c09, t3, c09
+ MUL a1, b3, t3
+
+ ADD c13, t4, c13
+ ldi AO, 1 * SIZE(AO)
+ MUL a1, b4, t4
+ ldi BO, 4 * SIZE(BO)
+
+ ADD c01, t1, c01
+ ADD c05, t2, c05
+ ADD c09, t3, c09
+ ADD c13, t4, c13
+
+$L38:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 1, TMP1
+#else
+ subl KK, 4, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -1 * SIZE(AO)
+ ldi BO, -4 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ SUB a1, c01, c01
+ SUB a2, c05, c05
+ SUB a3, c09, c09
+ SUB a4, c13, c13
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ SUB a1, c01, c01
+ SUB a2, c05, c05
+ SUB a3, c09, c09
+ SUB a4, c13, c13
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(AO)
+
+ MUL a1, c01, c01
+ MUL a1, c05, c05
+ MUL a1, c09, c09
+ MUL a1, c13, c13
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ MUL a1, c01, c01
+ MUL a2, c01, t1
+ SUB c05, t1, c05
+ MUL a3, c01, t1
+ SUB c09, t1, c09
+ MUL a4, c01, t1
+ SUB c13, t1, c13
+
+ LD b1, 5 * SIZE(BO)
+ LD b2, 6 * SIZE(BO)
+ LD b3, 7 * SIZE(BO)
+
+ MUL b1, c05, c05
+ MUL b2, c05, t1
+ SUB c09, t1, c09
+ MUL b3, c05, t1
+ SUB c13, t1, c13
+
+ LD a1, 10 * SIZE(BO)
+ LD a2, 11 * SIZE(BO)
+ LD a3, 15 * SIZE(BO)
+
+ MUL a1, c09, c09
+ MUL a2, c09, t1
+ SUB c13, t1, c13
+ MUL a3, c13, c13
+#endif
+
+#ifdef RT
+ LD a1, 15 * SIZE(BO)
+ LD a2, 14 * SIZE(BO)
+ LD a3, 13 * SIZE(BO)
+ LD a4, 12 * SIZE(BO)
+
+ MUL a1, c13, c13
+ MUL a2, c13, t1
+ SUB c09, t1, c09
+ MUL a3, c13, t1
+ SUB c05, t1, c05
+ MUL a4, c13, t1
+ SUB c01, t1, c01
+
+ LD b1, 10 * SIZE(BO)
+ LD b2, 9 * SIZE(BO)
+ LD b3, 8 * SIZE(BO)
+
+ MUL b1, c09, c09
+ MUL b2, c09, t1
+ SUB c05, t1, c05
+ MUL b3, c09, t1
+ SUB c01, t1, c01
+
+ LD a1, 5 * SIZE(BO)
+ LD a2, 4 * SIZE(BO)
+ LD a3, 0 * SIZE(BO)
+
+ MUL a1, c05, c05
+ MUL a2, c05, t1
+ SUB c01, t1, c01
+ MUL a3, c01, c01
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c05, 1 * SIZE(BO)
+ ST c09, 2 * SIZE(BO)
+ ST c13, 3 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c05, 1 * SIZE(AO)
+ ST c09, 2 * SIZE(AO)
+ ST c13, 3 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -1 * SIZE(C1)
+ ldi C2, -1 * SIZE(C2)
+ ldi C3, -1 * SIZE(C3)
+ ldi C4, -1 * SIZE(C4)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c05, 0 * SIZE(C2)
+ ST c09, 0 * SIZE(C3)
+ ST c13, 0 * SIZE(C4)
+
+#ifdef RT
+ sll K, 0 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 1, KK
+#endif
+
+#ifdef LN
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L39:
+#ifdef LN
+ sll K, 2 + BASE_SHIFT, TMP1
+ addl B, TMP1, B
+#endif
+
+#if defined(LT) || defined(RN)
+ mov BO, B
+#endif
+
+#ifdef RN
+ addl KK, 4, KK
+#endif
+
+#ifdef RT
+ subl KK, 4, KK
+#endif
+ ldi J, -1(J)
+ bgt J, $L01
+ .align 4
+
+$L40:
+ and N, 2, J
+ ble J, $L80
+
+#ifdef RT
+ sll K, 1 + BASE_SHIFT, TMP1
+ subl B, TMP1, B
+
+ addl LDC, LDC, TMP1
+ subl C, TMP1, C
+#endif
+
+ mov C, C1
+ addl C, LDC, C2
+ fclr t1
+#ifndef RT
+ addl C2, LDC, C
+#endif
+ fclr t2
+
+#ifdef LN
+ addl M, OFFSET, KK
+#endif
+
+#ifdef LT
+ mov OFFSET, KK
+#endif
+
+#if defined(LN) || defined(RT)
+ mov A, AORIG
+#else
+ mov A, AO
+#endif
+
+ sra M, 2, I
+ fclr t3
+ fclr t4
+ ble I, $L60
+ .align 4
+
+$L51:
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c03
+ LD a2, 1 * SIZE(AO)
+ fclr c07
+ LD a3, 2 * SIZE(AO)
+ fclr c04
+ LD a4, 3 * SIZE(AO)
+ fclr c08
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c05
+ LD b3, 2 * SIZE(B)
+ fclr c02
+ LD b4, 3 * SIZE(B)
+ fclr c06
+
+ ldi L, -2(KK)
+
+ ldi BO, 2 * SIZE(B)
+ ldi AO, 4 * SIZE(AO)
+
+ ble KK, $L58
+
+ ble L, $L55
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 2, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 2, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c03
+ LD a2, 1 * SIZE(AO)
+ fclr c07
+ LD a3, 2 * SIZE(AO)
+ fclr c04
+ LD a4, 3 * SIZE(AO)
+ fclr c08
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c05
+ LD b3, 2 * SIZE(BO)
+ fclr c02
+ LD b4, 3 * SIZE(BO)
+ fclr c06
+
+ ldi L, -2(TMP1)
+ ldi BO, 2 * SIZE(BO)
+ ldi AO, 4 * SIZE(AO)
+
+ ble TMP1, $L58
+
+ ble L, $L55
+#endif
+ .align 4
+
+$L52:
+ ADD c05, t1, c05
+ unop
+ MUL a1, b1, t1
+ unop
+
+ ADD c06, t2, c06
+ ldi L, -2(L)
+ MUL a2, b1, t2
+ unop
+
+ ADD c07, t3, c07
+ unop
+ MUL a3, b1, t3
+ unop
+
+ ADD c08, t4, c08
+ unop
+ MUL a4, b1, t4
+ LD b1, 2 * SIZE(BO)
+
+ ADD c01, t1, c01
+ unop
+ MUL a1, b2, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD c02, t2, c02
+ ldi BO, 4 * SIZE(BO)
+ MUL a2, b2, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD c03, t3, c03
+ unop
+ MUL a3, b2, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD c04, t4, c04
+ unop
+ MUL a4, b2, t4
+ LD a5, 3 * SIZE(AO)
+
+ ADD c05, t1, c05
+ unop
+ MUL a1, b3, t1
+ LD b2, -1 * SIZE(BO)
+
+ ADD c06, t2, c06
+ unop
+ MUL a2, b3, t2
+ unop
+
+ ADD c07, t3, c07
+ unop
+ MUL a3, b3, t3
+ ldi AO, 8 * SIZE(AO)
+
+ ADD c08, t4, c08
+ unop
+ MUL a5, b3, t4
+ LD b3, 0 * SIZE(BO)
+
+ ADD c01, t1, c01
+ unop
+ MUL a1, b4, t1
+ LD a1, -4 * SIZE(AO)
+
+ ADD c02, t2, c02
+ unop
+ MUL a2, b4, t2
+ LD a2, -3 * SIZE(AO)
+
+ ADD c03, t3, c03
+ LD a4, -1 * SIZE(AO)
+ MUL a3, b4, t3
+ LD a3, -2 * SIZE(AO)
+
+ ADD c04, t4, c04
+ MUL a5, b4, t4
+ LD b4, 1 * SIZE(BO)
+ bgt L, $L52
+ .align 4
+
+$L55:
+ ADD c05, t1, c05
+ MUL a1, b1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L57
+#else
+ blbs TMP1, $L57
+#endif
+ .align 4
+
+ ADD c06, t2, c06
+ MUL a2, b1, t2
+ ADD c07, t3, c07
+ MUL a3, b1, t3
+
+ ADD c08, t4, c08
+ unop
+ MUL a4, b1, t4
+ LD b1, 0 * SIZE(BO)
+
+ ADD c01, t1, c01
+ unop
+ MUL a1, b2, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD c02, t2, c02
+ unop
+ MUL a2, b2, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD c03, t3, c03
+ unop
+ MUL a3, b2, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD c04, t4, c04
+ MUL a4, b2, t4
+ LD a4, 3 * SIZE(AO)
+ ldi AO, 4 * SIZE(AO)
+
+ ADD c05, t1, c05
+ LD b2, 1 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi BO, 2 * SIZE(BO)
+ .align 4
+
+$L57:
+ ADD c06, t2, c06
+ MUL a2, b1, t2
+ ADD c07, t3, c07
+ MUL a3, b1, t3
+
+ ADD c08, t4, c08
+ MUL a4, b1, t4
+ ADD c01, t1, c01
+ MUL a1, b2, t1
+
+ ADD c02, t2, c02
+ MUL a2, b2, t2
+ ADD c03, t3, c03
+ MUL a3, b2, t3
+
+ ADD c04, t4, c04
+ ldi AO, 4 * SIZE(AO)
+ MUL a4, b2, t4
+ ldi BO, 2 * SIZE(BO)
+
+ ADD c05, t1, c05
+ ADD c06, t2, c06
+ ADD c07, t3, c07
+ ADD c08, t4, c08
+ .align 4
+
+$L58:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 4, TMP1
+#else
+ subl KK, 2, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -4 * SIZE(AO)
+ ldi BO, -2 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ LD b1, 4 * SIZE(BO)
+ LD b2, 5 * SIZE(BO)
+ LD b3, 6 * SIZE(BO)
+ LD b4, 7 * SIZE(BO)
+
+ SUB a1, c01, c01
+ SUB a2, c05, c05
+ SUB a3, c02, c02
+ SUB a4, c06, c06
+
+ SUB b1, c03, c03
+ SUB b2, c07, c07
+ SUB b3, c04, c04
+ SUB b4, c08, c08
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 4 * SIZE(AO)
+ LD b2, 5 * SIZE(AO)
+ LD b3, 6 * SIZE(AO)
+ LD b4, 7 * SIZE(AO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+ SUB a3, c03, c03
+ SUB a4, c04, c04
+
+ SUB b1, c05, c05
+ SUB b2, c06, c06
+ SUB b3, c07, c07
+ SUB b4, c08, c08
+#endif
+
+#ifdef LN
+ LD a1, 15 * SIZE(AO)
+ LD a2, 14 * SIZE(AO)
+ LD a3, 13 * SIZE(AO)
+ LD a4, 12 * SIZE(AO)
+
+ MUL a1, c04, c04
+ MUL a1, c08, c08
+
+ MUL a2, c04, t1
+ MUL a2, c08, t2
+
+ SUB c03, t1, c03
+ SUB c07, t2, c07
+
+ MUL a3, c04, t1
+ MUL a3, c08, t2
+
+ SUB c02, t1, c02
+ SUB c06, t2, c06
+
+ MUL a4, c04, t1
+ MUL a4, c08, t2
+
+ SUB c01, t1, c01
+ SUB c05, t2, c05
+
+ LD b1, 10 * SIZE(AO)
+ LD b2, 9 * SIZE(AO)
+ LD b3, 8 * SIZE(AO)
+
+ MUL b1, c03, c03
+ MUL b1, c07, c07
+
+ MUL b2, c03, t1
+ MUL b2, c07, t2
+
+ SUB c02, t1, c02
+ SUB c06, t2, c06
+
+ MUL b3, c03, t1
+ MUL b3, c07, t2
+
+ SUB c01, t1, c01
+ SUB c05, t2, c05
+
+ LD a1, 5 * SIZE(AO)
+ LD a2, 4 * SIZE(AO)
+ LD a3, 0 * SIZE(AO)
+
+ MUL a1, c02, c02
+ MUL a1, c06, c06
+
+ MUL a2, c02, t1
+ MUL a2, c06, t2
+
+ SUB c01, t1, c01
+ SUB c05, t2, c05
+
+ MUL a3, c01, c01
+ MUL a3, c05, c05
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ MUL a1, c01, c01
+ MUL a1, c05, c05
+
+ MUL a2, c01, t1
+ MUL a2, c05, t2
+
+ SUB c02, t1, c02
+ SUB c06, t2, c06
+
+ MUL a3, c01, t1
+ MUL a3, c05, t2
+
+ SUB c03, t1, c03
+ SUB c07, t2, c07
+
+ MUL a4, c01, t1
+ MUL a4, c05, t2
+
+ SUB c04, t1, c04
+ SUB c08, t2, c08
+
+ LD b1, 5 * SIZE(AO)
+ LD b2, 6 * SIZE(AO)
+ LD b3, 7 * SIZE(AO)
+
+ MUL b1, c02, c02
+ MUL b1, c06, c06
+
+ MUL b2, c02, t1
+ MUL b2, c06, t2
+
+ SUB c03, t1, c03
+ SUB c07, t2, c07
+
+ MUL b3, c02, t1
+ MUL b3, c06, t2
+
+ SUB c04, t1, c04
+ SUB c08, t2, c08
+
+ LD a1, 10 * SIZE(AO)
+ LD a2, 11 * SIZE(AO)
+ LD a3, 15 * SIZE(AO)
+
+ MUL a1, c03, c03
+ MUL a1, c07, c07
+
+ MUL a2, c03, t1
+ MUL a2, c07, t2
+
+ SUB c04, t1, c04
+ SUB c08, t2, c08
+
+ MUL a3, c04, c04
+ MUL a3, c08, c08
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 3 * SIZE(BO)
+
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+ MUL a1, c03, c03
+ MUL a1, c04, c04
+
+ MUL a2, c01, t1
+ MUL a2, c02, t2
+ MUL a2, c03, t3
+ MUL a2, c04, t4
+
+ SUB c05, t1, c05
+ SUB c06, t2, c06
+ SUB c07, t3, c07
+ SUB c08, t4, c08
+
+ MUL a3, c05, c05
+ MUL a3, c06, c06
+ MUL a3, c07, c07
+ MUL a3, c08, c08
+#endif
+
+#ifdef RT
+ LD a1, 3 * SIZE(BO)
+ LD a2, 2 * SIZE(BO)
+ LD a3, 0 * SIZE(BO)
+
+ MUL a1, c05, c05
+ MUL a1, c06, c06
+ MUL a1, c07, c07
+ MUL a1, c08, c08
+
+ MUL a2, c05, t1
+ MUL a2, c06, t2
+ MUL a2, c07, t3
+ MUL a2, c08, t4
+
+ SUB c01, t1, c01
+ SUB c02, t2, c02
+ SUB c03, t3, c03
+ SUB c04, t4, c04
+
+ MUL a3, c01, c01
+ MUL a3, c02, c02
+ MUL a3, c03, c03
+ MUL a3, c04, c04
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c05, 1 * SIZE(BO)
+ ST c02, 2 * SIZE(BO)
+ ST c06, 3 * SIZE(BO)
+
+ ST c03, 4 * SIZE(BO)
+ ST c07, 5 * SIZE(BO)
+ ST c04, 6 * SIZE(BO)
+ ST c08, 7 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c03, 2 * SIZE(AO)
+ ST c04, 3 * SIZE(AO)
+
+ ST c05, 4 * SIZE(AO)
+ ST c06, 5 * SIZE(AO)
+ ST c07, 6 * SIZE(AO)
+ ST c08, 7 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -4 * SIZE(C1)
+ ldi C2, -4 * SIZE(C2)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c03, 2 * SIZE(C1)
+ ST c04, 3 * SIZE(C1)
+
+ ST c05, 0 * SIZE(C2)
+ ST c06, 1 * SIZE(C2)
+ ST c07, 2 * SIZE(C2)
+ ST c08, 3 * SIZE(C2)
+
+#ifndef LN
+ ldi C1, 4 * SIZE(C1)
+ ldi C2, 4 * SIZE(C2)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 2 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 4, KK
+#endif
+
+#ifdef LN
+ subl KK, 4, KK
+#endif
+
+ ldi I, -1(I)
+
+ bgt I, $L51
+ .align 4
+
+$L60:
+ and M, 2, I
+ ble I, $L70
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(B)
+ ldi L, -2(KK)
+ LD b2, 1 * SIZE(B)
+ ldi AO, 2 * SIZE(AO)
+
+ LD b3, 2 * SIZE(B)
+ LD b4, 3 * SIZE(B)
+ ldi BO, 2 * SIZE(B)
+
+ ble KK, $L68
+
+ ble L, $L65
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 1, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(BO)
+ ldi L, -2(TMP1)
+ LD b2, 1 * SIZE(BO)
+ ldi AO, 2 * SIZE(AO)
+
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+ ldi BO, 2 * SIZE(BO)
+
+ ble TMP1, $L68
+
+ ble L, $L65
+#endif
+ .align 4
+
+$L62:
+ ADD c01, t1, c01
+ unop
+ MUL a1, b1, t1
+ unop
+
+ ADD c02, t2, c02
+ ldi AO, 4 * SIZE(AO)
+ MUL a2, b1, t2
+ LD b1, 2 * SIZE(BO)
+
+ ADD c05, t3, c05
+ ldi L, -2(L)
+ MUL a1, b2, t3
+ LD a1, -2 * SIZE(AO)
+
+ ADD c06, t4, c06
+ unop
+ MUL a2, b2, t4
+ LD a2, -1 * SIZE(AO)
+
+ ADD c01, t1, c01
+ LD b2, 3 * SIZE(BO)
+ MUL a3, b3, t1
+ ldi BO, 4 * SIZE(BO)
+
+ ADD c02, t2, c02
+ unop
+ MUL a4, b3, t2
+ LD b3, 0 * SIZE(BO)
+
+ ADD c05, t3, c05
+ unop
+ MUL a3, b4, t3
+ LD a3, 0 * SIZE(AO)
+
+ ADD c06, t4, c06
+ MUL a4, b4, t4
+ LD b4, 1 * SIZE(BO)
+ unop
+
+ LD a4, 1 * SIZE(AO)
+ unop
+ unop
+ bgt L, $L62
+ .align 4
+
+$L65:
+ ADD c01, t1, c01
+ MUL a1, b1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L67
+#else
+ blbs TMP1, $L67
+#endif
+ .align 4
+
+ ADD c02, t2, c02
+ unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD c05, t3, c05
+ ldi BO, 2 * SIZE(BO)
+ MUL a1, b2, t3
+ LD a1, 0 * SIZE(AO)
+
+ ADD c06, t4, c06
+ unop
+ MUL a2, b2, t4
+ LD a2, 1 * SIZE(AO)
+
+ ADD c01, t1, c01
+ LD b2, -1 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi AO, 2 * SIZE(AO)
+ .align 4
+
+$L67:
+ ADD c02, t2, c02
+ MUL a2, b1, t2
+ ADD c05, t3, c05
+ MUL a1, b2, t3
+
+ ADD c06, t4, c06
+ ldi AO, 2 * SIZE(AO)
+ MUL a2, b2, t4
+ ldi BO, 2 * SIZE(BO)
+
+ ADD c01, t1, c01
+ ADD c02, t2, c02
+ ADD c05, t3, c05
+ ADD c06, t4, c06
+ .align 4
+
+$L68:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 2, TMP1
+#else
+ subl KK, 2, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -2 * SIZE(AO)
+ ldi BO, -2 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ SUB a1, c01, c01
+ SUB a2, c05, c05
+ SUB a3, c02, c02
+ SUB a4, c06, c06
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+ SUB a3, c05, c05
+ SUB a4, c06, c06
+#endif
+
+#ifdef LN
+ LD a1, 3 * SIZE(AO)
+ LD a2, 2 * SIZE(AO)
+ LD a3, 0 * SIZE(AO)
+
+ MUL a1, c02, c02
+ MUL a1, c06, c06
+
+ MUL a2, c02, t1
+ MUL a2, c06, t2
+
+ SUB c01, t1, c01
+ SUB c05, t2, c05
+
+ MUL a3, c01, c01
+ MUL a3, c05, c05
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 3 * SIZE(AO)
+
+ MUL a1, c01, c01
+ MUL a1, c05, c05
+
+ MUL a2, c01, t1
+ MUL a2, c05, t2
+
+ SUB c02, t1, c02
+ SUB c06, t2, c06
+
+ MUL a3, c02, c02
+ MUL a3, c06, c06
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 3 * SIZE(BO)
+
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+
+ MUL a2, c01, t1
+ MUL a2, c02, t2
+
+ SUB c05, t1, c05
+ SUB c06, t2, c06
+
+ MUL a3, c05, c05
+ MUL a3, c06, c06
+#endif
+
+#ifdef RT
+ LD a1, 3 * SIZE(BO)
+ LD a2, 2 * SIZE(BO)
+ LD a3, 0 * SIZE(BO)
+
+ MUL a1, c05, c05
+ MUL a1, c06, c06
+
+ MUL a2, c05, t1
+ MUL a2, c06, t2
+
+ SUB c01, t1, c01
+ SUB c02, t2, c02
+
+ MUL a3, c01, c01
+ MUL a3, c02, c02
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c05, 1 * SIZE(BO)
+ ST c02, 2 * SIZE(BO)
+ ST c06, 3 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c05, 2 * SIZE(AO)
+ ST c06, 3 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -2 * SIZE(C1)
+ ldi C2, -2 * SIZE(C2)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c05, 0 * SIZE(C2)
+ ST c06, 1 * SIZE(C2)
+
+#ifndef LN
+ ldi C1, 2 * SIZE(C1)
+ ldi C2, 2 * SIZE(C2)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 1 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 2, KK
+#endif
+
+#ifdef LN
+ subl KK, 2, KK
+#endif
+ .align 4
+
+$L70:
+ and M, 1, I
+ ble I, $L79
+
+#if defined(LT) || defined(RN)
+
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+
+ LD b1, 0 * SIZE(B)
+ fclr c02
+ LD b2, 1 * SIZE(B)
+ fclr c06
+
+ ldi L, -2(KK)
+
+ LD b3, 2 * SIZE(B)
+ ldi AO, 1 * SIZE(AO)
+ LD b4, 3 * SIZE(B)
+ ldi BO, 2 * SIZE(B)
+
+ ble KK, $L78
+
+ ble L, $L75
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 0, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+
+ LD b1, 0 * SIZE(BO)
+ fclr c02
+ LD b2, 1 * SIZE(BO)
+ fclr c06
+
+ ldi L, -2(TMP1)
+
+ LD b3, 2 * SIZE(BO)
+ ldi AO, 1 * SIZE(AO)
+ LD b4, 3 * SIZE(BO)
+ ldi BO, 2 * SIZE(BO)
+
+ ble TMP1, $L78
+
+ ble L, $L75
+#endif
+ .align 4
+
+$L72:
+ ADD c01, t1, c01
+ ldi L, -2(L)
+ MUL a1, b1, t1
+ LD b1, 2 * SIZE(BO)
+
+ ADD c05, t2, c05
+ MUL a1, b2, t2
+ LD a1, 1 * SIZE(AO)
+ LD b2, 3 * SIZE(BO)
+
+ ADD c02, t3, c02
+ ldi AO, 2 * SIZE(AO)
+ MUL a2, b3, t3
+ LD b3, 4 * SIZE(BO)
+
+ ADD c06, t4, c06
+ MUL a2, b4, t4
+ LD a2, 0 * SIZE(AO)
+ LD b4, 5 * SIZE(BO)
+
+ ldi BO, 4 * SIZE(BO)
+ unop
+ unop
+ bgt L, $L72
+ .align 4
+
+$L75:
+ ADD c01, t1, c01
+ MUL a1, b1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L77
+#else
+ blbs TMP1, $L77
+#endif
+ .align 4
+
+ ADD c05, t2, c05
+ MUL a1, b2, t2
+ LD a1, 0 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+
+ ADD c01, t1, c01
+ LD b2, 1 * SIZE(BO)
+ ldi AO, 1 * SIZE(AO)
+ MUL a1, b1, t1
+ ldi BO, 2 * SIZE(BO)
+ .align 4
+
+$L77:
+ ADD c05, t2, c05
+ MUL a1, b2, t2
+ ADD c02, t3, c02
+ ADD c06, t4, c06
+
+ ADD c01, c02, c01
+ ldi AO, 1 * SIZE(AO)
+ ADD c05, c06, c05
+ ldi BO, 2 * SIZE(BO)
+
+ ADD c01, t1, c01
+ ADD c05, t2, c05
+
+ .align 4
+
+$L78:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 1, TMP1
+#else
+ subl KK, 2, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -1 * SIZE(AO)
+ ldi BO, -2 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+
+ SUB a1, c01, c01
+ SUB a2, c05, c05
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ SUB a1, c01, c01
+ SUB a2, c05, c05
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(AO)
+
+ MUL a1, c01, c01
+ MUL a1, c05, c05
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 3 * SIZE(BO)
+
+ MUL a1, c01, c01
+ MUL a2, c01, t1
+ SUB c05, t1, c05
+ MUL a3, c05, c05
+#endif
+
+#ifdef RT
+ LD a1, 3 * SIZE(BO)
+ LD a2, 2 * SIZE(BO)
+ LD a3, 0 * SIZE(BO)
+
+ MUL a1, c05, c05
+ MUL a2, c05, t1
+ SUB c01, t1, c01
+ MUL a3, c01, c01
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c05, 1 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c05, 1 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -1 * SIZE(C1)
+ ldi C2, -1 * SIZE(C2)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c05, 0 * SIZE(C2)
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 0 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 1, KK
+#endif
+
+#ifdef LN
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L79:
+#ifdef LN
+ sll K, 1 + BASE_SHIFT, TMP1
+ addl B, TMP1, B
+#endif
+
+#if defined(LT) || defined(RN)
+ mov BO, B
+#endif
+
+#ifdef RN
+ addl KK, 2, KK
+#endif
+
+#ifdef RT
+ subl KK, 2, KK
+#endif
+ .align 4
+
+$L80:
+ and N, 1, J
+ ble J, $L999
+
+#ifdef RT
+ sll K, BASE_SHIFT, TMP1
+ subl B, TMP1, B
+
+ subl C, LDC, C
+#endif
+
+ mov C, C1
+#ifndef RT
+ addl C, LDC, C
+#endif
+
+#ifdef LN
+ addl M, OFFSET, KK
+#endif
+
+#ifdef LT
+ mov OFFSET, KK
+#endif
+
+#if defined(LN) || defined(RT)
+ mov A, AORIG
+#else
+ mov A, AO
+#endif
+
+ sra M, 2, I
+ ble I, $L100
+ .align 4
+
+$L91:
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c02
+ LD b3, 2 * SIZE(B)
+ fclr c03
+ LD b4, 3 * SIZE(B)
+ fclr c04
+
+ sra KK, 2, L
+ mov B, BO
+ ble L, $L95
+
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 2, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 2, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c02
+ LD b3, 2 * SIZE(BO)
+ fclr c03
+ LD b4, 3 * SIZE(BO)
+ fclr c04
+
+ sra TMP1, 2, L
+ unop
+ ble L, $L95
+#endif
+ .align 5
+
+$L92:
+ ADD c01, t1, c01
+ unop
+ MUL a1, b1, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD c02, t2, c02
+ ldi L, -1(L)
+ MUL a2, b1, t2
+ LD a2, 5 * SIZE(AO)
+
+ ADD c03, t3, c03
+ unop
+ MUL a3, b1, t3
+ LD a3, 6 * SIZE(AO)
+
+ ADD c04, t4, c04
+ MUL a4, b1, t4
+ LD a4, 7 * SIZE(AO)
+ LD b1, 4 * SIZE(BO)
+
+ ADD c01, t1, c01
+ unop
+ MUL a1, b2, t1
+ LD a1, 8 * SIZE(AO)
+
+ ADD c02, t2, c02
+ unop
+ MUL a2, b2, t2
+ LD a2, 9 * SIZE(AO)
+
+ ADD c03, t3, c03
+ unop
+ MUL a3, b2, t3
+ LD a3, 10 * SIZE(AO)
+
+ ADD c04, t4, c04
+ MUL a4, b2, t4
+ LD a4, 11 * SIZE(AO)
+ LD b2, 5 * SIZE(BO)
+
+ ADD c01, t1, c01
+ unop
+ MUL a1, b3, t1
+ LD a1, 12 * SIZE(AO)
+
+ ADD c02, t2, c02
+ unop
+ MUL a2, b3, t2
+ LD a2, 13 * SIZE(AO)
+
+ ADD c03, t3, c03
+ unop
+ MUL a3, b3, t3
+ LD a3, 14 * SIZE(AO)
+
+ ADD c04, t4, c04
+ MUL a4, b3, t4
+ LD a5, 15 * SIZE(AO)
+ LD b3, 6 * SIZE(BO)
+
+ ADD c01, t1, c01
+ MUL a1, b4, t1
+ LD a1, 16 * SIZE(AO)
+ ldi AO, 16 * SIZE(AO)
+
+ ADD c02, t2, c02
+ ldi BO, 4 * SIZE(BO)
+ MUL a2, b4, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD c03, t3, c03
+ LD a4, 3 * SIZE(AO)
+ MUL a3, b4, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD c04, t4, c04
+ MUL a5, b4, t4
+ LD b4, 3 * SIZE(BO)
+ bgt L, $L92
+ .align 4
+
+$L95:
+#if defined(LT) || defined(RN)
+ and KK, 3, L
+#else
+ and TMP1, 3, L
+#endif
+ unop
+ ble L, $L98
+ .align 4
+
+$L96:
+ ADD c01, t1, c01
+ ldi L, -1(L)
+ MUL a1, b1, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD c02, t2, c02
+ ldi BO, 1 * SIZE(BO)
+ MUL a2, b1, t2
+ LD a2, 5 * SIZE(AO)
+
+ ADD c03, t3, c03
+ unop
+ MUL a3, b1, t3
+ LD a3, 6 * SIZE(AO)
+
+ ADD c04, t4, c04
+ MUL a4, b1, t4
+ LD a4, 7 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+
+ ldi AO, 4 * SIZE(AO)
+ bgt L, $L96
+ .align 4
+
+$L98:
+ ADD c01, t1, c01
+ ADD c02, t2, c02
+ ADD c03, t3, c03
+ ADD c04, t4, c04
+
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 4, TMP1
+#else
+ subl KK, 1, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl B, TMP2, BO
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+ SUB a3, c03, c03
+ SUB a4, c04, c04
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+ SUB a3, c03, c03
+ SUB a4, c04, c04
+#endif
+
+#ifdef LN
+ LD a1, 15 * SIZE(AO)
+ LD a2, 14 * SIZE(AO)
+ LD a3, 13 * SIZE(AO)
+ LD a4, 12 * SIZE(AO)
+
+ MUL a1, c04, c04
+ MUL a2, c04, t1
+ SUB c03, t1, c03
+ MUL a3, c04, t1
+ SUB c02, t1, c02
+ MUL a4, c04, t1
+ SUB c01, t1, c01
+
+ LD b1, 10 * SIZE(AO)
+ LD b2, 9 * SIZE(AO)
+ LD b3, 8 * SIZE(AO)
+
+ MUL b1, c03, c03
+ MUL b2, c03, t1
+ SUB c02, t1, c02
+ MUL b3, c03, t1
+ SUB c01, t1, c01
+
+ LD a1, 5 * SIZE(AO)
+ LD a2, 4 * SIZE(AO)
+ LD a3, 0 * SIZE(AO)
+
+ MUL a1, c02, c02
+ MUL a2, c02, t1
+ SUB c01, t1, c01
+ MUL a3, c01, c01
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ MUL a1, c01, c01
+ MUL a2, c01, t1
+ SUB c02, t1, c02
+ MUL a3, c01, t1
+ SUB c03, t1, c03
+ MUL a4, c01, t1
+ SUB c04, t1, c04
+
+ LD b1, 5 * SIZE(AO)
+ LD b2, 6 * SIZE(AO)
+ LD b3, 7 * SIZE(AO)
+
+ MUL b1, c02, c02
+ MUL b2, c02, t1
+ SUB c03, t1, c03
+ MUL b3, c02, t1
+ SUB c04, t1, c04
+
+ LD a1, 10 * SIZE(AO)
+ LD a2, 11 * SIZE(AO)
+ LD a3, 15 * SIZE(AO)
+
+ MUL a1, c03, c03
+ MUL a2, c03, t1
+ SUB c04, t1, c04
+ MUL a3, c04, c04
+#endif
+
+#if defined(RN) || defined(RT)
+ LD a1, 0 * SIZE(BO)
+
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+ MUL a1, c03, c03
+ MUL a1, c04, c04
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c02, 1 * SIZE(BO)
+ ST c03, 2 * SIZE(BO)
+ ST c04, 3 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c03, 2 * SIZE(AO)
+ ST c04, 3 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -4 * SIZE(C1)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c03, 2 * SIZE(C1)
+ ST c04, 3 * SIZE(C1)
+
+#ifndef LN
+ ldi C1, 4 * SIZE(C1)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 2 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 4, KK
+#endif
+
+#ifdef LN
+ subl KK, 4, KK
+#endif
+
+ ldi I, -1(I)
+ bgt I, $L91
+ .align 4
+
+$L100:
+ and M, 2, I
+ ble I, $L110
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c02
+ LD b3, 2 * SIZE(B)
+ fclr c03
+ LD b4, 3 * SIZE(B)
+ fclr c04
+
+ sra KK, 2, L
+ mov B, BO
+ ble L, $L105
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 1, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c02
+ LD b3, 2 * SIZE(BO)
+ fclr c03
+ LD b4, 3 * SIZE(BO)
+ fclr c04
+
+ sra TMP1, 2, L
+ ble L, $L105
+#endif
+ .align 5
+
+$L102:
+ ADD c01, t1, c01
+ ldi L, -1(L)
+ MUL a1, b1, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD c02, t2, c02
+ MUL a2, b1, t2
+ LD a2, 5 * SIZE(AO)
+ LD b1, 4 * SIZE(BO)
+
+ ADD c03, t3, c03
+ ldi BO, 4 * SIZE(BO)
+ MUL a3, b2, t3
+ LD a3, 6 * SIZE(AO)
+
+ ADD c04, t4, c04
+ MUL a4, b2, t4
+ LD a5, 7 * SIZE(AO)
+ LD b2, 1 * SIZE(BO)
+
+ ADD c01, t1, c01
+ MUL a1, b3, t1
+ LD a1, 8 * SIZE(AO)
+ ldi AO, 8 * SIZE(AO)
+
+ ADD c02, t2, c02
+ MUL a2, b3, t2
+ LD b3, 2 * SIZE(BO)
+ LD a2, 1 * SIZE(AO)
+
+ ADD c03, t3, c03
+ LD a4, 3 * SIZE(AO)
+ MUL a3, b4, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD c04, t4, c04
+ MUL a5, b4, t4
+ LD b4, 3 * SIZE(BO)
+ bgt L, $L102
+ .align 4
+
+$L105:
+#if defined(LT) || defined(RN)
+ and KK, 3, L
+#else
+ and TMP1, 3, L
+#endif
+ ble L, $L108
+ .align 4
+
+$L106:
+ ADD c01, t1, c01
+ ldi L, -1(L)
+ MUL a1, b1, t1
+ LD a1, 2 * SIZE(AO)
+
+ ADD c02, t2, c02
+ MUL a2, b1, t2
+ LD a2, 3 * SIZE(AO)
+ LD b1, 1 * SIZE(BO)
+
+ ldi AO, 2 * SIZE(AO)
+ unop
+ ldi BO, 1 * SIZE(BO)
+ bgt L, $L106
+ .align 4
+
+$L108:
+ ADD c01, t1, c01
+ ADD c02, t2, c02
+ ADD c03, t3, c03
+ ADD c04, t4, c04
+
+ ADD c01, c03, c01
+ ADD c02, c04, c02
+
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 2, TMP1
+#else
+ subl KK, 1, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl B, TMP2, BO
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+#endif
+
+#ifdef LN
+ LD a1, 3 * SIZE(AO)
+ LD a2, 2 * SIZE(AO)
+ LD a3, 0 * SIZE(AO)
+
+ MUL a1, c02, c02
+ MUL a2, c02, t1
+ SUB c01, t1, c01
+ MUL a3, c01, c01
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 3 * SIZE(AO)
+
+ MUL a1, c01, c01
+ MUL a2, c01, t1
+ SUB c02, t1, c02
+ MUL a3, c02, c02
+#endif
+
+#if defined(RN) || defined(RT)
+ LD a1, 0 * SIZE(BO)
+
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c02, 1 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -2 * SIZE(C1)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+
+#ifndef LN
+ ldi C1, 2 * SIZE(C1)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 1 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 2, KK
+#endif
+
+#ifdef LN
+ subl KK, 2, KK
+#endif
+ .align 4
+
+$L110:
+ and M, 1, I
+ ble I, $L119
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c02
+ LD b3, 2 * SIZE(B)
+ fclr c03
+ LD b4, 3 * SIZE(B)
+ fclr c04
+
+ sra KK, 2, L
+ mov B, BO
+ unop
+ ble L, $L115
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 0, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c02
+ LD b3, 2 * SIZE(BO)
+ fclr c03
+ LD b4, 3 * SIZE(BO)
+ fclr c04
+
+ sra TMP1, 2, L
+ unop
+ ble L, $L115
+#endif
+ .align 4
+
+$L112:
+ ADD c01, t1, c01
+ MUL a1, b1, t1
+ LD a1, 4 * SIZE(AO)
+ LD b1, 4 * SIZE(BO)
+
+ ADD c02, t2, c02
+ MUL a2, b2, t2
+ LD a2, 5 * SIZE(AO)
+ LD b2, 5 * SIZE(BO)
+
+ ADD c03, t3, c03
+ MUL a3, b3, t3
+ LD a3, 6 * SIZE(AO)
+ LD b3, 6 * SIZE(BO)
+
+ ADD c04, t4, c04
+ MUL a4, b4, t4
+ LD a4, 7 * SIZE(AO)
+ LD b4, 7 * SIZE(BO)
+
+ ldi L, -1(L)
+ ldi AO, 4 * SIZE(AO)
+ ldi BO, 4 * SIZE(BO)
+ bgt L, $L112
+ .align 4
+
+$L115:
+#if defined(LT) || defined(RN)
+ and KK, 3, L
+#else
+ and TMP1, 3, L
+#endif
+ ble L, $L118
+ .align 4
+
+$L116:
+ ADD c01, t1, c01
+ MUL a1, b1, t1
+ LD a1, 1 * SIZE(AO)
+ LD b1, 1 * SIZE(BO)
+
+ ldi L, -1(L)
+ ldi AO, 1 * SIZE(AO)
+ ldi BO, 1 * SIZE(BO)
+ bgt L, $L116
+ .align 4
+
+$L118:
+ ADD c01, t1, c01
+ ADD c02, t2, c02
+ ADD c03, t3, c03
+ ADD c04, t4, c04
+
+ ADD c01, c02, c01
+ ADD c03, c04, c03
+ ADD c01, c03, c01
+
+#if defined(LN) || defined(RT)
+ subl KK, 1, TMP1
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AORIG, TMP2, AO
+ addl B, TMP2, BO
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+
+ SUB a1, c01, c01
+#else
+ LD a1, 0 * SIZE(AO)
+
+ SUB a1, c01, c01
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(AO)
+
+ MUL a1, c01, c01
+#endif
+
+#if defined(RN) || defined(RT)
+ LD a1, 0 * SIZE(BO)
+
+ MUL a1, c01, c01
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -1 * SIZE(C1)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+
+#ifndef LN
+ ldi C1, 1 * SIZE(C1)
+#endif
+
+#ifdef RT
+ SXADDQ K, AORIG, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AO, TMP2, AO
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 1, KK
+#endif
+
+#ifdef LN
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L119:
+#ifdef LN
+ SXADDQ K, B, B
+#endif
+
+#if defined(LT) || defined(RN)
+ mov BO, B
+#endif
+
+#ifdef RN
+ addl KK, 1, KK
+#endif
+
+#ifdef RT
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L999:
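+/* epilogue: restore saved $f2-$f9, clear the return value, pop the frame, return */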
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+ clr $0
+ ldi $sp, STACKSIZE($sp)
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/trsm_kernel_4x4_RT.S b/kernel/sw_64/trsm_kernel_4x4_RT.S
new file mode 100644
index 0000000..b9a1975
--- /dev/null
+++ b/kernel/sw_64/trsm_kernel_4x4_RT.S
@@ -0,0 +1,5148 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#if !defined(EV4) && !defined(EV5) && !defined(SW6)
+#error "Architecture is not specified."
+#endif
+
+#ifdef SW6
+#define PREFETCHSIZE 56
+#define UNOP unop
+#endif
+
+#ifdef EV5
+#define PREFETCHSIZE 56
+#define UNOP
+#endif
+
+#ifdef EV4
+#define UNOP
+#endif
+
+#define STACKSIZE 88
+
+#define M $16
+#define N $17
+#define K $18
+#define A $20
+#define B $21
+#define C $22
+#define LDC $23
+
+#define C1 $19
+#define C2 $24
+#define C3 $25
+#define C4 $27
+
+#define AO $at
+#define BO $5
+#define I $6
+#define J $7
+#define L $8
+
+#define a1 $f16
+#define a2 $f17
+#define a3 $f18
+#define a4 $f19
+
+#define b1 $f20
+#define b2 $f21
+#define b3 $f22
+#define b4 $f23
+
+#define t1 $f24
+#define t2 $f25
+#define t3 $f26
+#define t4 $f27
+
+#define a5 $f28
+#define a6 $f30
+#define b5 $f29
+
+#define alpha $f30
+
+#define c01 $f0
+#define c02 $f1
+#define c03 $f2
+#define c04 $f3
+
+#define c05 $f4
+#define c06 $f5
+#define c07 $f6
+#define c08 $f7
+
+#define c09 $f8
+#define c10 $f9
+#define c11 $f10
+#define c12 $f11
+
+#define c13 $f12
+#define c14 $f13
+#define c15 $f14
+#define c16 $f15
+
+#define tmp $9
+
+#define TMP1 $0
+#define TMP2 $1
+#define KK $2
+#define AORIG $3
+#define OFFSET $4
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, STACKSIZE, $26, 0
+
+ ldi $sp, -STACKSIZE($sp)
+
+ ldl C, 0 + STACKSIZE($sp)
+ ldl LDC, 8 + STACKSIZE($sp)
+ ldl OFFSET, 16 + STACKSIZE($sp)
+
+ SXADDQ LDC, 0, LDC
+
+ fstd $f2, 0($sp)
+ fstd $f3, 8($sp)
+ fstd $f4, 16($sp)
+ fstd $f5, 24($sp)
+ fstd $f6, 32($sp)
+ fstd $f7, 40($sp)
+ fstd $f8, 48($sp)
+ fstd $f9, 56($sp)
+ stl $9, 64($sp)
+
+ cmple M, 0, $0
+ cmple N, 0, $1
+ cmple K, 0, $2
+
+ or $0, $1, $0
+ or $0, $2, $0
+ bne $0, $L999
+
+#ifdef LN
+ mulq M, K, TMP1
+ SXADDQ TMP1, A, A
+ SXADDQ M, C, C
+#endif
+
+#ifdef RN
+ negq OFFSET, KK
+#endif
+
+#ifdef RT
+ mull N, K, TMP1
+ SXADDQ TMP1, B, B
+
+ mull N, LDC, TMP1
+ addl TMP1, C, C
+
+ subl N, OFFSET, KK
+#endif
+
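+/* N & 1: single remaining column. Under RT the B and C pointers are stepped */
+/* backwards, so the column remainders are handled before the N/4 main loop. */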
+ and N, 1, J
+ ble J, $L40
+
+#ifdef RT
+ sll K, BASE_SHIFT, TMP1
+ subl B, TMP1, B
+
+ subl C, LDC, C
+#endif
+
+ mov C, C1
+#ifndef RT
+ addl C, LDC, C
+#endif
+
+#ifdef LN
+ addl M, OFFSET, KK
+#endif
+
+#ifdef LT
+ mov OFFSET, KK
+#endif
+
+#if defined(LN) || defined(RT)
+ mov A, AORIG
+#else
+ mov A, AO
+#endif
+
+ sra M, 2, I
+ ble I, $L100
+ .align 4
+
+$L91:
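+/* 4x1 block: four rows of A against the single column of B */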
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c02
+ LD b3, 2 * SIZE(B)
+ fclr c03
+ LD b4, 3 * SIZE(B)
+ fclr c04
+
+ sra KK, 2, L
+ mov B, BO
+ ble L, $L95
+
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 2, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 2, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c02
+ LD b3, 2 * SIZE(BO)
+ fclr c03
+ LD b4, 3 * SIZE(BO)
+ fclr c04
+
+ sra TMP1, 2, L
+ unop
+ ble L, $L95
+#endif
+ .align 5
+
+$L92:
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b1, b5
+ fmov b5, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ ldi L, -1(L)
+ MUL a2, b1, b5
+ fmov b5, t2
+ LD a2, 5 * SIZE(AO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ unop
+ MUL a3, b1, b5
+ fmov b5, t3
+ LD a3, 6 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a4, b1, b5
+ fmov b5, t4
+ LD a4, 7 * SIZE(AO)
+ LD b1, 4 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b2, b5
+ fmov b5, t1
+ LD a1, 8 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b2, b5
+ fmov b5, t2
+ LD a2, 9 * SIZE(AO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ unop
+ MUL a3, b2, b5
+ fmov b5, t3
+ LD a3, 10 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a4, b2, b5
+ fmov b5, t4
+ LD a4, 11 * SIZE(AO)
+ LD b2, 5 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b3, b5
+ fmov b5, t1
+ LD a1, 12 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b3, b5
+ fmov b5, t2
+ LD a2, 13 * SIZE(AO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ unop
+ MUL a3, b3, b5
+ fmov b5, t3
+ LD a3, 14 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a4, b3, b5
+ fmov b5, t4
+ LD a5, 15 * SIZE(AO)
+ LD b3, 6 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL a1, b4, b5
+ fmov b5, t1
+ LD a1, 16 * SIZE(AO)
+ ldi AO, 16 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ ldi BO, 4 * SIZE(BO)
+ MUL a2, b4, b5
+ fmov b5, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ LD a4, 3 * SIZE(AO)
+ MUL a3, b4, b5
+ fmov b5, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a5, b4, b5
+ fmov b5, t4
+ LD b4, 3 * SIZE(BO)
+ bgt L, $L92
+ .align 4
+
+$L95:
+#if defined(LT) || defined(RN)
+ and KK, 3, L
+#else
+ and TMP1, 3, L
+#endif
+ unop
+ ble L, $L98
+ .align 4
+
+$L96:
+ ADD c01, t1, b5
+ fmov b5, c01
+ ldi L, -1(L)
+ MUL a1, b1, b5
+ fmov b5, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ ldi BO, 1 * SIZE(BO)
+ MUL a2, b1, b5
+ fmov b5, t2
+ LD a2, 5 * SIZE(AO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ unop
+ MUL a3, b1, b5
+ fmov b5, t3
+ LD a3, 6 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a4, b1, b5
+ fmov b5, t4
+ LD a4, 7 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+
+ ldi AO, 4 * SIZE(AO)
+ bgt L, $L96
+ .align 4
+
+$L98:
+ ADD c01, t1, b5
+ fmov b5, c01
+ ADD c02, t2, b5
+ fmov b5, c02
+ ADD c03, t3, b5
+ fmov b5, c03
+ ADD c04, t4, b5
+ fmov b5, c04
+
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 4, TMP1
+#else
+ subl KK, 1, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl B, TMP2, BO
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+ SUB a3, c03, b5
+ fmov b5, c03
+ SUB a4, c04, b5
+ fmov b5, c04
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+ SUB a3, c03, b5
+ fmov b5, c03
+ SUB a4, c04, b5
+ fmov b5, c04
+#endif
+
+#ifdef LN
+ LD a1, 15 * SIZE(AO)
+ LD a2, 14 * SIZE(AO)
+ LD a3, 13 * SIZE(AO)
+ LD a4, 12 * SIZE(AO)
+
+ MUL a1, c04, b5
+ fmov b5, c04
+ MUL a2, c04, b5
+ fmov b5, t1
+ SUB c03, t1, b5
+ fmov b5, c03
+ MUL a3, c04, b5
+ fmov b5, t1
+ SUB c02, t1, b5
+ fmov b5, c02
+ MUL a4, c04, b5
+ fmov b5, t1
+ SUB c01, t1, b5
+ fmov b5, c01
+
+ LD b1, 10 * SIZE(AO)
+ LD b2, 9 * SIZE(AO)
+ LD b3, 8 * SIZE(AO)
+
+ MUL b1, c03, b5
+ fmov b5, c03
+ MUL b2, c03, b5
+ fmov b5, t1
+ SUB c02, t1, b5
+ fmov b5, c02
+ MUL b3, c03, b5
+ fmov b5, t1
+ SUB c01, t1, b5
+ fmov b5, c01
+
+ LD a1, 5 * SIZE(AO)
+ LD a2, 4 * SIZE(AO)
+ LD a3, 0 * SIZE(AO)
+
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a2, c02, b5
+ fmov b5, t1
+ SUB c01, t1, b5
+ fmov b5, c01
+ MUL a3, c01, b5
+ fmov b5, c01
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a2, c01, b5
+ fmov b5, t1
+ SUB c02, t1, b5
+ fmov b5, c02
+ MUL a3, c01, b5
+ fmov b5, t1
+ SUB c03, t1, b5
+ fmov b5, c03
+ MUL a4, c01, b5
+ fmov b5, t1
+ SUB c04, t1, b5
+ fmov b5, c04
+
+ LD b1, 5 * SIZE(AO)
+ LD b2, 6 * SIZE(AO)
+ LD b3, 7 * SIZE(AO)
+
+ MUL b1, c02, b5
+ fmov b5, c02
+ MUL b2, c02, b5
+ fmov b5, t1
+ SUB c03, t1, b5
+ fmov b5, c03
+ MUL b3, c02, b5
+ fmov b5, t1
+ SUB c04, t1, b5
+ fmov b5, c04
+
+ LD a1, 10 * SIZE(AO)
+ LD a2, 11 * SIZE(AO)
+ LD a3, 15 * SIZE(AO)
+
+ MUL a1, c03, b5
+ fmov b5, c03
+ MUL a2, c03, b5
+ fmov b5, t1
+ SUB c04, t1, b5
+ fmov b5, c04
+ MUL a3, c04, b5
+ fmov b5, c04
+#endif
+
+#if defined(RN) || defined(RT)
+ LD a1, 0 * SIZE(BO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a1, c03, b5
+ fmov b5, c03
+ MUL a1, c04, b5
+ fmov b5, c04
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c02, 1 * SIZE(BO)
+ ST c03, 2 * SIZE(BO)
+ ST c04, 3 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c03, 2 * SIZE(AO)
+ ST c04, 3 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -4 * SIZE(C1)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c03, 2 * SIZE(C1)
+ ST c04, 3 * SIZE(C1)
+
+#ifndef LN
+ ldi C1, 4 * SIZE(C1)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 2 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 4, KK
+#endif
+
+#ifdef LN
+ subl KK, 4, KK
+#endif
+
+ ldi I, -1(I)
+ bgt I, $L91
+ .align 4
+
+$L100:
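+/* M & 2: two remaining rows for the single column */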
+ and M, 2, I
+ ble I, $L110
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c02
+ LD b3, 2 * SIZE(B)
+ fclr c03
+ LD b4, 3 * SIZE(B)
+ fclr c04
+
+ sra KK, 2, L
+ mov B, BO
+ ble L, $L105
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 1, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c02
+ LD b3, 2 * SIZE(BO)
+ fclr c03
+ LD b4, 3 * SIZE(BO)
+ fclr c04
+
+ sra TMP1, 2, L
+ ble L, $L105
+#endif
+ .align 5
+
+$L102:
+ ADD c01, t1, b5
+ fmov b5, c01
+ ldi L, -1(L)
+ MUL a1, b1, b5
+ fmov b5, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ MUL a2, b1, b5
+ fmov b5, t2
+ LD a2, 5 * SIZE(AO)
+ LD b1, 4 * SIZE(BO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ ldi BO, 4 * SIZE(BO)
+ MUL a3, b2, b5
+ fmov b5, t3
+ LD a3, 6 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a4, b2, b5
+ fmov b5, t4
+ LD a5, 7 * SIZE(AO)
+ LD b2, 1 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL a1, b3, b5
+ fmov b5, t1
+ LD a1, 8 * SIZE(AO)
+ ldi AO, 8 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ MUL a2, b3, b5
+ fmov b5, t2
+ LD b3, 2 * SIZE(BO)
+ LD a2, 1 * SIZE(AO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ LD a4, 3 * SIZE(AO)
+ MUL a3, b4, b5
+ fmov b5, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a5, b4, b5
+ fmov b5, t4
+ LD b4, 3 * SIZE(BO)
+ bgt L, $L102
+ .align 4
+
+$L105:
+#if defined(LT) || defined(RN)
+ and KK, 3, L
+#else
+ and TMP1, 3, L
+#endif
+ ble L, $L108
+ .align 4
+
+$L106:
+ ADD c01, t1, b5
+ fmov b5, c01
+ ldi L, -1(L)
+ MUL a1, b1, b5
+ fmov b5, t1
+ LD a1, 2 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ MUL a2, b1, b5
+ fmov b5, t2
+ LD a2, 3 * SIZE(AO)
+ LD b1, 1 * SIZE(BO)
+
+ ldi AO, 2 * SIZE(AO)
+ unop
+ ldi BO, 1 * SIZE(BO)
+ bgt L, $L106
+ .align 4
+
+$L108:
+ ADD c01, t1, b5
+ fmov b5, c01
+ ADD c02, t2, b5
+ fmov b5, c02
+ ADD c03, t3, b5
+ fmov b5, c03
+ ADD c04, t4, b5
+ fmov b5, c04
+
+ ADD c01, c03, b5
+ fmov b5, c01
+ ADD c02, c04, b5
+ fmov b5, c02
+
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 2, TMP1
+#else
+ subl KK, 1, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl B, TMP2, BO
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+#endif
+
+#ifdef LN
+ LD a1, 3 * SIZE(AO)
+ LD a2, 2 * SIZE(AO)
+ LD a3, 0 * SIZE(AO)
+
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a2, c02, b5
+ fmov b5, t1
+ SUB c01, t1, b5
+ fmov b5, c01
+ MUL a3, c01, b5
+ fmov b5, c01
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 3 * SIZE(AO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a2, c01, b5
+ fmov b5, t1
+ SUB c02, t1, b5
+ fmov b5, c02
+ MUL a3, c02, b5
+ fmov b5, c02
+#endif
+
+#if defined(RN) || defined(RT)
+ LD a1, 0 * SIZE(BO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c02, 1 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -2 * SIZE(C1)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+
+#ifndef LN
+ ldi C1, 2 * SIZE(C1)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 1 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 2, KK
+#endif
+
+#ifdef LN
+ subl KK, 2, KK
+#endif
+ .align 4
+
+$L110:
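+/* M & 1: one remaining row for the single column */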
+ and M, 1, I
+ ble I, $L119
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c02
+ LD b3, 2 * SIZE(B)
+ fclr c03
+ LD b4, 3 * SIZE(B)
+ fclr c04
+
+ sra KK, 2, L
+ mov B, BO
+ unop
+ ble L, $L115
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 0, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c02
+ LD b3, 2 * SIZE(BO)
+ fclr c03
+ LD b4, 3 * SIZE(BO)
+ fclr c04
+
+ sra TMP1, 2, L
+ unop
+ ble L, $L115
+#endif
+ .align 4
+
+$L112:
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL a1, b1, b5
+ fmov b5, t1
+ LD a1, 4 * SIZE(AO)
+ LD b1, 4 * SIZE(BO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ MUL a2, b2, b5
+ fmov b5, t2
+ LD a2, 5 * SIZE(AO)
+ LD b2, 5 * SIZE(BO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ MUL a3, b3, b5
+ fmov b5, t3
+ LD a3, 6 * SIZE(AO)
+ LD b3, 6 * SIZE(BO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a4, b4, b5
+ fmov b5, t4
+ LD a4, 7 * SIZE(AO)
+ LD b4, 7 * SIZE(BO)
+
+ ldi L, -1(L)
+ ldi AO, 4 * SIZE(AO)
+ ldi BO, 4 * SIZE(BO)
+ bgt L, $L112
+ .align 4
+
+$L115:
+#if defined(LT) || defined(RN)
+ and KK, 3, L
+#else
+ and TMP1, 3, L
+#endif
+ ble L, $L118
+ .align 4
+
+$L116:
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL a1, b1, b5
+ fmov b5, t1
+ LD a1, 1 * SIZE(AO)
+ LD b1, 1 * SIZE(BO)
+
+ ldi L, -1(L)
+ ldi AO, 1 * SIZE(AO)
+ ldi BO, 1 * SIZE(BO)
+ bgt L, $L116
+ .align 4
+
+$L118:
+ ADD c01, t1, b5
+ fmov b5, c01
+ ADD c02, t2, b5
+ fmov b5, c02
+ ADD c03, t3, b5
+ fmov b5, c03
+ ADD c04, t4, b5
+ fmov b5, c04
+
+ ADD c01, c02, b5
+ fmov b5, c01
+ ADD c03, c04, b5
+ fmov b5, c03
+ ADD c01, c03, b5
+ fmov b5, c01
+
+#if defined(LN) || defined(RT)
+ subl KK, 1, TMP1
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AORIG, TMP2, AO
+ addl B, TMP2, BO
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+#else
+ LD a1, 0 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(AO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+#endif
+
+#if defined(RN) || defined(RT)
+ LD a1, 0 * SIZE(BO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -1 * SIZE(C1)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+
+#ifndef LN
+ ldi C1, 1 * SIZE(C1)
+#endif
+
+#ifdef RT
+ SXADDQ K, AORIG, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AO, TMP2, AO
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 1, KK
+#endif
+
+#ifdef LN
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L119:
+#ifdef LN
+ SXADDQ K, B, B
+#endif
+
+#if defined(LT) || defined(RN)
+ mov BO, B
+#endif
+
+#ifdef RN
+ addl KK, 1, KK
+#endif
+
+#ifdef RT
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L40:
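+/* N & 2: two columns of B/C (C1, C2) */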
+ and N, 2, J
+ ble J, $L80
+
+#ifdef RT
+ sll K, 1 + BASE_SHIFT, TMP1
+ subl B, TMP1, B
+
+ addl LDC, LDC, TMP1
+ subl C, TMP1, C
+#endif
+
+ mov C, C1
+ addl C, LDC, C2
+ fclr t1
+#ifndef RT
+ addl C2, LDC, C
+#endif
+ fclr t2
+
+#ifdef LN
+ addl M, OFFSET, KK
+#endif
+
+#ifdef LT
+ mov OFFSET, KK
+#endif
+
+#if defined(LN) || defined(RT)
+ mov A, AORIG
+#else
+ mov A, AO
+#endif
+
+ sra M, 2, I
+ fclr t3
+ fclr t4
+ ble I, $L60
+ .align 4
+
+$L51:
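+/* 4x2 block */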
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c03
+ LD a2, 1 * SIZE(AO)
+ fclr c07
+ LD a3, 2 * SIZE(AO)
+ fclr c04
+ LD a4, 3 * SIZE(AO)
+ fclr c08
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c05
+ LD b3, 2 * SIZE(B)
+ fclr c02
+ LD b4, 3 * SIZE(B)
+ fclr c06
+
+ ldi L, -2(KK)
+
+ ldi BO, 2 * SIZE(B)
+ ldi AO, 4 * SIZE(AO)
+
+ ble KK, $L58
+
+ ble L, $L55
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 2, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 2, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c03
+ LD a2, 1 * SIZE(AO)
+ fclr c07
+ LD a3, 2 * SIZE(AO)
+ fclr c04
+ LD a4, 3 * SIZE(AO)
+ fclr c08
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c05
+ LD b3, 2 * SIZE(BO)
+ fclr c02
+ LD b4, 3 * SIZE(BO)
+ fclr c06
+
+ ldi L, -2(TMP1)
+ ldi BO, 2 * SIZE(BO)
+ ldi AO, 4 * SIZE(AO)
+
+ ble TMP1, $L58
+
+ ble L, $L55
+#endif
+ .align 4
+
+$L52:
+ ADD c05, t1, b5
+ fmov b5, c05
+ unop
+ MUL a1, b1, b5
+ fmov b5, t1
+ unop
+
+ ADD c06, t2, b5
+ fmov b5, c06
+ ldi L, -2(L)
+ MUL a2, b1, b5
+ fmov b5, t2
+ unop
+
+ ADD c07, t3, b5
+ fmov b5, c07
+ unop
+ MUL a3, b1, b5
+ fmov b5, t3
+ unop
+
+ ADD c08, t4, b5
+ fmov b5, c08
+ unop
+ MUL a4, b1, b5
+ fmov b5, t4
+ LD b1, 2 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b2, b5
+ fmov b5, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ ldi BO, 4 * SIZE(BO)
+ MUL a2, b2, b5
+ fmov b5, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ unop
+ MUL a3, b2, b5
+ fmov b5, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ unop
+ MUL a4, b2, b5
+ fmov b5, t4
+ LD a5, 3 * SIZE(AO)
+
+ ADD c05, t1, b5
+ fmov b5, c05
+ unop
+ MUL a1, b3, b5
+ fmov b5, t1
+ LD b2, -1 * SIZE(BO)
+
+ ADD c06, t2, b5
+ fmov b5, c06
+ unop
+ MUL a2, b3, b5
+ fmov b5, t2
+ unop
+
+ ADD c07, t3, b5
+ fmov b5, c07
+ unop
+ MUL a3, b3, b5
+ fmov b5, t3
+ ldi AO, 8 * SIZE(AO)
+
+ ADD c08, t4, b5
+ fmov b5, c08
+ unop
+ MUL a5, b3, b5
+ fmov b5, t4
+ LD b3, 0 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b4, b5
+ fmov b5, t1
+ LD a1, -4 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b4, b5
+ fmov b5, t2
+ LD a2, -3 * SIZE(AO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ LD a4, -1 * SIZE(AO)
+ MUL a3, b4, b5
+ fmov b5, t3
+ LD a3, -2 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a5, b4, b5
+ fmov b5, t4
+ LD b4, 1 * SIZE(BO)
+ bgt L, $L52
+ .align 4
+
+$L55:
+ ADD c05, t1, b5
+ fmov b5, c05
+ MUL a1, b1, b5
+ fmov b5, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L57
+#else
+ blbs TMP1, $L57
+#endif
+ .align 4
+
+ ADD c06, t2, b5
+ fmov b5, c06
+ MUL a2, b1, b5
+ fmov b5, t2
+ ADD c07, t3, b5
+ fmov b5, c07
+ MUL a3, b1, b5
+ fmov b5, t3
+
+ ADD c08, t4, b5
+ fmov b5, c08
+ unop
+ MUL a4, b1, b5
+ fmov b5, t4
+ LD b1, 0 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b2, b5
+ fmov b5, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b2, b5
+ fmov b5, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD c03, t3, b5
+ fmov b5, c03
+ unop
+ MUL a3, b2, b5
+ fmov b5, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ MUL a4, b2, b5
+ fmov b5, t4
+ LD a4, 3 * SIZE(AO)
+ ldi AO, 4 * SIZE(AO)
+
+ ADD c05, t1, b5
+ fmov b5, c05
+ LD b2, 1 * SIZE(BO)
+ MUL a1, b1, b5
+ fmov b5, t1
+ ldi BO, 2 * SIZE(BO)
+ .align 4
+
+$L57:
+ ADD c06, t2, b5
+ fmov b5, c06
+ MUL a2, b1, b5
+ fmov b5, t2
+ ADD c07, t3, b5
+ fmov b5, c07
+ MUL a3, b1, b5
+ fmov b5, t3
+
+ ADD c08, t4, b5
+ fmov b5, c08
+ MUL a4, b1, b5
+ fmov b5, t4
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL a1, b2, b5
+ fmov b5, t1
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ MUL a2, b2, b5
+ fmov b5, t2
+ ADD c03, t3, b5
+ fmov b5, c03
+ MUL a3, b2, b5
+ fmov b5, t3
+
+ ADD c04, t4, b5
+ fmov b5, c04
+ ldi AO, 4 * SIZE(AO)
+ MUL a4, b2, b5
+ fmov b5, t4
+ ldi BO, 2 * SIZE(BO)
+
+ ADD c05, t1, b5
+ fmov b5, c05
+ ADD c06, t2, b5
+ fmov b5, c06
+ ADD c07, t3, b5
+ fmov b5, c07
+ ADD c08, t4, b5
+ fmov b5, c08
+ .align 4
+
+$L58:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 4, TMP1
+#else
+ subl KK, 2, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -4 * SIZE(AO)
+ ldi BO, -2 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ LD b1, 4 * SIZE(BO)
+ LD b2, 5 * SIZE(BO)
+ LD b3, 6 * SIZE(BO)
+ LD b4, 7 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c05, b5
+ fmov b5, c05
+ SUB a3, c02, b5
+ fmov b5, c02
+ SUB a4, c06, b5
+ fmov b5, c06
+
+ SUB b1, c03, b5
+ fmov b5, c03
+ SUB b2, c07, b5
+ fmov b5, c07
+ SUB b3, c04, b5
+ fmov b5, c04
+ SUB b4, c08, b5
+ fmov b5, c08
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 4 * SIZE(AO)
+ LD b2, 5 * SIZE(AO)
+ LD b3, 6 * SIZE(AO)
+ LD b4, 7 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+ SUB a3, c03, b5
+ fmov b5, c03
+ SUB a4, c04, b5
+ fmov b5, c04
+
+ SUB b1, c05, b5
+ fmov b5, c05
+ SUB b2, c06, b5
+ fmov b5, c06
+ SUB b3, c07, b5
+ fmov b5, c07
+ SUB b4, c08, b5
+ fmov b5, c08
+#endif
+
+#ifdef LN
+ LD a1, 15 * SIZE(AO)
+ LD a2, 14 * SIZE(AO)
+ LD a3, 13 * SIZE(AO)
+ LD a4, 12 * SIZE(AO)
+
+ MUL a1, c04, b5
+ fmov b5, c04
+ MUL a1, c08, b5
+ fmov b5, c08
+
+ MUL a2, c04, b5
+ fmov b5, t1
+ MUL a2, c08, b5
+ fmov b5, t2
+
+ SUB c03, t1, b5
+ fmov b5, c03
+ SUB c07, t2, b5
+ fmov b5, c07
+
+ MUL a3, c04, b5
+ fmov b5, t1
+ MUL a3, c08, b5
+ fmov b5, t2
+
+ SUB c02, t1, b5
+ fmov b5, c02
+ SUB c06, t2, b5
+ fmov b5, c06
+
+ MUL a4, c04, b5
+ fmov b5, t1
+ MUL a4, c08, b5
+ fmov b5, t2
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c05, t2, b5
+ fmov b5, c05
+
+ LD b1, 10 * SIZE(AO)
+ LD b2, 9 * SIZE(AO)
+ LD b3, 8 * SIZE(AO)
+
+ MUL b1, c03, b5
+ fmov b5, c03
+ MUL b1, c07, b5
+ fmov b5, c07
+
+ MUL b2, c03, b5
+ fmov b5, t1
+ MUL b2, c07, b5
+ fmov b5, t2
+
+ SUB c02, t1, b5
+ fmov b5, c02
+ SUB c06, t2, b5
+ fmov b5, c06
+
+ MUL b3, c03, b5
+ fmov b5, t1
+ MUL b3, c07, b5
+ fmov b5, t2
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c05, t2, b5
+ fmov b5, c05
+
+ LD a1, 5 * SIZE(AO)
+ LD a2, 4 * SIZE(AO)
+ LD a3, 0 * SIZE(AO)
+
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a1, c06, b5
+ fmov b5, c06
+
+ MUL a2, c02, b5
+ fmov b5, t1
+ MUL a2, c06, b5
+ fmov b5, t2
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c05, t2, b5
+ fmov b5, c05
+
+ MUL a3, c01, b5
+ fmov b5, c01
+ MUL a3, c05, b5
+ fmov b5, c05
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c05, b5
+ fmov b5, c05
+
+ MUL a2, c01, b5
+ fmov b5, t1
+ MUL a2, c05, b5
+ fmov b5, t2
+
+ SUB c02, t1, b5
+ fmov b5, c02
+ SUB c06, t2, b5
+ fmov b5, c06
+
+ MUL a3, c01, b5
+ fmov b5, t1
+ MUL a3, c05, b5
+ fmov b5, t2
+
+ SUB c03, t1, b5
+ fmov b5, c03
+ SUB c07, t2, b5
+ fmov b5, c07
+
+ MUL a4, c01, b5
+ fmov b5, t1
+ MUL a4, c05, b5
+ fmov b5, t2
+
+ SUB c04, t1, b5
+ fmov b5, c04
+ SUB c08, t2, b5
+ fmov b5, c08
+
+ LD b1, 5 * SIZE(AO)
+ LD b2, 6 * SIZE(AO)
+ LD b3, 7 * SIZE(AO)
+
+ MUL b1, c02, b5
+ fmov b5, c02
+ MUL b1, c06, b5
+ fmov b5, c06
+
+ MUL b2, c02, b5
+ fmov b5, t1
+ MUL b2, c06, b5
+ fmov b5, t2
+
+ SUB c03, t1, b5
+ fmov b5, c03
+ SUB c07, t2, b5
+ fmov b5, c07
+
+ MUL b3, c02, b5
+ fmov b5, t1
+ MUL b3, c06, b5
+ fmov b5, t2
+
+ SUB c04, t1, b5
+ fmov b5, c04
+ SUB c08, t2, b5
+ fmov b5, c08
+
+ LD a1, 10 * SIZE(AO)
+ LD a2, 11 * SIZE(AO)
+ LD a3, 15 * SIZE(AO)
+
+ MUL a1, c03, b5
+ fmov b5, c03
+ MUL a1, c07, b5
+ fmov b5, c07
+
+ MUL a2, c03, b5
+ fmov b5, t1
+ MUL a2, c07, b5
+ fmov b5, t2
+
+ SUB c04, t1, b5
+ fmov b5, c04
+ SUB c08, t2, b5
+ fmov b5, c08
+
+ MUL a3, c04, b5
+ fmov b5, c04
+ MUL a3, c08, b5
+ fmov b5, c08
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 3 * SIZE(BO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a1, c03, b5
+ fmov b5, c03
+ MUL a1, c04, b5
+ fmov b5, c04
+
+ MUL a2, c01, b5
+ fmov b5, t1
+ MUL a2, c02, b5
+ fmov b5, t2
+ MUL a2, c03, b5
+ fmov b5, t3
+ MUL a2, c04, b5
+ fmov b5, t4
+
+ SUB c05, t1, b5
+ fmov b5, c05
+ SUB c06, t2, b5
+ fmov b5, c06
+ SUB c07, t3, b5
+ fmov b5, c07
+ SUB c08, t4, b5
+ fmov b5, c08
+
+ MUL a3, c05, b5
+ fmov b5, c05
+ MUL a3, c06, b5
+ fmov b5, c06
+ MUL a3, c07, b5
+ fmov b5, c07
+ MUL a3, c08, b5
+ fmov b5, c08
+#endif
+
+#ifdef RT
+ LD a1, 3 * SIZE(BO)
+ LD a2, 2 * SIZE(BO)
+ LD a3, 0 * SIZE(BO)
+
+ MUL a1, c05, b5
+ fmov b5, c05
+ MUL a1, c06, b5
+ fmov b5, c06
+ MUL a1, c07, b5
+ fmov b5, c07
+ MUL a1, c08, b5
+ fmov b5, c08
+
+ MUL a2, c05, b5
+ fmov b5, t1
+ MUL a2, c06, b5
+ fmov b5, t2
+ MUL a2, c07, b5
+ fmov b5, t3
+ MUL a2, c08, b5
+ fmov b5, t4
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c02, t2, b5
+ fmov b5, c02
+ SUB c03, t3, b5
+ fmov b5, c03
+ SUB c04, t4, b5
+ fmov b5, c04
+
+ MUL a3, c01, b5
+ fmov b5, c01
+ MUL a3, c02, b5
+ fmov b5, c02
+ MUL a3, c03, b5
+ fmov b5, c03
+ MUL a3, c04, b5
+ fmov b5, c04
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c05, 1 * SIZE(BO)
+ ST c02, 2 * SIZE(BO)
+ ST c06, 3 * SIZE(BO)
+
+ ST c03, 4 * SIZE(BO)
+ ST c07, 5 * SIZE(BO)
+ ST c04, 6 * SIZE(BO)
+ ST c08, 7 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c03, 2 * SIZE(AO)
+ ST c04, 3 * SIZE(AO)
+
+ ST c05, 4 * SIZE(AO)
+ ST c06, 5 * SIZE(AO)
+ ST c07, 6 * SIZE(AO)
+ ST c08, 7 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -4 * SIZE(C1)
+ ldi C2, -4 * SIZE(C2)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c03, 2 * SIZE(C1)
+ ST c04, 3 * SIZE(C1)
+
+ ST c05, 0 * SIZE(C2)
+ ST c06, 1 * SIZE(C2)
+ ST c07, 2 * SIZE(C2)
+ ST c08, 3 * SIZE(C2)
+
+#ifndef LN
+ ldi C1, 4 * SIZE(C1)
+ ldi C2, 4 * SIZE(C2)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 2 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 4, KK
+#endif
+
+#ifdef LN
+ subl KK, 4, KK
+#endif
+
+ ldi I, -1(I)
+
+ bgt I, $L51
+ .align 4
+
+$L60:
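+/* M & 2: 2x2 block */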
+ and M, 2, I
+ ble I, $L70
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(B)
+ ldi L, -2(KK)
+ LD b2, 1 * SIZE(B)
+ ldi AO, 2 * SIZE(AO)
+
+ LD b3, 2 * SIZE(B)
+ LD b4, 3 * SIZE(B)
+ ldi BO, 2 * SIZE(B)
+
+ ble KK, $L68
+
+ ble L, $L65
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 1, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(BO)
+ ldi L, -2(TMP1)
+ LD b2, 1 * SIZE(BO)
+ ldi AO, 2 * SIZE(AO)
+
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+ ldi BO, 2 * SIZE(BO)
+
+ ble TMP1, $L68
+
+ ble L, $L65
+#endif
+ .align 4
+
+$L62:
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b1, b5
+ fmov b5, t1
+ unop
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ ldi AO, 4 * SIZE(AO)
+ MUL a2, b1, b5
+ fmov b5, t2
+ LD b1, 2 * SIZE(BO)
+
+ ADD c05, t3, b5
+ fmov b5, c05
+ ldi L, -2(L)
+ MUL a1, b2, b5
+ fmov b5, t3
+ LD a1, -2 * SIZE(AO)
+
+ ADD c06, t4, b5
+ fmov b5, c06
+ unop
+ MUL a2, b2, b5
+ fmov b5, t4
+ LD a2, -1 * SIZE(AO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ LD b2, 3 * SIZE(BO)
+ MUL a3, b3, b5
+ fmov b5, t1
+ ldi BO, 4 * SIZE(BO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a4, b3, b5
+ fmov b5, t2
+ LD b3, 0 * SIZE(BO)
+
+ ADD c05, t3, b5
+ fmov b5, c05
+ unop
+ MUL a3, b4, b5
+ fmov b5, t3
+ LD a3, 0 * SIZE(AO)
+
+ ADD c06, t4, b5
+ fmov b5, c06
+ MUL a4, b4, b5
+ fmov b5, t4
+ LD b4, 1 * SIZE(BO)
+ unop
+
+ LD a4, 1 * SIZE(AO)
+ unop
+ unop
+ bgt L, $L62
+ .align 4
+
+$L65:
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL a1, b1, b5
+ fmov b5, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L67
+#else
+ blbs TMP1, $L67
+#endif
+ .align 4
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b1, b5
+ fmov b5, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD c05, t3, b5
+ fmov b5, c05
+ ldi BO, 2 * SIZE(BO)
+ MUL a1, b2, b5
+ fmov b5, t3
+ LD a1, 0 * SIZE(AO)
+
+ ADD c06, t4, b5
+ fmov b5, c06
+ unop
+ MUL a2, b2, b5
+ fmov b5, t4
+ LD a2, 1 * SIZE(AO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ LD b2, -1 * SIZE(BO)
+ MUL a1, b1, b5
+ fmov b5, t1
+ ldi AO, 2 * SIZE(AO)
+ .align 4
+
+$L67:
+ ADD c02, t2, b5
+ fmov b5, c02
+ MUL a2, b1, b5
+ fmov b5, t2
+ ADD c05, t3, b5
+ fmov b5, c05
+ MUL a1, b2, b5
+ fmov b5, t3
+
+ ADD c06, t4, b5
+ fmov b5, c06
+ ldi AO, 2 * SIZE(AO)
+ MUL a2, b2, b5
+ fmov b5, t4
+ ldi BO, 2 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ ADD c02, t2, b5
+ fmov b5, c02
+ ADD c05, t3, b5
+ fmov b5, c05
+ ADD c06, t4, b5
+ fmov b5, c06
+ .align 4
+
+$L68:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 2, TMP1
+#else
+ subl KK, 2, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -2 * SIZE(AO)
+ ldi BO, -2 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c05, b5
+ fmov b5, c05
+ SUB a3, c02, b5
+ fmov b5, c02
+ SUB a4, c06, b5
+ fmov b5, c06
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+ SUB a3, c05, b5
+ fmov b5, c05
+ SUB a4, c06, b5
+ fmov b5, c06
+#endif
+
+#ifdef LN
+ LD a1, 3 * SIZE(AO)
+ LD a2, 2 * SIZE(AO)
+ LD a3, 0 * SIZE(AO)
+
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a1, c06, b5
+ fmov b5, c06
+
+ MUL a2, c02, b5
+ fmov b5, t1
+ MUL a2, c06, b5
+ fmov b5, t2
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c05, t2, b5
+ fmov b5, c05
+
+ MUL a3, c01, b5
+ fmov b5, c01
+ MUL a3, c05, b5
+ fmov b5, c05
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 3 * SIZE(AO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c05, b5
+ fmov b5, c05
+
+ MUL a2, c01, b5
+ fmov b5, t1
+ MUL a2, c05, b5
+ fmov b5, t2
+
+ SUB c02, t1, b5
+ fmov b5, c02
+ SUB c06, t2, b5
+ fmov b5, c06
+
+ MUL a3, c02, b5
+ fmov b5, c02
+ MUL a3, c06, b5
+ fmov b5, c06
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 3 * SIZE(BO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+
+ MUL a2, c01, b5
+ fmov b5, t1
+ MUL a2, c02, b5
+ fmov b5, t2
+
+ SUB c05, t1, b5
+ fmov b5, c05
+ SUB c06, t2, b5
+ fmov b5, c06
+
+ MUL a3, c05, b5
+ fmov b5, c05
+ MUL a3, c06, b5
+ fmov b5, c06
+#endif
+
+#ifdef RT
+ LD a1, 3 * SIZE(BO)
+ LD a2, 2 * SIZE(BO)
+ LD a3, 0 * SIZE(BO)
+
+ MUL a1, c05, b5
+ fmov b5, c05
+ MUL a1, c06, b5
+ fmov b5, c06
+
+ MUL a2, c05, b5
+ fmov b5, t1
+ MUL a2, c06, b5
+ fmov b5, t2
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c02, t2, b5
+ fmov b5, c02
+
+ MUL a3, c01, b5
+ fmov b5, c01
+ MUL a3, c02, b5
+ fmov b5, c02
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c05, 1 * SIZE(BO)
+ ST c02, 2 * SIZE(BO)
+ ST c06, 3 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c05, 2 * SIZE(AO)
+ ST c06, 3 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -2 * SIZE(C1)
+ ldi C2, -2 * SIZE(C2)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c05, 0 * SIZE(C2)
+ ST c06, 1 * SIZE(C2)
+
+#ifndef LN
+ ldi C1, 2 * SIZE(C1)
+ ldi C2, 2 * SIZE(C2)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 1 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 2, KK
+#endif
+
+#ifdef LN
+ subl KK, 2, KK
+#endif
+ .align 4
+
+$L70:
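+/* M & 1: 1x2 block */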
+ and M, 1, I
+ ble I, $L79
+
+#if defined(LT) || defined(RN)
+
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+
+ LD b1, 0 * SIZE(B)
+ fclr c02
+ LD b2, 1 * SIZE(B)
+ fclr c06
+
+ ldi L, -2(KK)
+
+ LD b3, 2 * SIZE(B)
+ ldi AO, 1 * SIZE(AO)
+ LD b4, 3 * SIZE(B)
+ ldi BO, 2 * SIZE(B)
+
+ ble KK, $L78
+
+ ble L, $L75
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 0, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+
+ LD b1, 0 * SIZE(BO)
+ fclr c02
+ LD b2, 1 * SIZE(BO)
+ fclr c06
+
+ ldi L, -2(TMP1)
+
+ LD b3, 2 * SIZE(BO)
+ ldi AO, 1 * SIZE(AO)
+ LD b4, 3 * SIZE(BO)
+ ldi BO, 2 * SIZE(BO)
+
+ ble TMP1, $L78
+
+ ble L, $L75
+#endif
+ .align 4
+
+$L72:
+ ADD c01, t1, b5
+ fmov b5, c01
+ ldi L, -2(L)
+ MUL a1, b1, b5
+ fmov b5, t1
+ LD b1, 2 * SIZE(BO)
+
+ ADD c05, t2, b5
+ fmov b5, c05
+ MUL a1, b2, b5
+ fmov b5, t2
+ LD a1, 1 * SIZE(AO)
+ LD b2, 3 * SIZE(BO)
+
+ ADD c02, t3, b5
+ fmov b5, c02
+ ldi AO, 2 * SIZE(AO)
+ MUL a2, b3, b5
+ fmov b5, t3
+ LD b3, 4 * SIZE(BO)
+
+ ADD c06, t4, b5
+ fmov b5, c06
+ MUL a2, b4, b5
+ fmov b5, t4
+ LD a2, 0 * SIZE(AO)
+ LD b4, 5 * SIZE(BO)
+
+ ldi BO, 4 * SIZE(BO)
+ unop
+ unop
+ bgt L, $L72
+ .align 4
+
+$L75:
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL a1, b1, b5
+ fmov b5, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L77
+#else
+ blbs TMP1, $L77
+#endif
+ .align 4
+
+ ADD c05, t2, b5
+ fmov b5, c05
+ MUL a1, b2, b5
+ fmov b5, t2
+ LD a1, 0 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ LD b2, 1 * SIZE(BO)
+ ldi AO, 1 * SIZE(AO)
+ MUL a1, b1, b5
+ fmov b5, t1
+ ldi BO, 2 * SIZE(BO)
+ .align 4
+
+$L77:
+ ADD c05, t2, b5
+ fmov b5, c05
+ MUL a1, b2, b5
+ fmov b5, t2
+ ADD c02, t3, b5
+ fmov b5, c02
+ ADD c06, t4, b5
+ fmov b5, c06
+
+ ADD c01, c02, b5
+ fmov b5, c01
+ ldi AO, 1 * SIZE(AO)
+ ADD c05, c06, b5
+ fmov b5, c05
+ ldi BO, 2 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ ADD c05, t2, b5
+ fmov b5, c05
+
+ .align 4
+
+$L78:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 1, TMP1
+#else
+ subl KK, 2, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -1 * SIZE(AO)
+ ldi BO, -2 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c05, b5
+ fmov b5, c05
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c05, b5
+ fmov b5, c05
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(AO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c05, b5
+ fmov b5, c05
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 3 * SIZE(BO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a2, c01, b5
+ fmov b5, t1
+ SUB c05, t1, b5
+ fmov b5, c05
+ MUL a3, c05, b5
+ fmov b5, c05
+#endif
+
+#ifdef RT
+ LD a1, 3 * SIZE(BO)
+ LD a2, 2 * SIZE(BO)
+ LD a3, 0 * SIZE(BO)
+
+ MUL a1, c05, b5
+ fmov b5, c05
+ MUL a2, c05, b5
+ fmov b5, t1
+ SUB c01, t1, b5
+ fmov b5, c01
+ MUL a3, c01, b5
+ fmov b5, c01
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c05, 1 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c05, 1 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -1 * SIZE(C1)
+ ldi C2, -1 * SIZE(C2)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c05, 0 * SIZE(C2)
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 0 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 1, KK
+#endif
+
+#ifdef LN
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L79:
+#ifdef LN
+ sll K, 1 + BASE_SHIFT, TMP1
+ addl B, TMP1, B
+#endif
+
+#if defined(LT) || defined(RN)
+ mov BO, B
+#endif
+
+#ifdef RN
+ addl KK, 2, KK
+#endif
+
+#ifdef RT
+ subl KK, 2, KK
+#endif
+ .align 4
+
+$L80:
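+/* main loop: N/4 blocks of four columns (C1..C4) */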
+ sra N, 2, J
+ ble J, $L999
+ .align 4
+
+$L01:
+#ifdef RT
+ sll K, 2 + BASE_SHIFT, TMP1
+ subl B, TMP1, B
+
+ s4addl LDC, 0, TMP1
+ subl C, TMP1, C
+#endif
+
+ mov C, C1
+ addl C, LDC, C2
+ addl C2, LDC, C3
+#ifndef RT
+ s4addl LDC, C, C
+#endif
+
+ fclr t1
+ addl C3, LDC, C4
+ fclr t2
+
+#ifdef LN
+ addl M, OFFSET, KK
+#endif
+
+#ifdef LT
+ mov OFFSET, KK
+#endif
+
+#if defined(LN) || defined(RT)
+ mov A, AORIG
+#else
+ mov A, AO
+#endif
+
+ sra M, 2, I
+ fclr t3
+ fclr t4
+ ble I, $L20
+ .align 4
+
+$L11:
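+/* 4x4 block: sixteen accumulators c01..c16, with software prefetch via fillcs */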
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c11
+ LD a2, 1 * SIZE(AO)
+ fclr c12
+
+ LD a3, 2 * SIZE(AO)
+ fclr c16
+ LD a4, 3 * SIZE(AO)
+ fclr c15
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c02
+
+ LD b3, 2 * SIZE(B)
+ fclr c06
+ LD b4, 3 * SIZE(B)
+ fclr c05
+
+ fillcs 4 * SIZE(C1)
+ fclr c03
+ ldi L, -2(KK)
+ fclr c04
+
+ fillcs 7 * SIZE(C2)
+ flds $f31, 7 * SIZE(C2)
+ fclr c08
+ ldi BO, 4 * SIZE(B)
+ fclr c13
+
+ fillcs 4 * SIZE(C3)
+ fclr c09
+ ldi AO, 4 * SIZE(AO)
+ fclr c10
+
+ fillcs 7 * SIZE(C4)
+ fclr c14
+ fclr c07
+ ble KK, $L18
+#else
+
+#ifdef LN
+ sll K, BASE_SHIFT + 2, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 2, TMP1
+ addl AORIG, TMP1, AO
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c11
+ LD a2, 1 * SIZE(AO)
+ fclr c12
+
+ LD a3, 2 * SIZE(AO)
+ fclr c16
+ LD a4, 3 * SIZE(AO)
+ fclr c15
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c02
+
+ LD b3, 2 * SIZE(BO)
+ fclr c06
+ LD b4, 3 * SIZE(BO)
+ fclr c05
+
+ fillcs 4 * SIZE(C1)
+ fclr c03
+ ldi L, -2(TMP1)
+ fclr c04
+
+ fillcs 7 * SIZE(C2)
+ fclr c08
+ ldi BO, 4 * SIZE(BO)
+ fclr c13
+
+ fillcs 4 * SIZE(C3)
+ fclr c09
+ ldi AO, 4 * SIZE(AO)
+ fclr c10
+
+ fillcs 7 * SIZE(C4)
+ fclr c14
+ fclr c07
+ ble TMP1, $L18
+#endif
+
+ ble L, $L15
+ .align 5
+
+$L12:
+/* 1 */
+ ADD c11, t1, b5
+ fmov b5, c11
+#ifndef EV4
+ fillcs PREFETCHSIZE * SIZE(AO)
+#else
+ unop
+#endif
+ MUL b1, a1, b5
+ fmov b5, t1
+#ifndef EV4
+ fillcs PREFETCHSIZE * SIZE(BO)
+#else
+ unop
+#endif
+
+ ADD c12, t2, b5
+ fmov b5, c12
+ unop
+ MUL b1, a2, t2
+ unop
+
+ ADD c16, t3, b5
+ fmov b5, c16
+ unop
+ MUL b2, a2, t3
+ LD a5, 0 * SIZE(AO)
+
+ ADD c15, t4, b5
+ fmov b5, c15
+ unop
+ MUL b2, a1, t4
+ LD b5, 0 * SIZE(BO)
+ FIMOVD b5, tmp
+
+/* 2 */
+ ADD c01, t1, b5
+ fmov b5, c01
+ UNOP
+ MUL b1, a3, t1
+ UNOP
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ UNOP
+ MUL b1, a4, t2
+ UNOP
+
+ ADD c06, t3, b5
+ fmov b5, c06
+ unop
+ MUL b2, a4, t3
+ unop
+
+ ADD c05, t4, b5
+ fmov b5, c05
+ unop
+ MUL b4, a1, t4
+ unop
+
+/* 3 */
+ ADD c03, t1, b5
+ fmov b5, c03
+ unop
+ MUL b3, a1, t1
+ unop
+
+ ADD c04, t2, b5
+ fmov b5, c04
+ unop
+ MUL b3, a2, t2
+ unop
+
+ ADD c08, t3, b5
+ fmov b5, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, 1 * SIZE(AO)
+
+ ADD c13, t4, b5
+ fmov b5, c13
+ unop
+ MUL b2, a3, t4
+ LD b2, 1 * SIZE(BO)
+
+/* 4 */
+ ADD c09, t1, b5
+ fmov b5, c09
+ unop
+ MUL b3, a3, t1
+ LD a6, 2 * SIZE(AO)
+
+ ADD c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD c14, t3, b5
+ fmov b5, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, 3 * SIZE(AO)
+
+ ADD c07, t4, b5
+ fmov b5, c07
+ unop
+ MUL b4, a3, t4
+ LD b4, 3 * SIZE(BO)
+
+/* 5 */
+ ADD c11, t1, b5
+ fmov b5, c11
+ unop
+ IFMOVD tmp, b5
+ MUL b5, a5, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD c12, t2, b5
+ fmov b5, c12
+ ldi L, -2(L)
+ IFMOVD tmp, b5
+ MUL b5, a2, t2
+ LD b1, 4 * SIZE(BO)
+
+ ADD c16, t3, b5
+ fmov b5, c16
+ unop
+ MUL b2, a2, t3
+ unop
+
+ ADD c15, t4, b5
+ fmov b5, c15
+ unop
+ MUL b2, a5, t4
+ unop
+
+/* 6 */
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ IFMOVD tmp, b5
+ MUL b5, a6, t1
+ unop
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ IFMOVD tmp, b5
+ MUL b5, a4, t2
+ unop
+
+ ADD c06, t3, b5
+ fmov b5, c06
+ unop
+ MUL b2, a4, t3
+ unop
+
+ ADD c05, t4, b5
+ fmov b5, c05
+ unop
+ MUL b4, a5, t4
+ unop
+
+/* 7 */
+ ADD c03, t1, b5
+ fmov b5, c03
+ ldi AO, 8 * SIZE(AO)
+ MUL b3, a5, t1
+ unop
+
+ ADD c04, t2, b5
+ fmov b5, c04
+ ldi BO, 8 * SIZE(BO)
+ MUL b3, a2, t2
+ unop
+
+ ADD c08, t3, b5
+ fmov b5, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, -3 * SIZE(AO)
+
+ ADD c13, t4, b5
+ fmov b5, c13
+ unop
+ MUL b2, a6, t4
+ LD b2, -3 * SIZE(BO)
+
+/* 8 */
+ ADD c09, t1, b5
+ fmov b5, c09
+ unop
+ MUL b3, a6, t1
+ LD a3, -2 * SIZE(AO)
+
+ ADD c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, -2 * SIZE(BO)
+
+ ADD c14, t3, b5
+ fmov b5, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, -1 * SIZE(AO)
+
+ ADD c07, t4, b5
+ fmov b5, c07
+ MUL b4, a6, t4
+ LD b4, -1 * SIZE(BO)
+ bgt L, $L12
+ .align 4
+
+$L15:
+ ADD c11, t1, b5
+ fmov b5, c11
+ MUL b1, a1, b5
+ fmov b5, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L17
+#else
+ blbs TMP1, $L17
+#endif
+ .align 4
+
+ ADD c12, t2, b5
+ fmov b5, c12
+ MUL b1, a2, b5
+ fmov b5, t2
+ ADD c16, t3, b5
+ fmov b5, c16
+ MUL b2, a2, b5
+ fmov b5, t3
+
+ ADD c15, t4, b5
+ fmov b5, c15
+ MUL b2, a1, b5
+ fmov b5, t4
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL b1, a3, b5
+ fmov b5, t1
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL b1, a4, b5
+ fmov b5, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD c06, t3, b5
+ fmov b5, c06
+ MUL b2, a4, b5
+ fmov b5, t3
+ ADD c05, t4, b5
+ fmov b5, c05
+ MUL b4, a1, b5
+ fmov b5, t4
+
+ ADD c03, t1, b5
+ fmov b5, c03
+ unop
+ MUL b3, a1, b5
+ fmov b5, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD c04, t2, b5
+ fmov b5, c04
+ unop
+ MUL b3, a2, b5
+ fmov b5, t2
+ unop
+
+ ADD c08, t3, b5
+ fmov b5, c08
+ unop
+ MUL b4, a2, b5
+ fmov b5, t3
+ LD a2, 1 * SIZE(AO)
+
+ ADD c13, t4, b5
+ fmov b5, c13
+ unop
+ MUL b2, a3, b5
+ fmov b5, t4
+ LD b2, 1 * SIZE(BO)
+
+ ADD c09, t1, b5
+ fmov b5, c09
+ unop
+ MUL b3, a3, b5
+ fmov b5, t1
+ ldi AO, 4 * SIZE(AO)
+
+ ADD c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL b3, a4, b5
+ fmov b5, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD c14, t3, b5
+ fmov b5, c14
+ unop
+ MUL b4, a4, b5
+ fmov b5, t3
+ LD a4, -1 * SIZE(AO)
+
+ ADD c07, t4, b5
+ fmov b5, c07
+ unop
+ MUL b4, a3, b5
+ fmov b5, t4
+ LD a3, -2 * SIZE(AO)
+
+ ADD c11, t1, b5
+ fmov b5, c11
+ LD b4, 3 * SIZE(BO)
+ MUL b1, a1, b5
+ fmov b5, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L17:
+ ADD c12, t2, b5
+ fmov b5, c12
+ MUL b1, a2, b5
+ fmov b5, t2
+ ADD c16, t3, b5
+ fmov b5, c16
+ MUL b2, a2, b5
+ fmov b5, t3
+
+ ADD c15, t4, b5
+ fmov b5, c15
+ MUL b2, a1, b5
+ fmov b5, t4
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL b1, a3, b5
+ fmov b5, t1
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ MUL b1, a4, b5
+ fmov b5, t2
+ ADD c06, t3, b5
+ fmov b5, c06
+ MUL b2, a4, b5
+ fmov b5, t3
+
+ ADD c05, t4, b5
+ fmov b5, c05
+ MUL b4, a1, b5
+ fmov b5, t4
+ ADD c03, t1, b5
+ fmov b5, c03
+ MUL b3, a1, b5
+ fmov b5, t1
+
+ ADD c04, t2, b5
+ fmov b5, c04
+ MUL b3, a2, b5
+ fmov b5, t2
+ ADD c08, t3, b5
+ fmov b5, c08
+ MUL b4, a2, b5
+ fmov b5, t3
+
+ ADD c13, t4, b5
+ fmov b5, c13
+ MUL b2, a3, b5
+ fmov b5, t4
+ ADD c09, t1, b5
+ fmov b5, c09
+ MUL b3, a3, b5
+ fmov b5, t1
+
+ ADD c10, t2, b5
+ fmov b5, c10
+ MUL b3, a4, b5
+ fmov b5, t2
+ ADD c14, t3, b5
+ fmov b5, c14
+ MUL b4, a4, b5
+ fmov b5, t3
+
+ ADD c07, t4, b5
+ fmov b5, c07
+ ldi AO, 4 * SIZE(AO)
+ MUL b4, a3, b5
+ fmov b5, t4
+ ldi BO, 4 * SIZE(BO)
+
+ ADD c11, t1, b5
+ fmov b5, c11
+ ADD c12, t2, b5
+ fmov b5, c12
+ ADD c16, t3, b5
+ fmov b5, c16
+ ADD c15, t4, b5
+ fmov b5, c15
+ .align 4
+
+$L18:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 4, TMP1
+#else
+ subl KK, 4, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -4 * SIZE(AO)
+ ldi BO, -4 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ LD b1, 4 * SIZE(BO)
+ LD b2, 5 * SIZE(BO)
+ LD b3, 6 * SIZE(BO)
+ LD b4, 7 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c05, b5
+ fmov b5, c05
+ SUB a3, c09, b5
+ fmov b5, c09
+ SUB a4, c13, b5
+ fmov b5, c13
+
+ SUB b1, c02, b5
+ fmov b5, c02
+ SUB b2, c06, b5
+ fmov b5, c06
+ SUB b3, c10, b5
+ fmov b5, c10
+ SUB b4, c14, b5
+ fmov b5, c14
+
+ LD a1, 8 * SIZE(BO)
+ LD a2, 9 * SIZE(BO)
+ LD a3, 10 * SIZE(BO)
+ LD a4, 11 * SIZE(BO)
+
+ LD b1, 12 * SIZE(BO)
+ LD b2, 13 * SIZE(BO)
+ LD b3, 14 * SIZE(BO)
+ LD b4, 15 * SIZE(BO)
+
+ SUB a1, c03, b5
+ fmov b5, c03
+ SUB a2, c07, b5
+ fmov b5, c07
+ SUB a3, c11, b5
+ fmov b5, c11
+ SUB a4, c15, b5
+ fmov b5, c15
+
+ SUB b1, c04, b5
+ fmov b5, c04
+ SUB b2, c08, b5
+ fmov b5, c08
+ SUB b3, c12, b5
+ fmov b5, c12
+ SUB b4, c16, b5
+ fmov b5, c16
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 4 * SIZE(AO)
+ LD b2, 5 * SIZE(AO)
+ LD b3, 6 * SIZE(AO)
+ LD b4, 7 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+ SUB a3, c03, b5
+ fmov b5, c03
+ SUB a4, c04, b5
+ fmov b5, c04
+
+ SUB b1, c05, b5
+ fmov b5, c05
+ SUB b2, c06, b5
+ fmov b5, c06
+ SUB b3, c07, b5
+ fmov b5, c07
+ SUB b4, c08, b5
+ fmov b5, c08
+
+ LD a1, 8 * SIZE(AO)
+ LD a2, 9 * SIZE(AO)
+ LD a3, 10 * SIZE(AO)
+ LD a4, 11 * SIZE(AO)
+
+ LD b1, 12 * SIZE(AO)
+ LD b2, 13 * SIZE(AO)
+ LD b3, 14 * SIZE(AO)
+ LD b4, 15 * SIZE(AO)
+
+ SUB a1, c09, b5
+ fmov b5, c09
+ SUB a2, c10, b5
+ fmov b5, c10
+ SUB a3, c11, b5
+ fmov b5, c11
+ SUB a4, c12, b5
+ fmov b5, c12
+
+ SUB b1, c13, b5
+ fmov b5, c13
+ SUB b2, c14, b5
+ fmov b5, c14
+ SUB b3, c15, b5
+ fmov b5, c15
+ SUB b4, c16, b5
+ fmov b5, c16
+#endif
+
+#ifdef LN
+ LD a1, 15 * SIZE(AO)
+ LD a2, 14 * SIZE(AO)
+ LD a3, 13 * SIZE(AO)
+ LD a4, 12 * SIZE(AO)
+
+ MUL a1, c04, b5
+ fmov b5, c04
+ MUL a1, c08, b5
+ fmov b5, c08
+ MUL a1, c12, b5
+ fmov b5, c12
+ MUL a1, c16, b5
+ fmov b5, c16
+
+ MUL a2, c04, b5
+ fmov b5, t1
+ MUL a2, c08, b5
+ fmov b5, t2
+ MUL a2, c12, b5
+ fmov b5, t3
+ MUL a2, c16, b5
+ fmov b5, t4
+
+ SUB c03, t1, b5
+ fmov b5, c03
+ SUB c07, t2, b5
+ fmov b5, c07
+ SUB c11, t3, b5
+ fmov b5, c11
+ SUB c15, t4, b5
+ fmov b5, c15
+
+ MUL a3, c04, b5
+ fmov b5, t1
+ MUL a3, c08, b5
+ fmov b5, t2
+ MUL a3, c12, b5
+ fmov b5, t3
+ MUL a3, c16, b5
+ fmov b5, t4
+
+ SUB c02, t1, b5
+ fmov b5, c02
+ SUB c06, t2, b5
+ fmov b5, c06
+ SUB c10, t3, b5
+ fmov b5, c10
+ SUB c14, t4, b5
+ fmov b5, c14
+
+ MUL a4, c04, b5
+ fmov b5, t1
+ MUL a4, c08, b5
+ fmov b5, t2
+ MUL a4, c12, b5
+ fmov b5, t3
+ MUL a4, c16, b5
+ fmov b5, t4
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c05, t2, b5
+ fmov b5, c05
+ SUB c09, t3, b5
+ fmov b5, c09
+ SUB c13, t4, b5
+ fmov b5, c13
+
+ LD b1, 10 * SIZE(AO)
+ LD b2, 9 * SIZE(AO)
+ LD b3, 8 * SIZE(AO)
+
+ MUL b1, c03, b5
+ fmov b5, c03
+ MUL b1, c07, b5
+ fmov b5, c07
+ MUL b1, c11, b5
+ fmov b5, c11
+ MUL b1, c15, b5
+ fmov b5, c15
+
+ MUL b2, c03, b5
+ fmov b5, t1
+ MUL b2, c07, b5
+ fmov b5, t2
+ MUL b2, c11, b5
+ fmov b5, t3
+ MUL b2, c15, b5
+ fmov b5, t4
+
+ SUB c02, t1, b5
+ fmov b5, c02
+ SUB c06, t2, b5
+ fmov b5, c06
+ SUB c10, t3, b5
+ fmov b5, c10
+ SUB c14, t4, b5
+ fmov b5, c14
+
+ MUL b3, c03, b5
+ fmov b5, t1
+ MUL b3, c07, b5
+ fmov b5, t2
+ MUL b3, c11, b5
+ fmov b5, t3
+ MUL b3, c15, b5
+ fmov b5, t4
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c05, t2, b5
+ fmov b5, c05
+ SUB c09, t3, b5
+ fmov b5, c09
+ SUB c13, t4, b5
+ fmov b5, c13
+
+ LD a1, 5 * SIZE(AO)
+ LD a2, 4 * SIZE(AO)
+ LD a3, 0 * SIZE(AO)
+
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a1, c06, b5
+ fmov b5, c06
+ MUL a1, c10, b5
+ fmov b5, c10
+ MUL a1, c14, b5
+ fmov b5, c14
+
+ MUL a2, c02, b5
+ fmov b5, t1
+ MUL a2, c06, b5
+ fmov b5, t2
+ MUL a2, c10, b5
+ fmov b5, t3
+ MUL a2, c14, b5
+ fmov b5, t4
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c05, t2, b5
+ fmov b5, c05
+ SUB c09, t3, b5
+ fmov b5, c09
+ SUB c13, t4, b5
+ fmov b5, c13
+
+ MUL a3, c01, b5
+ fmov b5, c01
+ MUL a3, c05, b5
+ fmov b5, c05
+ MUL a3, c09, b5
+ fmov b5, c09
+ MUL a3, c13, b5
+ fmov b5, c13
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c05, b5
+ fmov b5, c05
+ MUL a1, c09, b5
+ fmov b5, c09
+ MUL a1, c13, b5
+ fmov b5, c13
+
+ MUL a2, c01, b5
+ fmov b5, t1
+ MUL a2, c05, b5
+ fmov b5, t2
+ MUL a2, c09, b5
+ fmov b5, t3
+ MUL a2, c13, b5
+ fmov b5, t4
+
+ SUB c02, t1, b5
+ fmov b5, c02
+ SUB c06, t2, b5
+ fmov b5, c06
+ SUB c10, t3, b5
+ fmov b5, c10
+ SUB c14, t4, b5
+ fmov b5, c14
+
+ MUL a3, c01, b5
+ fmov b5, t1
+ MUL a3, c05, b5
+ fmov b5, t2
+ MUL a3, c09, b5
+ fmov b5, t3
+ MUL a3, c13, b5
+ fmov b5, t4
+
+ SUB c03, t1, b5
+ fmov b5, c03
+ SUB c07, t2, b5
+ fmov b5, c07
+ SUB c11, t3, b5
+ fmov b5, c11
+ SUB c15, t4, b5
+ fmov b5, c15
+
+ MUL a4, c01, b5
+ fmov b5, t1
+ MUL a4, c05, b5
+ fmov b5, t2
+ MUL a4, c09, b5
+ fmov b5, t3
+ MUL a4, c13, b5
+ fmov b5, t4
+
+ SUB c04, t1, b5
+ fmov b5, c04
+ SUB c08, t2, b5
+ fmov b5, c08
+ SUB c12, t3, b5
+ fmov b5, c12
+ SUB c16, t4, b5
+ fmov b5, c16
+
+ LD b1, 5 * SIZE(AO)
+ LD b2, 6 * SIZE(AO)
+ LD b3, 7 * SIZE(AO)
+
+ MUL b1, c02, b5
+ fmov b5, c02
+ MUL b1, c06, b5
+ fmov b5, c06
+ MUL b1, c10, b5
+ fmov b5, c10
+ MUL b1, c14, b5
+ fmov b5, c14
+
+ MUL b2, c02, b5
+ fmov b5, t1
+ MUL b2, c06, b5
+ fmov b5, t2
+ MUL b2, c10, b5
+ fmov b5, t3
+ MUL b2, c14, b5
+ fmov b5, t4
+
+ SUB c03, t1, b5
+ fmov b5, c03
+ SUB c07, t2, b5
+ fmov b5, c07
+ SUB c11, t3, b5
+ fmov b5, c11
+ SUB c15, t4, b5
+ fmov b5, c15
+
+ MUL b3, c02, b5
+ fmov b5, t1
+ MUL b3, c06, b5
+ fmov b5, t2
+ MUL b3, c10, b5
+ fmov b5, t3
+ MUL b3, c14, b5
+ fmov b5, t4
+
+ SUB c04, t1, b5
+ fmov b5, c04
+ SUB c08, t2, b5
+ fmov b5, c08
+ SUB c12, t3, b5
+ fmov b5, c12
+ SUB c16, t4, b5
+ fmov b5, c16
+
+ LD a1, 10 * SIZE(AO)
+ LD a2, 11 * SIZE(AO)
+ LD a3, 15 * SIZE(AO)
+
+ MUL a1, c03, b5
+ fmov b5, c03
+ MUL a1, c07, b5
+ fmov b5, c07
+ MUL a1, c11, b5
+ fmov b5, c11
+ MUL a1, c15, b5
+ fmov b5, c15
+
+ MUL a2, c03, b5
+ fmov b5, t1
+ MUL a2, c07, b5
+ fmov b5, t2
+ MUL a2, c11, b5
+ fmov b5, t3
+ MUL a2, c15, b5
+ fmov b5, t4
+
+ SUB c04, t1, b5
+ fmov b5, c04
+ SUB c08, t2, b5
+ fmov b5, c08
+ SUB c12, t3, b5
+ fmov b5, c12
+ SUB c16, t4, b5
+ fmov b5, c16
+
+ MUL a3, c04, b5
+ fmov b5, c04
+ MUL a3, c08, b5
+ fmov b5, c08
+ MUL a3, c12, b5
+ fmov b5, c12
+ MUL a3, c16, b5
+ fmov b5, c16
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a1, c03, b5
+ fmov b5, c03
+ MUL a1, c04, b5
+ fmov b5, c04
+
+ MUL a2, c01, b5
+ fmov b5, t1
+ MUL a2, c02, b5
+ fmov b5, t2
+ MUL a2, c03, b5
+ fmov b5, t3
+ MUL a2, c04, b5
+ fmov b5, t4
+
+ SUB c05, t1, b5
+ fmov b5, c05
+ SUB c06, t2, b5
+ fmov b5, c06
+ SUB c07, t3, b5
+ fmov b5, c07
+ SUB c08, t4, b5
+ fmov b5, c08
+
+ MUL a3, c01, b5
+ fmov b5, t1
+ MUL a3, c02, b5
+ fmov b5, t2
+ MUL a3, c03, b5
+ fmov b5, t3
+ MUL a3, c04, b5
+ fmov b5, t4
+
+ SUB c09, t1, b5
+ fmov b5, c09
+ SUB c10, t2, b5
+ fmov b5, c10
+ SUB c11, t3, b5
+ fmov b5, c11
+ SUB c12, t4, b5
+ fmov b5, c12
+
+ MUL a4, c01, b5
+ fmov b5, t1
+ MUL a4, c02, b5
+ fmov b5, t2
+ MUL a4, c03, b5
+ fmov b5, t3
+ MUL a4, c04, b5
+ fmov b5, t4
+
+ SUB c13, t1, b5
+ fmov b5, c13
+ SUB c14, t2, b5
+ fmov b5, c14
+ SUB c15, t3, b5
+ fmov b5, c15
+ SUB c16, t4, b5
+ fmov b5, c16
+
+ LD b1, 5 * SIZE(BO)
+ LD b2, 6 * SIZE(BO)
+ LD b3, 7 * SIZE(BO)
+
+ MUL b1, c05, b5
+ fmov b5, c05
+ MUL b1, c06, b5
+ fmov b5, c06
+ MUL b1, c07, b5
+ fmov b5, c07
+ MUL b1, c08, b5
+ fmov b5, c08
+
+ MUL b2, c05, b5
+ fmov b5, t1
+ MUL b2, c06, b5
+ fmov b5, t2
+ MUL b2, c07, b5
+ fmov b5, t3
+ MUL b2, c08, b5
+ fmov b5, t4
+
+ SUB c09, t1, b5
+ fmov b5, c09
+ SUB c10, t2, b5
+ fmov b5, c10
+ SUB c11, t3, b5
+ fmov b5, c11
+ SUB c12, t4, b5
+ fmov b5, c12
+
+ MUL b3, c05, b5
+ fmov b5, t1
+ MUL b3, c06, b5
+ fmov b5, t2
+ MUL b3, c07, b5
+ fmov b5, t3
+ MUL b3, c08, b5
+ fmov b5, t4
+
+ SUB c13, t1, b5
+ fmov b5, c13
+ SUB c14, t2, b5
+ fmov b5, c14
+ SUB c15, t3, b5
+ fmov b5, c15
+ SUB c16, t4, b5
+ fmov b5, c16
+
+ LD a1, 10 * SIZE(BO)
+ LD a2, 11 * SIZE(BO)
+ LD a3, 15 * SIZE(BO)
+
+ MUL a1, c09, b5
+ fmov b5, c09
+ MUL a1, c10, b5
+ fmov b5, c10
+ MUL a1, c11, b5
+ fmov b5, c11
+ MUL a1, c12, b5
+ fmov b5, c12
+
+ MUL a2, c09, b5
+ fmov b5, t1
+ MUL a2, c10, b5
+ fmov b5, t2
+ MUL a2, c11, b5
+ fmov b5, t3
+ MUL a2, c12, b5
+ fmov b5, t4
+
+ SUB c13, t1, b5
+ fmov b5, c13
+ SUB c14, t2, b5
+ fmov b5, c14
+ SUB c15, t3, b5
+ fmov b5, c15
+ SUB c16, t4, b5
+ fmov b5, c16
+
+ MUL a3, c13, b5
+ fmov b5, c13
+ MUL a3, c14, b5
+ fmov b5, c14
+ MUL a3, c15, b5
+ fmov b5, c15
+ MUL a3, c16, b5
+ fmov b5, c16
+#endif
+
+#ifdef RT
+ LD a1, 15 * SIZE(BO)
+ LD a2, 14 * SIZE(BO)
+ LD a3, 13 * SIZE(BO)
+ LD a4, 12 * SIZE(BO)
+
+ MUL a1, c13, b5
+ fmov b5, c13
+ MUL a1, c14, b5
+ fmov b5, c14
+ MUL a1, c15, b5
+ fmov b5, c15
+ MUL a1, c16, b5
+ fmov b5, c16
+
+ MUL a2, c13, b5
+ fmov b5, t1
+ MUL a2, c14, b5
+ fmov b5, t2
+ MUL a2, c15, b5
+ fmov b5, t3
+ MUL a2, c16, b5
+ fmov b5, t4
+
+ SUB c09, t1, b5
+ fmov b5, c09
+ SUB c10, t2, b5
+ fmov b5, c10
+ SUB c11, t3, b5
+ fmov b5, c11
+ SUB c12, t4, b5
+ fmov b5, c12
+
+ MUL a3, c13, b5
+ fmov b5, t1
+ MUL a3, c14, b5
+ fmov b5, t2
+ MUL a3, c15, b5
+ fmov b5, t3
+ MUL a3, c16, b5
+ fmov b5, t4
+
+ SUB c05, t1, b5
+ fmov b5, c05
+ SUB c06, t2, b5
+ fmov b5, c06
+ SUB c07, t3, b5
+ fmov b5, c07
+ SUB c08, t4, b5
+ fmov b5, c08
+
+ MUL a4, c13, b5
+ fmov b5, t1
+ MUL a4, c14, b5
+ fmov b5, t2
+ MUL a4, c15, b5
+ fmov b5, t3
+ MUL a4, c16, b5
+ fmov b5, t4
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c02, t2, b5
+ fmov b5, c02
+ SUB c03, t3, b5
+ fmov b5, c03
+ SUB c04, t4, b5
+ fmov b5, c04
+
+ LD b1, 10 * SIZE(BO)
+ LD b2, 9 * SIZE(BO)
+ LD b3, 8 * SIZE(BO)
+
+ MUL b1, c09, b5
+ fmov b5, c09
+ MUL b1, c10, b5
+ fmov b5, c10
+ MUL b1, c11, b5
+ fmov b5, c11
+ MUL b1, c12, b5
+ fmov b5, c12
+
+ MUL b2, c09, b5
+ fmov b5, t1
+ MUL b2, c10, b5
+ fmov b5, t2
+ MUL b2, c11, b5
+ fmov b5, t3
+ MUL b2, c12, b5
+ fmov b5, t4
+
+ SUB c05, t1, b5
+ fmov b5, c05
+ SUB c06, t2, b5
+ fmov b5, c06
+ SUB c07, t3, b5
+ fmov b5, c07
+ SUB c08, t4, b5
+ fmov b5, c08
+
+ MUL b3, c09, b5
+ fmov b5, t1
+ MUL b3, c10, b5
+ fmov b5, t2
+ MUL b3, c11, b5
+ fmov b5, t3
+ MUL b3, c12, b5
+ fmov b5, t4
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c02, t2, b5
+ fmov b5, c02
+ SUB c03, t3, b5
+ fmov b5, c03
+ SUB c04, t4, b5
+ fmov b5, c04
+
+ LD a1, 5 * SIZE(BO)
+ LD a2, 4 * SIZE(BO)
+ LD a3, 0 * SIZE(BO)
+
+ MUL a1, c05, b5
+ fmov b5, c05
+ MUL a1, c06, b5
+ fmov b5, c06
+ MUL a1, c07, b5
+ fmov b5, c07
+ MUL a1, c08, b5
+ fmov b5, c08
+
+ MUL a2, c05, b5
+ fmov b5, t1
+ MUL a2, c06, b5
+ fmov b5, t2
+ MUL a2, c07, b5
+ fmov b5, t3
+ MUL a2, c08, b5
+ fmov b5, t4
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c02, t2, b5
+ fmov b5, c02
+ SUB c03, t3, b5
+ fmov b5, c03
+ SUB c04, t4, b5
+ fmov b5, c04
+
+ MUL a3, c01, b5
+ fmov b5, c01
+ MUL a3, c02, b5
+ fmov b5, c02
+ MUL a3, c03, b5
+ fmov b5, c03
+ MUL a3, c04, b5
+ fmov b5, c04
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c05, 1 * SIZE(BO)
+ ST c09, 2 * SIZE(BO)
+ ST c13, 3 * SIZE(BO)
+
+ ST c02, 4 * SIZE(BO)
+ ST c06, 5 * SIZE(BO)
+ ST c10, 6 * SIZE(BO)
+ ST c14, 7 * SIZE(BO)
+
+ ST c03, 8 * SIZE(BO)
+ ST c07, 9 * SIZE(BO)
+ ST c11, 10 * SIZE(BO)
+ ST c15, 11 * SIZE(BO)
+
+ ST c04, 12 * SIZE(BO)
+ ST c08, 13 * SIZE(BO)
+ ST c12, 14 * SIZE(BO)
+ ST c16, 15 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c03, 2 * SIZE(AO)
+ ST c04, 3 * SIZE(AO)
+
+ ST c05, 4 * SIZE(AO)
+ ST c06, 5 * SIZE(AO)
+ ST c07, 6 * SIZE(AO)
+ ST c08, 7 * SIZE(AO)
+
+ ST c09, 8 * SIZE(AO)
+ ST c10, 9 * SIZE(AO)
+ ST c11, 10 * SIZE(AO)
+ ST c12, 11 * SIZE(AO)
+
+ ST c13, 12 * SIZE(AO)
+ ST c14, 13 * SIZE(AO)
+ ST c15, 14 * SIZE(AO)
+ ST c16, 15 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -4 * SIZE(C1)
+ ldi C2, -4 * SIZE(C2)
+ ldi C3, -4 * SIZE(C3)
+ ldi C4, -4 * SIZE(C4)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c03, 2 * SIZE(C1)
+ ST c04, 3 * SIZE(C1)
+
+ ST c05, 0 * SIZE(C2)
+ ST c06, 1 * SIZE(C2)
+ ST c07, 2 * SIZE(C2)
+ ST c08, 3 * SIZE(C2)
+
+ ST c09, 0 * SIZE(C3)
+ ST c10, 1 * SIZE(C3)
+ ST c11, 2 * SIZE(C3)
+ ST c12, 3 * SIZE(C3)
+
+ ST c13, 0 * SIZE(C4)
+ ST c14, 1 * SIZE(C4)
+ ST c15, 2 * SIZE(C4)
+ ST c16, 3 * SIZE(C4)
+
+#ifndef LN
+ ldi C1, 4 * SIZE(C1)
+ ldi C2, 4 * SIZE(C2)
+ ldi C3, 4 * SIZE(C3)
+ ldi C4, 4 * SIZE(C4)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 2 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 2, TMP1
+ addl AO, TMP1, AO
+ addl BO, TMP1, BO
+#endif
+
+#ifdef LT
+ addl KK, 4, KK
+#endif
+
+#ifdef LN
+ subl KK, 4, KK
+#endif
+
+ ldi I, -1(I)
+
+ bgt I, $L11
+ .align 4
+
+$L20:
+ and M, 2, I
+ ble I, $L30
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c10
+ LD a4, 3 * SIZE(AO)
+ fclr c14
+
+ LD b1, 0 * SIZE(B)
+ ldi L, -2(KK)
+ LD b2, 1 * SIZE(B)
+ ldi AO, 2 * SIZE(AO)
+
+ LD b3, 2 * SIZE(B)
+ fclr c01
+ LD b4, 3 * SIZE(B)
+ fclr c05
+
+ ldi BO, 4 * SIZE(B)
+ fclr c02
+ fclr c06
+ ble KK, $L28
+
+ ble L, $L25
+
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 1, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 2, TMP2
+ addl B, TMP2, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c10
+ LD a4, 3 * SIZE(AO)
+ fclr c14
+
+ LD b1, 0 * SIZE(BO)
+ ldi L, -2(TMP1)
+ LD b2, 1 * SIZE(BO)
+ ldi AO, 2 * SIZE(AO)
+
+ LD b3, 2 * SIZE(BO)
+ fclr c01
+ LD b4, 3 * SIZE(BO)
+ fclr c05
+
+ ldi BO, 4 * SIZE(BO)
+ fclr c02
+ fclr c06
+ ble TMP1, $L28
+
+ ble L, $L25
+#endif
+ .align 4
+
+$L22:
+ ADD c09, t1, b5
+ fmov b5, c09
+ unop
+ MUL a1, b1, t1
+ unop
+
+ ADD c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD c13, t3, b5
+ fmov b5, c13
+ unop
+ MUL a1, b2, t3
+ ldi BO, 8 * SIZE(BO)
+
+ ADD c14, t4, b5
+ fmov b5, c14
+ unop
+ MUL a2, b2, t4
+ LD b2, -7 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b3, t1
+ unop
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b3, t2
+ LD b3, -6 * SIZE(BO)
+
+ ADD c05, t3, b5
+ fmov b5, c05
+ unop
+ MUL a1, b4, t3
+ LD a1, 2 * SIZE(AO)
+
+ ADD c06, t4, b5
+ fmov b5, c06
+ MUL a2, b4, t4
+ LD b5, -5 * SIZE(BO)
+ FIMOVD b5, tmp
+
+ ADD c09, t1, b5
+ fmov b5, c09
+ unop
+ MUL a3, b1, t1
+ LD a2, 3 * SIZE(AO)
+
+ ADD c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL a4, b1, t2
+ LD b1, -4 * SIZE(BO)
+
+ ADD c13, t3, b5
+ fmov b5, c13
+ unop
+ MUL a3, b2, t3
+ ldi AO, 4 * SIZE(AO)
+
+ ADD c14, t4, b5
+ fmov b5, c14
+ MUL a4, b2, t4
+ LD b2, -3 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ ldi L, -2(L)
+ MUL a3, b3, t1
+ LD b4, -1 * SIZE(BO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a4, b3, t2
+ LD b3, -2 * SIZE(BO)
+
+ ADD c05, t3, b5
+ fmov b5, c05
+ unop
+ IFMOVD tmp, b5
+ MUL a3, b5, t3
+ LD a3, 0 * SIZE(AO)
+
+ ADD c06, t4, b5
+ fmov b5, c06
+ IFMOVD tmp, b5
+ MUL a4, b5, t4
+ LD a4, 1 * SIZE(AO)
+ bgt L, $L22
+ .align 4
+
+
+$L25:
+ ADD c09, t1, b5
+ fmov b5, c09
+ MUL a1, b1, b5
+ fmov b5, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L27
+#else
+ blbs TMP1, $L27
+#endif
+
+ ADD c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL a2, b1, b5
+ fmov b5, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD c13, t3, b5
+ fmov b5, c13
+ unop
+ MUL a1, b2, b5
+ fmov b5, t3
+ unop
+
+ ADD c14, t4, b5
+ fmov b5, c14
+ unop
+ MUL a2, b2, b5
+ fmov b5, t4
+ LD b2, 1 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b3, b5
+ fmov b5, t1
+ ldi AO, 2 * SIZE(AO)
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b3, b5
+ fmov b5, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD c05, t3, b5
+ fmov b5, c05
+ unop
+ MUL a1, b4, b5
+ fmov b5, t3
+ LD a1, -2 * SIZE(AO)
+
+ ADD c06, t4, b5
+ fmov b5, c06
+ unop
+ MUL a2, b4, b5
+ fmov b5, t4
+ LD a2, -1 * SIZE(AO)
+
+ ADD c09, t1, b5
+ fmov b5, c09
+ LD b4, 3 * SIZE(BO)
+ MUL a1, b1, b5
+ fmov b5, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L27:
+ ADD c10, t2, b5
+ fmov b5, c10
+ MUL a2, b1, b5
+ fmov b5, t2
+ ADD c13, t3, b5
+ fmov b5, c13
+ MUL a1, b2, b5
+ fmov b5, t3
+
+ ADD c14, t4, b5
+ fmov b5, c14
+ MUL a2, b2, b5
+ fmov b5, t4
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL a1, b3, b5
+ fmov b5, t1
+
+ ADD c02, t2, b5
+ fmov b5, c02
+ MUL a2, b3, b5
+ fmov b5, t2
+ ADD c05, t3, b5
+ fmov b5, c05
+ MUL a1, b4, b5
+ fmov b5, t3
+
+ ADD c06, t4, b5
+ fmov b5, c06
+ ldi AO, 2 * SIZE(AO)
+ MUL a2, b4, b5
+ fmov b5, t4
+ ldi BO, 4 * SIZE(BO)
+
+ ADD c09, t1, b5
+ fmov b5, c09
+ ADD c10, t2, b5
+ fmov b5, c10
+ ADD c13, t3, b5
+ fmov b5, c13
+ ADD c14, t4, b5
+ fmov b5, c14
+ .align 4
+
+$L28:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 2, TMP1
+#else
+ subl KK, 4, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -2 * SIZE(AO)
+ ldi BO, -4 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ LD b1, 4 * SIZE(BO)
+ LD b2, 5 * SIZE(BO)
+ LD b3, 6 * SIZE(BO)
+ LD b4, 7 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c05, b5
+ fmov b5, c05
+ SUB a3, c09, b5
+ fmov b5, c09
+ SUB a4, c13, b5
+ fmov b5, c13
+
+ SUB b1, c02, b5
+ fmov b5, c02
+ SUB b2, c06, b5
+ fmov b5, c06
+ SUB b3, c10, b5
+ fmov b5, c10
+ SUB b4, c14, b5
+ fmov b5, c14
+
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 4 * SIZE(AO)
+ LD b2, 5 * SIZE(AO)
+ LD b3, 6 * SIZE(AO)
+ LD b4, 7 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+ SUB a3, c05, b5
+ fmov b5, c05
+ SUB a4, c06, b5
+ fmov b5, c06
+
+ SUB b1, c09, b5
+ fmov b5, c09
+ SUB b2, c10, b5
+ fmov b5, c10
+ SUB b3, c13, b5
+ fmov b5, c13
+ SUB b4, c14, b5
+ fmov b5, c14
+#endif
+
+#ifdef LN
+ LD a1, 3 * SIZE(AO)
+ LD a2, 2 * SIZE(AO)
+ LD a3, 0 * SIZE(AO)
+
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a1, c06, b5
+ fmov b5, c06
+ MUL a1, c10, b5
+ fmov b5, c10
+ MUL a1, c14, b5
+ fmov b5, c14
+
+ MUL a2, c02, b5
+ fmov b5, t1
+ MUL a2, c06, b5
+ fmov b5, t2
+ MUL a2, c10, b5
+ fmov b5, t3
+ MUL a2, c14, b5
+ fmov b5, t4
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c05, t2, b5
+ fmov b5, c05
+ SUB c09, t3, b5
+ fmov b5, c09
+ SUB c13, t4, b5
+ fmov b5, c13
+
+ MUL a3, c01, b5
+ fmov b5, c01
+ MUL a3, c05, b5
+ fmov b5, c05
+ MUL a3, c09, b5
+ fmov b5, c09
+ MUL a3, c13, b5
+ fmov b5, c13
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 3 * SIZE(AO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c05, b5
+ fmov b5, c05
+ MUL a1, c09, b5
+ fmov b5, c09
+ MUL a1, c13, b5
+ fmov b5, c13
+
+ MUL a2, c01, b5
+ fmov b5, t1
+ MUL a2, c05, b5
+ fmov b5, t2
+ MUL a2, c09, b5
+ fmov b5, t3
+ MUL a2, c13, b5
+ fmov b5, t4
+
+ SUB c02, t1, b5
+ fmov b5, c02
+ SUB c06, t2, b5
+ fmov b5, c06
+ SUB c10, t3, b5
+ fmov b5, c10
+ SUB c14, t4, b5
+ fmov b5, c14
+
+ MUL a3, c02, b5
+ fmov b5, c02
+ MUL a3, c06, b5
+ fmov b5, c06
+ MUL a3, c10, b5
+ fmov b5, c10
+ MUL a3, c14, b5
+ fmov b5, c14
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+
+ MUL a2, c01, b5
+ fmov b5, t1
+ MUL a2, c02, b5
+ fmov b5, t2
+
+ SUB c05, t1, b5
+ fmov b5, c05
+ SUB c06, t2, b5
+ fmov b5, c06
+
+ MUL a3, c01, b5
+ fmov b5, t1
+ MUL a3, c02, b5
+ fmov b5, t2
+
+ SUB c09, t1, b5
+ fmov b5, c09
+ SUB c10, t2, b5
+ fmov b5, c10
+
+ MUL a4, c01, b5
+ fmov b5, t1
+ MUL a4, c02, b5
+ fmov b5, t2
+
+ SUB c13, t1, b5
+ fmov b5, c13
+ SUB c14, t2, b5
+ fmov b5, c14
+
+ LD b1, 5 * SIZE(BO)
+ LD b2, 6 * SIZE(BO)
+ LD b3, 7 * SIZE(BO)
+
+ MUL b1, c05, b5
+ fmov b5, c05
+ MUL b1, c06, b5
+ fmov b5, c06
+
+ MUL b2, c05, b5
+ fmov b5, t1
+ MUL b2, c06, b5
+ fmov b5, t2
+
+ SUB c09, t1, b5
+ fmov b5, c09
+ SUB c10, t2, b5
+ fmov b5, c10
+
+ MUL b3, c05, b5
+ fmov b5, t1
+ MUL b3, c06, b5
+ fmov b5, t2
+
+ SUB c13, t1, b5
+ fmov b5, c13
+ SUB c14, t2, b5
+ fmov b5, c14
+
+ LD a1, 10 * SIZE(BO)
+ LD a2, 11 * SIZE(BO)
+ LD a3, 15 * SIZE(BO)
+
+ MUL a1, c09, b5
+ fmov b5, c09
+ MUL a1, c10, b5
+ fmov b5, c10
+
+ MUL a2, c09, b5
+ fmov b5, t1
+ MUL a2, c10, b5
+ fmov b5, t2
+
+ SUB c13, t1, b5
+ fmov b5, c13
+ SUB c14, t2, b5
+ fmov b5, c14
+
+ MUL a3, c13, b5
+ fmov b5, c13
+ MUL a3, c14, b5
+ fmov b5, c14
+#endif
+
+#ifdef RT
+ LD a1, 15 * SIZE(BO)
+ LD a2, 14 * SIZE(BO)
+ LD a3, 13 * SIZE(BO)
+ LD a4, 12 * SIZE(BO)
+
+ MUL a1, c13, b5
+ fmov b5, c13
+ MUL a1, c14, b5
+ fmov b5, c14
+
+ MUL a2, c13, b5
+ fmov b5, t1
+ MUL a2, c14, b5
+ fmov b5, t2
+
+ SUB c09, t1, b5
+ fmov b5, c09
+ SUB c10, t2, b5
+ fmov b5, c10
+
+ MUL a3, c13, b5
+ fmov b5, t1
+ MUL a3, c14, b5
+ fmov b5, t2
+
+ SUB c05, t1, b5
+ fmov b5, c05
+ SUB c06, t2, b5
+ fmov b5, c06
+
+ MUL a4, c13, b5
+ fmov b5, t1
+ MUL a4, c14, b5
+ fmov b5, t2
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c02, t2, b5
+ fmov b5, c02
+
+ LD b1, 10 * SIZE(BO)
+ LD b2, 9 * SIZE(BO)
+ LD b3, 8 * SIZE(BO)
+
+ MUL b1, c09, b5
+ fmov b5, c09
+ MUL b1, c10, b5
+ fmov b5, c10
+
+ MUL b2, c09, b5
+ fmov b5, t1
+ MUL b2, c10, b5
+ fmov b5, t2
+
+ SUB c05, t1, b5
+ fmov b5, c05
+ SUB c06, t2, b5
+ fmov b5, c06
+
+ MUL b3, c09, b5
+ fmov b5, t1
+ MUL b3, c10, b5
+ fmov b5, t2
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c02, t2, b5
+ fmov b5, c02
+
+ LD a1, 5 * SIZE(BO)
+ LD a2, 4 * SIZE(BO)
+ LD a3, 0 * SIZE(BO)
+
+ MUL a1, c05, b5
+ fmov b5, c05
+ MUL a1, c06, b5
+ fmov b5, c06
+
+ MUL a2, c05, b5
+ fmov b5, t1
+ MUL a2, c06, b5
+ fmov b5, t2
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c02, t2, b5
+ fmov b5, c02
+
+ MUL a3, c01, b5
+ fmov b5, c01
+ MUL a3, c02, b5
+ fmov b5, c02
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c05, 1 * SIZE(BO)
+ ST c09, 2 * SIZE(BO)
+ ST c13, 3 * SIZE(BO)
+
+ ST c02, 4 * SIZE(BO)
+ ST c06, 5 * SIZE(BO)
+ ST c10, 6 * SIZE(BO)
+ ST c14, 7 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c05, 2 * SIZE(AO)
+ ST c06, 3 * SIZE(AO)
+
+ ST c09, 4 * SIZE(AO)
+ ST c10, 5 * SIZE(AO)
+ ST c13, 6 * SIZE(AO)
+ ST c14, 7 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -2 * SIZE(C1)
+ ldi C2, -2 * SIZE(C2)
+ ldi C3, -2 * SIZE(C3)
+ ldi C4, -2 * SIZE(C4)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c05, 0 * SIZE(C2)
+ ST c06, 1 * SIZE(C2)
+
+ ST c09, 0 * SIZE(C3)
+ ST c10, 1 * SIZE(C3)
+ ST c13, 0 * SIZE(C4)
+ ST c14, 1 * SIZE(C4)
+
+#ifndef LN
+ ldi C1, 2 * SIZE(C1)
+ ldi C2, 2 * SIZE(C2)
+ ldi C3, 2 * SIZE(C3)
+ ldi C4, 2 * SIZE(C4)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 1 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 2, KK
+#endif
+
+#ifdef LN
+ subl KK, 2, KK
+#endif
+ .align 4
+
+$L30:
+ and M, 1, I
+ ble I, $L39
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+
+ LD b1, 0 * SIZE(B)
+ ldi L, -2(KK)
+ LD b2, 1 * SIZE(B)
+ ldi AO, 1 * SIZE(AO)
+
+ LD b3, 2 * SIZE(B)
+ fclr c09
+ LD b4, 3 * SIZE(B)
+ fclr c13
+
+ ldi BO, 4 * SIZE(B)
+ ble KK, $L38
+
+ ble L, $L35
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 0, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 2, TMP2
+ addl B, TMP2, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+
+ LD b1, 0 * SIZE(BO)
+ ldi L, -2(TMP1)
+ LD b2, 1 * SIZE(BO)
+ ldi AO, 1 * SIZE(AO)
+
+ LD b3, 2 * SIZE(BO)
+ fclr c09
+ LD b4, 3 * SIZE(BO)
+ fclr c13
+
+ ldi BO, 4 * SIZE(BO)
+ ble TMP1, $L38
+
+ ble L, $L35
+#endif
+ .align 4
+
+$L32:
+ ADD c01, t1, b5
+ fmov b5, c01
+ ldi L, -2(L)
+ MUL a1, b1, t1
+ LD b1, 0 * SIZE(BO)
+
+ ADD c05, t2, b5
+ fmov b5, c05
+ ldi AO, 2 * SIZE(AO)
+ MUL a1, b2, t2
+ LD b2, 1 * SIZE(BO)
+
+ ADD c09, t3, b5
+ fmov b5, c09
+ LD b5, 3 * SIZE(BO)
+ FIMOVD b5, tmp
+ MUL a1, b3, t3
+ LD b3, 2 * SIZE(BO)
+
+ ADD c13, t4, b5
+ fmov b5, c13
+ MUL a1, b4, t4
+ LD a1, -1 * SIZE(AO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL a2, b1, t1
+ LD b1, 4 * SIZE(BO)
+ ldi BO, 8 * SIZE(BO)
+
+ ADD c05, t2, b5
+ fmov b5, c05
+ MUL a2, b2, t2
+ LD b2, -3 * SIZE(BO)
+
+ ADD c09, t3, b5
+ fmov b5, c09
+ LD b4, -1 * SIZE(BO)
+ MUL a2, b3, t3
+ LD b3, -2 * SIZE(BO)
+
+ ADD c13, t4, b5
+ fmov b5, c13
+ IFMOVD tmp, b5
+ MUL a2, b5, t4
+ LD a2, 0 * SIZE(AO)
+ bgt L, $L32
+ .align 4
+
+$L35:
+ ADD c01, t1, b5
+ fmov b5, c01
+ MUL a1, b1, b5
+ fmov b5, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L37
+#else
+ blbs TMP1, $L37
+#endif
+ .align 4
+
+ ADD c05, t2, b5
+ fmov b5, c05
+ LD b1, 0 * SIZE(BO)
+ MUL a1, b2, b5
+ fmov b5, t2
+ LD b2, 1 * SIZE(BO)
+
+ ADD c09, t3, b5
+ fmov b5, c09
+ MUL a1, b3, b5
+ fmov b5, t3
+ LD b3, 2 * SIZE(BO)
+
+ ADD c13, t4, b5
+ fmov b5, c13
+ MUL a1, b4, b5
+ fmov b5, t4
+ LD a1, 0 * SIZE(AO)
+ ldi AO, 1 * SIZE(AO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ LD b4, 3 * SIZE(BO)
+ MUL a1, b1, b5
+ fmov b5, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L37:
+ ADD c05, t2, b5
+ fmov b5, c05
+ MUL a1, b2, b5
+ fmov b5, t2
+ ADD c09, t3, b5
+ fmov b5, c09
+ MUL a1, b3, b5
+ fmov b5, t3
+
+ ADD c13, t4, b5
+ fmov b5, c13
+ ldi AO, 1 * SIZE(AO)
+ MUL a1, b4, b5
+ fmov b5, t4
+ ldi BO, 4 * SIZE(BO)
+
+ ADD c01, t1, b5
+ fmov b5, c01
+ ADD c05, t2, b5
+ fmov b5, c05
+ ADD c09, t3, b5
+ fmov b5, c09
+ ADD c13, t4, b5
+ fmov b5, c13
+
+$L38:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 1, TMP1
+#else
+ subl KK, 4, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -1 * SIZE(AO)
+ ldi BO, -4 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c05, b5
+ fmov b5, c05
+ SUB a3, c09, b5
+ fmov b5, c09
+ SUB a4, c13, b5
+ fmov b5, c13
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c05, b5
+ fmov b5, c05
+ SUB a3, c09, b5
+ fmov b5, c09
+ SUB a4, c13, b5
+ fmov b5, c13
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(AO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c05, b5
+ fmov b5, c05
+ MUL a1, c09, b5
+ fmov b5, c09
+ MUL a1, c13, b5
+ fmov b5, c13
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a2, c01, b5
+ fmov b5, t1
+ SUB c05, t1, b5
+ fmov b5, c05
+ MUL a3, c01, b5
+ fmov b5, t1
+ SUB c09, t1, b5
+ fmov b5, c09
+ MUL a4, c01, b5
+ fmov b5, t1
+ SUB c13, t1, b5
+ fmov b5, c13
+
+ LD b1, 5 * SIZE(BO)
+ LD b2, 6 * SIZE(BO)
+ LD b3, 7 * SIZE(BO)
+
+ MUL b1, c05, b5
+ fmov b5, c05
+ MUL b2, c05, b5
+ fmov b5, t1
+ SUB c09, t1, b5
+ fmov b5, c09
+ MUL b3, c05, b5
+ fmov b5, t1
+ SUB c13, t1, b5
+ fmov b5, c13
+
+ LD a1, 10 * SIZE(BO)
+ LD a2, 11 * SIZE(BO)
+ LD a3, 15 * SIZE(BO)
+
+ MUL a1, c09, b5
+ fmov b5, c09
+ MUL a2, c09, b5
+ fmov b5, t1
+ SUB c13, t1, b5
+ fmov b5, c13
+ MUL a3, c13, b5
+ fmov b5, c13
+#endif
+
+#ifdef RT
+ LD a1, 15 * SIZE(BO)
+ LD a2, 14 * SIZE(BO)
+ LD a3, 13 * SIZE(BO)
+ LD a4, 12 * SIZE(BO)
+
+ MUL a1, c13, b5
+ fmov b5, c13
+ MUL a2, c13, b5
+ fmov b5, t1
+ SUB c09, t1, b5
+ fmov b5, c09
+ MUL a3, c13, b5
+ fmov b5, t1
+ SUB c05, t1, b5
+ fmov b5, c05
+ MUL a4, c13, b5
+ fmov b5, t1
+ SUB c01, t1, b5
+ fmov b5, c01
+
+ LD b1, 10 * SIZE(BO)
+ LD b2, 9 * SIZE(BO)
+ LD b3, 8 * SIZE(BO)
+
+ MUL b1, c09, b5
+ fmov b5, c09
+ MUL b2, c09, b5
+ fmov b5, t1
+ SUB c05, t1, b5
+ fmov b5, c05
+ MUL b3, c09, b5
+ fmov b5, t1
+ SUB c01, t1, b5
+ fmov b5, c01
+
+ LD a1, 5 * SIZE(BO)
+ LD a2, 4 * SIZE(BO)
+ LD a3, 0 * SIZE(BO)
+
+ MUL a1, c05, b5
+ fmov b5, c05
+ MUL a2, c05, b5
+ fmov b5, t1
+ SUB c01, t1, b5
+ fmov b5, c01
+ MUL a3, c01, b5
+ fmov b5, c01
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c05, 1 * SIZE(BO)
+ ST c09, 2 * SIZE(BO)
+ ST c13, 3 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c05, 1 * SIZE(AO)
+ ST c09, 2 * SIZE(AO)
+ ST c13, 3 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -1 * SIZE(C1)
+ ldi C2, -1 * SIZE(C2)
+ ldi C3, -1 * SIZE(C3)
+ ldi C4, -1 * SIZE(C4)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c05, 0 * SIZE(C2)
+ ST c09, 0 * SIZE(C3)
+ ST c13, 0 * SIZE(C4)
+
+#ifdef RT
+ sll K, 0 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 1, KK
+#endif
+
+#ifdef LN
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L39:
+#ifdef LN
+ sll K, 2 + BASE_SHIFT, TMP1
+ addl B, TMP1, B
+#endif
+
+#if defined(LT) || defined(RN)
+ mov BO, B
+#endif
+
+#ifdef RN
+ addl KK, 4, KK
+#endif
+
+#ifdef RT
+ subl KK, 4, KK
+#endif
+ ldi J, -1(J)
+ bgt J, $L01
+ .align 4
+
+$L999:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+ ldl $9, 64($sp)
+ clr $0
+ ldi $sp, STACKSIZE($sp)
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/trsm_kernel_4x4_RT.S.bak b/kernel/sw_64/trsm_kernel_4x4_RT.S.bak
new file mode 100644
index 0000000..af57279
--- /dev/null
+++ b/kernel/sw_64/trsm_kernel_4x4_RT.S.bak
@@ -0,0 +1,4072 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+
+#if !defined(SW2B)
+#error "Architecture is not specified."
+#endif
+
+#ifdef SW2B
+#define PREFETCHSIZE 56
+#define UNOP nop
+#endif
+
+#ifdef EV6
+#define PREFETCHSIZE 56
+#define UNOP unop
+#endif
+
+#ifdef EV5
+#define PREFETCHSIZE 56
+#define UNOP
+#endif
+
+#ifdef EV4
+#define UNOP
+#endif
+
+#define STACKSIZE 80
+
+#define M $16
+#define N $17
+#define K $18
+#define A $20
+#define B $21
+#define C $22
+#define LDC $23
+
+#define C1 $19
+#define C2 $24
+#define C3 $25
+#define C4 $27
+
+#define AO $at
+#define BO $5
+#define I $6
+#define J $7
+#define L $8
+
+#define a1 $f16
+#define a2 $f17
+#define a3 $f18
+#define a4 $f19
+
+#define b1 $f20
+#define b2 $f21
+#define b3 $f22
+#define b4 $f23
+
+#define t1 $f24
+#define t2 $f25
+#define t3 $f26
+#define t4 $f27
+
+#define a5 $f28
+#define a6 $f30
+#define b5 $f29
+
+#define alpha $f30
+
+#define c01 $f0
+#define c02 $f1
+#define c03 $f2
+#define c04 $f3
+
+#define c05 $f4
+#define c06 $f5
+#define c07 $f6
+#define c08 $f7
+
+#define c09 $f8
+#define c10 $f9
+#define c11 $f10
+#define c12 $f11
+
+#define c13 $f12
+#define c14 $f13
+#define c15 $f14
+#define c16 $f15
+
+#define TMP1 $0
+#define TMP2 $1
+#define KK $2
+#define AORIG $3
+#define OFFSET $4
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, STACKSIZE, $26, 0
+
+ ldi $sp, -STACKSIZE($sp)
+
+ ldl C, 0 + STACKSIZE($sp)
+ ldl LDC, 8 + STACKSIZE($sp)
+ ldl OFFSET, 16 + STACKSIZE($sp)
+
+ SXADDQ LDC, 0, LDC
+
+ fstd $f2, 0($sp)
+ fstd $f3, 8($sp)
+ fstd $f4, 16($sp)
+ fstd $f5, 24($sp)
+ fstd $f6, 32($sp)
+ fstd $f7, 40($sp)
+ fstd $f8, 48($sp)
+ fstd $f9, 56($sp)
+
+ cmple M, 0, $0
+ cmple N, 0, $1
+ cmple K, 0, $2
+
+ or $0, $1, $0
+ or $0, $2, $0
+ bne $0, $L999
+
+#ifdef LN
+ mull M, K, TMP1
+ SXADDQ TMP1, A, A
+ SXADDQ M, C, C
+#endif
+
+#ifdef RN
+ negq OFFSET, KK
+#endif
+
+#ifdef RT
+ mull N, K, TMP1
+ SXADDQ TMP1, B, B
+
+ mull N, LDC, TMP1
+ addl TMP1, C, C
+
+ subl N, OFFSET, KK
+#endif
+
+ and N, 1, J
+ ble J, $L40
+
+#ifdef RT
+ sll K, BASE_SHIFT, TMP1
+ subl B, TMP1, B
+
+ subl C, LDC, C
+#endif
+
+ mov C, C1
+#ifndef RT
+ addl C, LDC, C
+#endif
+
+#ifdef LN
+ addl M, OFFSET, KK
+#endif
+
+#ifdef LT
+ mov OFFSET, KK
+#endif
+
+#if defined(LN) || defined(RT)
+ mov A, AORIG
+#else
+ mov A, AO
+#endif
+
+ sra M, 2, I
+ ble I, $L100
+ .align 4
+
+$L91:
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c02
+ LD b3, 2 * SIZE(B)
+ fclr c03
+ LD b4, 3 * SIZE(B)
+ fclr c04
+
+ sra KK, 2, L
+ mov B, BO
+ ble L, $L95
+
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 2, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 2, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c02
+ LD b3, 2 * SIZE(BO)
+ fclr c03
+ LD b4, 3 * SIZE(BO)
+ fclr c04
+
+ sra TMP1, 2, L
+ unop
+ ble L, $L95
+#endif
+ .align 5
+
+$L92:
+ ADD c01, t1, c01
+ unop
+ MUL a1, b1, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD c02, t2, c02
+ ldi L, -1(L)
+ MUL a2, b1, t2
+ LD a2, 5 * SIZE(AO)
+
+ ADD c03, t3, c03
+ unop
+ MUL a3, b1, t3
+ LD a3, 6 * SIZE(AO)
+
+ ADD c04, t4, c04
+ MUL a4, b1, t4
+ LD a4, 7 * SIZE(AO)
+ LD b1, 4 * SIZE(BO)
+
+ ADD c01, t1, c01
+ unop
+ MUL a1, b2, t1
+ LD a1, 8 * SIZE(AO)
+
+ ADD c02, t2, c02
+ unop
+ MUL a2, b2, t2
+ LD a2, 9 * SIZE(AO)
+
+ ADD c03, t3, c03
+ unop
+ MUL a3, b2, t3
+ LD a3, 10 * SIZE(AO)
+
+ ADD c04, t4, c04
+ MUL a4, b2, t4
+ LD a4, 11 * SIZE(AO)
+ LD b2, 5 * SIZE(BO)
+
+ ADD c01, t1, c01
+ unop
+ MUL a1, b3, t1
+ LD a1, 12 * SIZE(AO)
+
+ ADD c02, t2, c02
+ unop
+ MUL a2, b3, t2
+ LD a2, 13 * SIZE(AO)
+
+ ADD c03, t3, c03
+ unop
+ MUL a3, b3, t3
+ LD a3, 14 * SIZE(AO)
+
+ ADD c04, t4, c04
+ MUL a4, b3, t4
+ LD a5, 15 * SIZE(AO)
+ LD b3, 6 * SIZE(BO)
+
+ ADD c01, t1, c01
+ MUL a1, b4, t1
+ LD a1, 16 * SIZE(AO)
+ ldi AO, 16 * SIZE(AO)
+
+ ADD c02, t2, c02
+ ldi BO, 4 * SIZE(BO)
+ MUL a2, b4, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD c03, t3, c03
+ LD a4, 3 * SIZE(AO)
+ MUL a3, b4, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD c04, t4, c04
+ MUL a5, b4, t4
+ LD b4, 3 * SIZE(BO)
+ bgt L, $L92
+ .align 4
+
+$L95:
+#if defined(LT) || defined(RN)
+ and KK, 3, L
+#else
+ and TMP1, 3, L
+#endif
+ unop
+ ble L, $L98
+ .align 4
+
+$L96:
+ ADD c01, t1, c01
+ ldi L, -1(L)
+ MUL a1, b1, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD c02, t2, c02
+ ldi BO, 1 * SIZE(BO)
+ MUL a2, b1, t2
+ LD a2, 5 * SIZE(AO)
+
+ ADD c03, t3, c03
+ unop
+ MUL a3, b1, t3
+ LD a3, 6 * SIZE(AO)
+
+ ADD c04, t4, c04
+ MUL a4, b1, t4
+ LD a4, 7 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+
+ ldi AO, 4 * SIZE(AO)
+ bgt L, $L96
+ .align 4
+
+$L98:
+ ADD c01, t1, c01
+ ADD c02, t2, c02
+ ADD c03, t3, c03
+ ADD c04, t4, c04
+
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 4, TMP1
+#else
+ subl KK, 1, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl B, TMP2, BO
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+ SUB a3, c03, c03
+ SUB a4, c04, c04
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+ SUB a3, c03, c03
+ SUB a4, c04, c04
+#endif
+
+#ifdef LN
+ LD a1, 15 * SIZE(AO)
+ LD a2, 14 * SIZE(AO)
+ LD a3, 13 * SIZE(AO)
+ LD a4, 12 * SIZE(AO)
+
+ MUL a1, c04, c04
+ MUL a2, c04, t1
+ SUB c03, t1, c03
+ MUL a3, c04, t1
+ SUB c02, t1, c02
+ MUL a4, c04, t1
+ SUB c01, t1, c01
+
+ LD b1, 10 * SIZE(AO)
+ LD b2, 9 * SIZE(AO)
+ LD b3, 8 * SIZE(AO)
+
+ MUL b1, c03, c03
+ MUL b2, c03, t1
+ SUB c02, t1, c02
+ MUL b3, c03, t1
+ SUB c01, t1, c01
+
+ LD a1, 5 * SIZE(AO)
+ LD a2, 4 * SIZE(AO)
+ LD a3, 0 * SIZE(AO)
+
+ MUL a1, c02, c02
+ MUL a2, c02, t1
+ SUB c01, t1, c01
+ MUL a3, c01, c01
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ MUL a1, c01, c01
+ MUL a2, c01, t1
+ SUB c02, t1, c02
+ MUL a3, c01, t1
+ SUB c03, t1, c03
+ MUL a4, c01, t1
+ SUB c04, t1, c04
+
+ LD b1, 5 * SIZE(AO)
+ LD b2, 6 * SIZE(AO)
+ LD b3, 7 * SIZE(AO)
+
+ MUL b1, c02, c02
+ MUL b2, c02, t1
+ SUB c03, t1, c03
+ MUL b3, c02, t1
+ SUB c04, t1, c04
+
+ LD a1, 10 * SIZE(AO)
+ LD a2, 11 * SIZE(AO)
+ LD a3, 15 * SIZE(AO)
+
+ MUL a1, c03, c03
+ MUL a2, c03, t1
+ SUB c04, t1, c04
+ MUL a3, c04, c04
+#endif
+
+#if defined(RN) || defined(RT)
+ LD a1, 0 * SIZE(BO)
+
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+ MUL a1, c03, c03
+ MUL a1, c04, c04
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c02, 1 * SIZE(BO)
+ ST c03, 2 * SIZE(BO)
+ ST c04, 3 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c03, 2 * SIZE(AO)
+ ST c04, 3 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -4 * SIZE(C1)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c03, 2 * SIZE(C1)
+ ST c04, 3 * SIZE(C1)
+
+#ifndef LN
+ ldi C1, 4 * SIZE(C1)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 2 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 4, KK
+#endif
+
+#ifdef LN
+ subl KK, 4, KK
+#endif
+
+ ldi I, -1(I)
+ bgt I, $L91
+ .align 4
+
+$L100:
+ and M, 2, I
+ ble I, $L110
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c02
+ LD b3, 2 * SIZE(B)
+ fclr c03
+ LD b4, 3 * SIZE(B)
+ fclr c04
+
+ sra KK, 2, L
+ mov B, BO
+ ble L, $L105
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 1, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c02
+ LD b3, 2 * SIZE(BO)
+ fclr c03
+ LD b4, 3 * SIZE(BO)
+ fclr c04
+
+ sra TMP1, 2, L
+ ble L, $L105
+#endif
+ .align 5
+
+$L102:
+ ADD c01, t1, c01
+ ldi L, -1(L)
+ MUL a1, b1, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD c02, t2, c02
+ MUL a2, b1, t2
+ LD a2, 5 * SIZE(AO)
+ LD b1, 4 * SIZE(BO)
+
+ ADD c03, t3, c03
+ ldi BO, 4 * SIZE(BO)
+ MUL a3, b2, t3
+ LD a3, 6 * SIZE(AO)
+
+ ADD c04, t4, c04
+ MUL a4, b2, t4
+ LD a5, 7 * SIZE(AO)
+ LD b2, 1 * SIZE(BO)
+
+ ADD c01, t1, c01
+ MUL a1, b3, t1
+ LD a1, 8 * SIZE(AO)
+ ldi AO, 8 * SIZE(AO)
+
+ ADD c02, t2, c02
+ MUL a2, b3, t2
+ LD b3, 2 * SIZE(BO)
+ LD a2, 1 * SIZE(AO)
+
+ ADD c03, t3, c03
+ LD a4, 3 * SIZE(AO)
+ MUL a3, b4, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD c04, t4, c04
+ MUL a5, b4, t4
+ LD b4, 3 * SIZE(BO)
+ bgt L, $L102
+ .align 4
+
+$L105:
+#if defined(LT) || defined(RN)
+ and KK, 3, L
+#else
+ and TMP1, 3, L
+#endif
+ ble L, $L108
+ .align 4
+
+$L106:
+ ADD c01, t1, c01
+ ldi L, -1(L)
+ MUL a1, b1, t1
+ LD a1, 2 * SIZE(AO)
+
+ ADD c02, t2, c02
+ MUL a2, b1, t2
+ LD a2, 3 * SIZE(AO)
+ LD b1, 1 * SIZE(BO)
+
+ ldi AO, 2 * SIZE(AO)
+ unop
+ ldi BO, 1 * SIZE(BO)
+ bgt L, $L106
+ .align 4
+
+$L108:
+ ADD c01, t1, c01
+ ADD c02, t2, c02
+ ADD c03, t3, c03
+ ADD c04, t4, c04
+
+ ADD c01, c03, c01
+ ADD c02, c04, c02
+
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 2, TMP1
+#else
+ subl KK, 1, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl B, TMP2, BO
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+#endif
+
+#ifdef LN
+ LD a1, 3 * SIZE(AO)
+ LD a2, 2 * SIZE(AO)
+ LD a3, 0 * SIZE(AO)
+
+ MUL a1, c02, c02
+ MUL a2, c02, t1
+ SUB c01, t1, c01
+ MUL a3, c01, c01
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 3 * SIZE(AO)
+
+ MUL a1, c01, c01
+ MUL a2, c01, t1
+ SUB c02, t1, c02
+ MUL a3, c02, c02
+#endif
+
+#if defined(RN) || defined(RT)
+ LD a1, 0 * SIZE(BO)
+
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c02, 1 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -2 * SIZE(C1)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+
+#ifndef LN
+ ldi C1, 2 * SIZE(C1)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 1 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 2, KK
+#endif
+
+#ifdef LN
+ subl KK, 2, KK
+#endif
+ .align 4
+
+$L110:
+ and M, 1, I
+ ble I, $L119
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c02
+ LD b3, 2 * SIZE(B)
+ fclr c03
+ LD b4, 3 * SIZE(B)
+ fclr c04
+
+ sra KK, 2, L
+ mov B, BO
+ unop
+ ble L, $L115
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 0, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c02
+ LD b3, 2 * SIZE(BO)
+ fclr c03
+ LD b4, 3 * SIZE(BO)
+ fclr c04
+
+ sra TMP1, 2, L
+ unop
+ ble L, $L115
+#endif
+ .align 4
+
+$L112:
+ ADD c01, t1, c01
+ MUL a1, b1, t1
+ LD a1, 4 * SIZE(AO)
+ LD b1, 4 * SIZE(BO)
+
+ ADD c02, t2, c02
+ MUL a2, b2, t2
+ LD a2, 5 * SIZE(AO)
+ LD b2, 5 * SIZE(BO)
+
+ ADD c03, t3, c03
+ MUL a3, b3, t3
+ LD a3, 6 * SIZE(AO)
+ LD b3, 6 * SIZE(BO)
+
+ ADD c04, t4, c04
+ MUL a4, b4, t4
+ LD a4, 7 * SIZE(AO)
+ LD b4, 7 * SIZE(BO)
+
+ ldi L, -1(L)
+ ldi AO, 4 * SIZE(AO)
+ ldi BO, 4 * SIZE(BO)
+ bgt L, $L112
+ .align 4
+
+$L115:
+#if defined(LT) || defined(RN)
+ and KK, 3, L
+#else
+ and TMP1, 3, L
+#endif
+ ble L, $L118
+ .align 4
+
+$L116:
+ ADD c01, t1, c01
+ MUL a1, b1, t1
+ LD a1, 1 * SIZE(AO)
+ LD b1, 1 * SIZE(BO)
+
+ ldi L, -1(L)
+ ldi AO, 1 * SIZE(AO)
+ ldi BO, 1 * SIZE(BO)
+ bgt L, $L116
+ .align 4
+
+$L118:
+ ADD c01, t1, c01
+ ADD c02, t2, c02
+ ADD c03, t3, c03
+ ADD c04, t4, c04
+
+ ADD c01, c02, c01
+ ADD c03, c04, c03
+ ADD c01, c03, c01
+
+#if defined(LN) || defined(RT)
+ subl KK, 1, TMP1
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AORIG, TMP2, AO
+ addl B, TMP2, BO
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+
+ SUB a1, c01, c01
+#else
+ LD a1, 0 * SIZE(AO)
+
+ SUB a1, c01, c01
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(AO)
+
+ MUL a1, c01, c01
+#endif
+
+#if defined(RN) || defined(RT)
+ LD a1, 0 * SIZE(BO)
+
+ MUL a1, c01, c01
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -1 * SIZE(C1)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+
+#ifndef LN
+ ldi C1, 1 * SIZE(C1)
+#endif
+
+#ifdef RT
+ SXADDQ K, AORIG, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AO, TMP2, AO
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 1, KK
+#endif
+
+#ifdef LN
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L119:
+#ifdef LN
+ SXADDQ K, B, B
+#endif
+
+#if defined(LT) || defined(RN)
+ mov BO, B
+#endif
+
+#ifdef RN
+ addl KK, 1, KK
+#endif
+
+#ifdef RT
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L40:
+ and N, 2, J
+ ble J, $L80
+
+#ifdef RT
+ sll K, 1 + BASE_SHIFT, TMP1
+ subl B, TMP1, B
+
+ addl LDC, LDC, TMP1
+ subl C, TMP1, C
+#endif
+
+ mov C, C1
+ addl C, LDC, C2
+ fclr t1
+#ifndef RT
+ addl C2, LDC, C
+#endif
+ fclr t2
+
+#ifdef LN
+ addl M, OFFSET, KK
+#endif
+
+#ifdef LT
+ mov OFFSET, KK
+#endif
+
+#if defined(LN) || defined(RT)
+ mov A, AORIG
+#else
+ mov A, AO
+#endif
+
+ sra M, 2, I
+ fclr t3
+ fclr t4
+ ble I, $L60
+ .align 4
+
+$L51:
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c03
+ LD a2, 1 * SIZE(AO)
+ fclr c07
+ LD a3, 2 * SIZE(AO)
+ fclr c04
+ LD a4, 3 * SIZE(AO)
+ fclr c08
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c05
+ LD b3, 2 * SIZE(B)
+ fclr c02
+ LD b4, 3 * SIZE(B)
+ fclr c06
+
+ ldi L, -2(KK)
+
+ ldi BO, 2 * SIZE(B)
+ ldi AO, 4 * SIZE(AO)
+
+ ble KK, $L58
+
+ ble L, $L55
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 2, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 2, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c03
+ LD a2, 1 * SIZE(AO)
+ fclr c07
+ LD a3, 2 * SIZE(AO)
+ fclr c04
+ LD a4, 3 * SIZE(AO)
+ fclr c08
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c05
+ LD b3, 2 * SIZE(BO)
+ fclr c02
+ LD b4, 3 * SIZE(BO)
+ fclr c06
+
+ ldi L, -2(TMP1)
+ ldi BO, 2 * SIZE(BO)
+ ldi AO, 4 * SIZE(AO)
+
+ ble TMP1, $L58
+
+ ble L, $L55
+#endif
+ .align 4
+
+$L52:
+ ADD c05, t1, c05
+ unop
+ MUL a1, b1, t1
+ unop
+
+ ADD c06, t2, c06
+ ldi L, -2(L)
+ MUL a2, b1, t2
+ unop
+
+ ADD c07, t3, c07
+ unop
+ MUL a3, b1, t3
+ unop
+
+ ADD c08, t4, c08
+ unop
+ MUL a4, b1, t4
+ LD b1, 2 * SIZE(BO)
+
+ ADD c01, t1, c01
+ unop
+ MUL a1, b2, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD c02, t2, c02
+ ldi BO, 4 * SIZE(BO)
+ MUL a2, b2, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD c03, t3, c03
+ unop
+ MUL a3, b2, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD c04, t4, c04
+ unop
+ MUL a4, b2, t4
+ LD a5, 3 * SIZE(AO)
+
+ ADD c05, t1, c05
+ unop
+ MUL a1, b3, t1
+ LD b2, -1 * SIZE(BO)
+
+ ADD c06, t2, c06
+ unop
+ MUL a2, b3, t2
+ unop
+
+ ADD c07, t3, c07
+ unop
+ MUL a3, b3, t3
+ ldi AO, 8 * SIZE(AO)
+
+ ADD c08, t4, c08
+ unop
+ MUL a5, b3, t4
+ LD b3, 0 * SIZE(BO)
+
+ ADD c01, t1, c01
+ unop
+ MUL a1, b4, t1
+ LD a1, -4 * SIZE(AO)
+
+ ADD c02, t2, c02
+ unop
+ MUL a2, b4, t2
+ LD a2, -3 * SIZE(AO)
+
+ ADD c03, t3, c03
+ LD a4, -1 * SIZE(AO)
+ MUL a3, b4, t3
+ LD a3, -2 * SIZE(AO)
+
+ ADD c04, t4, c04
+ MUL a5, b4, t4
+ LD b4, 1 * SIZE(BO)
+ bgt L, $L52
+ .align 4
+
+$L55:
+ ADD c05, t1, c05
+ MUL a1, b1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L57
+#else
+ blbs TMP1, $L57
+#endif
+ .align 4
+
+ ADD c06, t2, c06
+ MUL a2, b1, t2
+ ADD c07, t3, c07
+ MUL a3, b1, t3
+
+ ADD c08, t4, c08
+ unop
+ MUL a4, b1, t4
+ LD b1, 0 * SIZE(BO)
+
+ ADD c01, t1, c01
+ unop
+ MUL a1, b2, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD c02, t2, c02
+ unop
+ MUL a2, b2, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD c03, t3, c03
+ unop
+ MUL a3, b2, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD c04, t4, c04
+ MUL a4, b2, t4
+ LD a4, 3 * SIZE(AO)
+ ldi AO, 4 * SIZE(AO)
+
+ ADD c05, t1, c05
+ LD b2, 1 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi BO, 2 * SIZE(BO)
+ .align 4
+
+$L57:
+ ADD c06, t2, c06
+ MUL a2, b1, t2
+ ADD c07, t3, c07
+ MUL a3, b1, t3
+
+ ADD c08, t4, c08
+ MUL a4, b1, t4
+ ADD c01, t1, c01
+ MUL a1, b2, t1
+
+ ADD c02, t2, c02
+ MUL a2, b2, t2
+ ADD c03, t3, c03
+ MUL a3, b2, t3
+
+ ADD c04, t4, c04
+ ldi AO, 4 * SIZE(AO)
+ MUL a4, b2, t4
+ ldi BO, 2 * SIZE(BO)
+
+ ADD c05, t1, c05
+ ADD c06, t2, c06
+ ADD c07, t3, c07
+ ADD c08, t4, c08
+ .align 4
+
+$L58:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 4, TMP1
+#else
+ subl KK, 2, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -4 * SIZE(AO)
+ ldi BO, -2 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ LD b1, 4 * SIZE(BO)
+ LD b2, 5 * SIZE(BO)
+ LD b3, 6 * SIZE(BO)
+ LD b4, 7 * SIZE(BO)
+
+ SUB a1, c01, c01
+ SUB a2, c05, c05
+ SUB a3, c02, c02
+ SUB a4, c06, c06
+
+ SUB b1, c03, c03
+ SUB b2, c07, c07
+ SUB b3, c04, c04
+ SUB b4, c08, c08
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 4 * SIZE(AO)
+ LD b2, 5 * SIZE(AO)
+ LD b3, 6 * SIZE(AO)
+ LD b4, 7 * SIZE(AO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+ SUB a3, c03, c03
+ SUB a4, c04, c04
+
+ SUB b1, c05, c05
+ SUB b2, c06, c06
+ SUB b3, c07, c07
+ SUB b4, c08, c08
+#endif
+
+#ifdef LN
+ LD a1, 15 * SIZE(AO)
+ LD a2, 14 * SIZE(AO)
+ LD a3, 13 * SIZE(AO)
+ LD a4, 12 * SIZE(AO)
+
+ MUL a1, c04, c04
+ MUL a1, c08, c08
+
+ MUL a2, c04, t1
+ MUL a2, c08, t2
+
+ SUB c03, t1, c03
+ SUB c07, t2, c07
+
+ MUL a3, c04, t1
+ MUL a3, c08, t2
+
+ SUB c02, t1, c02
+ SUB c06, t2, c06
+
+ MUL a4, c04, t1
+ MUL a4, c08, t2
+
+ SUB c01, t1, c01
+ SUB c05, t2, c05
+
+ LD b1, 10 * SIZE(AO)
+ LD b2, 9 * SIZE(AO)
+ LD b3, 8 * SIZE(AO)
+
+ MUL b1, c03, c03
+ MUL b1, c07, c07
+
+ MUL b2, c03, t1
+ MUL b2, c07, t2
+
+ SUB c02, t1, c02
+ SUB c06, t2, c06
+
+ MUL b3, c03, t1
+ MUL b3, c07, t2
+
+ SUB c01, t1, c01
+ SUB c05, t2, c05
+
+ LD a1, 5 * SIZE(AO)
+ LD a2, 4 * SIZE(AO)
+ LD a3, 0 * SIZE(AO)
+
+ MUL a1, c02, c02
+ MUL a1, c06, c06
+
+ MUL a2, c02, t1
+ MUL a2, c06, t2
+
+ SUB c01, t1, c01
+ SUB c05, t2, c05
+
+ MUL a3, c01, c01
+ MUL a3, c05, c05
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ MUL a1, c01, c01
+ MUL a1, c05, c05
+
+ MUL a2, c01, t1
+ MUL a2, c05, t2
+
+ SUB c02, t1, c02
+ SUB c06, t2, c06
+
+ MUL a3, c01, t1
+ MUL a3, c05, t2
+
+ SUB c03, t1, c03
+ SUB c07, t2, c07
+
+ MUL a4, c01, t1
+ MUL a4, c05, t2
+
+ SUB c04, t1, c04
+ SUB c08, t2, c08
+
+ LD b1, 5 * SIZE(AO)
+ LD b2, 6 * SIZE(AO)
+ LD b3, 7 * SIZE(AO)
+
+ MUL b1, c02, c02
+ MUL b1, c06, c06
+
+ MUL b2, c02, t1
+ MUL b2, c06, t2
+
+ SUB c03, t1, c03
+ SUB c07, t2, c07
+
+ MUL b3, c02, t1
+ MUL b3, c06, t2
+
+ SUB c04, t1, c04
+ SUB c08, t2, c08
+
+ LD a1, 10 * SIZE(AO)
+ LD a2, 11 * SIZE(AO)
+ LD a3, 15 * SIZE(AO)
+
+ MUL a1, c03, c03
+ MUL a1, c07, c07
+
+ MUL a2, c03, t1
+ MUL a2, c07, t2
+
+ SUB c04, t1, c04
+ SUB c08, t2, c08
+
+ MUL a3, c04, c04
+ MUL a3, c08, c08
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 3 * SIZE(BO)
+
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+ MUL a1, c03, c03
+ MUL a1, c04, c04
+
+ MUL a2, c01, t1
+ MUL a2, c02, t2
+ MUL a2, c03, t3
+ MUL a2, c04, t4
+
+ SUB c05, t1, c05
+ SUB c06, t2, c06
+ SUB c07, t3, c07
+ SUB c08, t4, c08
+
+ MUL a3, c05, c05
+ MUL a3, c06, c06
+ MUL a3, c07, c07
+ MUL a3, c08, c08
+#endif
+
+#ifdef RT
+ LD a1, 3 * SIZE(BO)
+ LD a2, 2 * SIZE(BO)
+ LD a3, 0 * SIZE(BO)
+
+ MUL a1, c05, c05
+ MUL a1, c06, c06
+ MUL a1, c07, c07
+ MUL a1, c08, c08
+
+ MUL a2, c05, t1
+ MUL a2, c06, t2
+ MUL a2, c07, t3
+ MUL a2, c08, t4
+
+ SUB c01, t1, c01
+ SUB c02, t2, c02
+ SUB c03, t3, c03
+ SUB c04, t4, c04
+
+ MUL a3, c01, c01
+ MUL a3, c02, c02
+ MUL a3, c03, c03
+ MUL a3, c04, c04
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c05, 1 * SIZE(BO)
+ ST c02, 2 * SIZE(BO)
+ ST c06, 3 * SIZE(BO)
+
+ ST c03, 4 * SIZE(BO)
+ ST c07, 5 * SIZE(BO)
+ ST c04, 6 * SIZE(BO)
+ ST c08, 7 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c03, 2 * SIZE(AO)
+ ST c04, 3 * SIZE(AO)
+
+ ST c05, 4 * SIZE(AO)
+ ST c06, 5 * SIZE(AO)
+ ST c07, 6 * SIZE(AO)
+ ST c08, 7 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -4 * SIZE(C1)
+ ldi C2, -4 * SIZE(C2)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c03, 2 * SIZE(C1)
+ ST c04, 3 * SIZE(C1)
+
+ ST c05, 0 * SIZE(C2)
+ ST c06, 1 * SIZE(C2)
+ ST c07, 2 * SIZE(C2)
+ ST c08, 3 * SIZE(C2)
+
+#ifndef LN
+ ldi C1, 4 * SIZE(C1)
+ ldi C2, 4 * SIZE(C2)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 2 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 4, KK
+#endif
+
+#ifdef LN
+ subl KK, 4, KK
+#endif
+
+ ldi I, -1(I)
+
+ bgt I, $L51
+ .align 4
+
+$L60:
+ and M, 2, I
+ ble I, $L70
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(B)
+ ldi L, -2(KK)
+ LD b2, 1 * SIZE(B)
+ ldi AO, 2 * SIZE(AO)
+
+ LD b3, 2 * SIZE(B)
+ LD b4, 3 * SIZE(B)
+ ldi BO, 2 * SIZE(B)
+
+ ble KK, $L68
+
+ ble L, $L65
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 1, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(BO)
+ ldi L, -2(TMP1)
+ LD b2, 1 * SIZE(BO)
+ ldi AO, 2 * SIZE(AO)
+
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+ ldi BO, 2 * SIZE(BO)
+
+ ble TMP1, $L68
+
+ ble L, $L65
+#endif
+ .align 4
+
+$L62:
+ ADD c01, t1, c01
+ unop
+ MUL a1, b1, t1
+ unop
+
+ ADD c02, t2, c02
+ ldi AO, 4 * SIZE(AO)
+ MUL a2, b1, t2
+ LD b1, 2 * SIZE(BO)
+
+ ADD c05, t3, c05
+ ldi L, -2(L)
+ MUL a1, b2, t3
+ LD a1, -2 * SIZE(AO)
+
+ ADD c06, t4, c06
+ unop
+ MUL a2, b2, t4
+ LD a2, -1 * SIZE(AO)
+
+ ADD c01, t1, c01
+ LD b2, 3 * SIZE(BO)
+ MUL a3, b3, t1
+ ldi BO, 4 * SIZE(BO)
+
+ ADD c02, t2, c02
+ unop
+ MUL a4, b3, t2
+ LD b3, 0 * SIZE(BO)
+
+ ADD c05, t3, c05
+ unop
+ MUL a3, b4, t3
+ LD a3, 0 * SIZE(AO)
+
+ ADD c06, t4, c06
+ MUL a4, b4, t4
+ LD b4, 1 * SIZE(BO)
+ unop
+
+ LD a4, 1 * SIZE(AO)
+ unop
+ unop
+ bgt L, $L62
+ .align 4
+
+$L65:
+ ADD c01, t1, c01
+ MUL a1, b1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L67
+#else
+ blbs TMP1, $L67
+#endif
+ .align 4
+
+ ADD c02, t2, c02
+ unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD c05, t3, c05
+ ldi BO, 2 * SIZE(BO)
+ MUL a1, b2, t3
+ LD a1, 0 * SIZE(AO)
+
+ ADD c06, t4, c06
+ unop
+ MUL a2, b2, t4
+ LD a2, 1 * SIZE(AO)
+
+ ADD c01, t1, c01
+ LD b2, -1 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi AO, 2 * SIZE(AO)
+ .align 4
+
+$L67:
+ ADD c02, t2, c02
+ MUL a2, b1, t2
+ ADD c05, t3, c05
+ MUL a1, b2, t3
+
+ ADD c06, t4, c06
+ ldi AO, 2 * SIZE(AO)
+ MUL a2, b2, t4
+ ldi BO, 2 * SIZE(BO)
+
+ ADD c01, t1, c01
+ ADD c02, t2, c02
+ ADD c05, t3, c05
+ ADD c06, t4, c06
+ .align 4
+
+$L68:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 2, TMP1
+#else
+ subl KK, 2, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -2 * SIZE(AO)
+ ldi BO, -2 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ SUB a1, c01, c01
+ SUB a2, c05, c05
+ SUB a3, c02, c02
+ SUB a4, c06, c06
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+ SUB a3, c05, c05
+ SUB a4, c06, c06
+#endif
+
+#ifdef LN
+ LD a1, 3 * SIZE(AO)
+ LD a2, 2 * SIZE(AO)
+ LD a3, 0 * SIZE(AO)
+
+ MUL a1, c02, c02
+ MUL a1, c06, c06
+
+ MUL a2, c02, t1
+ MUL a2, c06, t2
+
+ SUB c01, t1, c01
+ SUB c05, t2, c05
+
+ MUL a3, c01, c01
+ MUL a3, c05, c05
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 3 * SIZE(AO)
+
+ MUL a1, c01, c01
+ MUL a1, c05, c05
+
+ MUL a2, c01, t1
+ MUL a2, c05, t2
+
+ SUB c02, t1, c02
+ SUB c06, t2, c06
+
+ MUL a3, c02, c02
+ MUL a3, c06, c06
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 3 * SIZE(BO)
+
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+
+ MUL a2, c01, t1
+ MUL a2, c02, t2
+
+ SUB c05, t1, c05
+ SUB c06, t2, c06
+
+ MUL a3, c05, c05
+ MUL a3, c06, c06
+#endif
+
+#ifdef RT
+ LD a1, 3 * SIZE(BO)
+ LD a2, 2 * SIZE(BO)
+ LD a3, 0 * SIZE(BO)
+
+ MUL a1, c05, c05
+ MUL a1, c06, c06
+
+ MUL a2, c05, t1
+ MUL a2, c06, t2
+
+ SUB c01, t1, c01
+ SUB c02, t2, c02
+
+ MUL a3, c01, c01
+ MUL a3, c02, c02
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c05, 1 * SIZE(BO)
+ ST c02, 2 * SIZE(BO)
+ ST c06, 3 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c05, 2 * SIZE(AO)
+ ST c06, 3 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -2 * SIZE(C1)
+ ldi C2, -2 * SIZE(C2)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c05, 0 * SIZE(C2)
+ ST c06, 1 * SIZE(C2)
+
+#ifndef LN
+ ldi C1, 2 * SIZE(C1)
+ ldi C2, 2 * SIZE(C2)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 1 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 2, KK
+#endif
+
+#ifdef LN
+ subl KK, 2, KK
+#endif
+ .align 4
+
+$L70:
+ and M, 1, I
+ ble I, $L79
+
+#if defined(LT) || defined(RN)
+
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+
+ LD b1, 0 * SIZE(B)
+ fclr c02
+ LD b2, 1 * SIZE(B)
+ fclr c06
+
+ ldi L, -2(KK)
+
+ LD b3, 2 * SIZE(B)
+ ldi AO, 1 * SIZE(AO)
+ LD b4, 3 * SIZE(B)
+ ldi BO, 2 * SIZE(B)
+
+ ble KK, $L78
+
+ ble L, $L75
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 0, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+
+ LD b1, 0 * SIZE(BO)
+ fclr c02
+ LD b2, 1 * SIZE(BO)
+ fclr c06
+
+ ldi L, -2(TMP1)
+
+ LD b3, 2 * SIZE(BO)
+ ldi AO, 1 * SIZE(AO)
+ LD b4, 3 * SIZE(BO)
+ ldi BO, 2 * SIZE(BO)
+
+ ble TMP1, $L78
+
+ ble L, $L75
+#endif
+ .align 4
+
+$L72:
+ ADD c01, t1, c01
+ ldi L, -2(L)
+ MUL a1, b1, t1
+ LD b1, 2 * SIZE(BO)
+
+ ADD c05, t2, c05
+ MUL a1, b2, t2
+ LD a1, 1 * SIZE(AO)
+ LD b2, 3 * SIZE(BO)
+
+ ADD c02, t3, c02
+ ldi AO, 2 * SIZE(AO)
+ MUL a2, b3, t3
+ LD b3, 4 * SIZE(BO)
+
+ ADD c06, t4, c06
+ MUL a2, b4, t4
+ LD a2, 0 * SIZE(AO)
+ LD b4, 5 * SIZE(BO)
+
+ ldi BO, 4 * SIZE(BO)
+ unop
+ unop
+ bgt L, $L72
+ .align 4
+
+$L75:
+ ADD c01, t1, c01
+ MUL a1, b1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L77
+#else
+ blbs TMP1, $L77
+#endif
+ .align 4
+
+ ADD c05, t2, c05
+ MUL a1, b2, t2
+ LD a1, 0 * SIZE(AO)
+ LD b1, 0 * SIZE(BO)
+
+ ADD c01, t1, c01
+ LD b2, 1 * SIZE(BO)
+ ldi AO, 1 * SIZE(AO)
+ MUL a1, b1, t1
+ ldi BO, 2 * SIZE(BO)
+ .align 4
+
+$L77:
+ ADD c05, t2, c05
+ MUL a1, b2, t2
+ ADD c02, t3, c02
+ ADD c06, t4, c06
+
+ ADD c01, c02, c01
+ ldi AO, 1 * SIZE(AO)
+ ADD c05, c06, c05
+ ldi BO, 2 * SIZE(BO)
+
+ ADD c01, t1, c01
+ ADD c05, t2, c05
+
+ .align 4
+
+$L78:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 1, TMP1
+#else
+ subl KK, 2, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -1 * SIZE(AO)
+ ldi BO, -2 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+
+ SUB a1, c01, c01
+ SUB a2, c05, c05
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ SUB a1, c01, c01
+ SUB a2, c05, c05
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(AO)
+
+ MUL a1, c01, c01
+ MUL a1, c05, c05
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 3 * SIZE(BO)
+
+ MUL a1, c01, c01
+ MUL a2, c01, t1
+ SUB c05, t1, c05
+ MUL a3, c05, c05
+#endif
+
+#ifdef RT
+ LD a1, 3 * SIZE(BO)
+ LD a2, 2 * SIZE(BO)
+ LD a3, 0 * SIZE(BO)
+
+ MUL a1, c05, c05
+ MUL a2, c05, t1
+ SUB c01, t1, c01
+ MUL a3, c01, c01
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c05, 1 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c05, 1 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -1 * SIZE(C1)
+ ldi C2, -1 * SIZE(C2)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c05, 0 * SIZE(C2)
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 0 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 1, KK
+#endif
+
+#ifdef LN
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L79:
+#ifdef LN
+ sll K, 1 + BASE_SHIFT, TMP1
+ addl B, TMP1, B
+#endif
+
+#if defined(LT) || defined(RN)
+ mov BO, B
+#endif
+
+#ifdef RN
+ addl KK, 2, KK
+#endif
+
+#ifdef RT
+ subl KK, 2, KK
+#endif
+ .align 4
+
+$L80:
+ sra N, 2, J
+ ble J, $L999
+ .align 4
+
+$L01:
+#ifdef RT
+ sll K, 2 + BASE_SHIFT, TMP1
+ subl B, TMP1, B
+
+ s4addl LDC, 0, TMP1
+ subl C, TMP1, C
+#endif
+
+ mov C, C1
+ addl C, LDC, C2
+ addl C2, LDC, C3
+#ifndef RT
+ s4addl LDC, C, C
+#endif
+
+ fclr t1
+ addl C3, LDC, C4
+ fclr t2
+
+#ifdef LN
+ addl M, OFFSET, KK
+#endif
+
+#ifdef LT
+ mov OFFSET, KK
+#endif
+
+#if defined(LN) || defined(RT)
+ mov A, AORIG
+#else
+ mov A, AO
+#endif
+
+ sra M, 2, I
+ fclr t3
+ fclr t4
+ ble I, $L20
+ .align 4
+
+$L11:
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c11
+ LD a2, 1 * SIZE(AO)
+ fclr c12
+
+ LD a3, 2 * SIZE(AO)
+ fclr c16
+ LD a4, 3 * SIZE(AO)
+ fclr c15
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c02
+
+ LD b3, 2 * SIZE(B)
+ fclr c06
+ LD b4, 3 * SIZE(B)
+ fclr c05
+
+ fillcs 4 * SIZE(C1)
+ fclr c03
+ ldi L, -2(KK)
+ fclr c04
+
+ fillcs 7 * SIZE(C2)
+ fclr c08
+ ldi BO, 4 * SIZE(B)
+ fclr c13
+
+ fillcs 4 * SIZE(C3)
+ fclr c09
+ ldi AO, 4 * SIZE(AO)
+ fclr c10
+
+ fillcs 7 * SIZE(C4)
+ fclr c14
+ fclr c07
+ ble KK, $L18
+#else
+
+#ifdef LN
+ sll K, BASE_SHIFT + 2, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 2, TMP1
+ addl AORIG, TMP1, AO
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c11
+ LD a2, 1 * SIZE(AO)
+ fclr c12
+
+ LD a3, 2 * SIZE(AO)
+ fclr c16
+ LD a4, 3 * SIZE(AO)
+ fclr c15
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c02
+
+ LD b3, 2 * SIZE(BO)
+ fclr c06
+ LD b4, 3 * SIZE(BO)
+ fclr c05
+
+ fillcs 4 * SIZE(C1)
+ fclr c03
+ ldi L, -2(TMP1)
+ fclr c04
+
+ fillcs 7 * SIZE(C2)
+ fclr c08
+ ldi BO, 4 * SIZE(BO)
+ fclr c13
+
+ fillcs 4 * SIZE(C3)
+ fclr c09
+ ldi AO, 4 * SIZE(AO)
+ fclr c10
+
+ fillcs 7 * SIZE(C4)
+ fclr c14
+ fclr c07
+ ble TMP1, $L18
+#endif
+
+ ble L, $L15
+ .align 5
+
+$L12:
+/* 1 */
+ ADD c11, t1, c11
+#ifndef EV4
+ fillcs PREFETCHSIZE * SIZE(AO)
+#else
+ unop
+#endif
+ MUL b1, a1, t1
+#ifndef EV4
+ fillcs PREFETCHSIZE * SIZE(BO)
+#else
+ unop
+#endif
+
+ ADD c12, t2, c12
+ unop
+ MUL b1, a2, t2
+ unop
+
+ ADD c16, t3, c16
+ unop
+ MUL b2, a2, t3
+ LD a5, 0 * SIZE(AO)
+
+ ADD c15, t4, c15
+ unop
+ MUL b2, a1, t4
+ LD b5, 0 * SIZE(BO)
+
+/* 2 */
+ ADD c01, t1, c01
+ UNOP
+ MUL b1, a3, t1
+ UNOP
+
+ ADD c02, t2, c02
+ UNOP
+ MUL b1, a4, t2
+ UNOP
+
+ ADD c06, t3, c06
+ unop
+ MUL b2, a4, t3
+ unop
+
+ ADD c05, t4, c05
+ unop
+ MUL b4, a1, t4
+ unop
+
+/* 3 */
+ ADD c03, t1, c03
+ unop
+ MUL b3, a1, t1
+ unop
+
+ ADD c04, t2, c04
+ unop
+ MUL b3, a2, t2
+ unop
+
+ ADD c08, t3, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, 1 * SIZE(AO)
+
+ ADD c13, t4, c13
+ unop
+ MUL b2, a3, t4
+ LD b2, 1 * SIZE(BO)
+
+/* 4 */
+ ADD c09, t1, c09
+ unop
+ MUL b3, a3, t1
+ LD a6, 2 * SIZE(AO)
+
+ ADD c10, t2, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD c14, t3, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, 3 * SIZE(AO)
+
+ ADD c07, t4, c07
+ unop
+ MUL b4, a3, t4
+ LD b4, 3 * SIZE(BO)
+
+/* 5 */
+ ADD c11, t1, c11
+ unop
+ MUL b5, a5, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD c12, t2, c12
+ ldi L, -2(L)
+ MUL b5, a2, t2
+ LD b1, 4 * SIZE(BO)
+
+ ADD c16, t3, c16
+ unop
+ MUL b2, a2, t3
+ unop
+
+ ADD c15, t4, c15
+ unop
+ MUL b2, a5, t4
+ unop
+
+/* 6 */
+ ADD c01, t1, c01
+ unop
+ MUL b5, a6, t1
+ unop
+
+ ADD c02, t2, c02
+ unop
+ MUL b5, a4, t2
+ unop
+
+ ADD c06, t3, c06
+ unop
+ MUL b2, a4, t3
+ unop
+
+ ADD c05, t4, c05
+ unop
+ MUL b4, a5, t4
+ unop
+
+/* 7 */
+ ADD c03, t1, c03
+ ldi AO, 8 * SIZE(AO)
+ MUL b3, a5, t1
+ unop
+
+ ADD c04, t2, c04
+ ldi BO, 8 * SIZE(BO)
+ MUL b3, a2, t2
+ unop
+
+ ADD c08, t3, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, -3 * SIZE(AO)
+
+ ADD c13, t4, c13
+ unop
+ MUL b2, a6, t4
+ LD b2, -3 * SIZE(BO)
+
+/* 8 */
+ ADD c09, t1, c09
+ unop
+ MUL b3, a6, t1
+ LD a3, -2 * SIZE(AO)
+
+ ADD c10, t2, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, -2 * SIZE(BO)
+
+ ADD c14, t3, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, -1 * SIZE(AO)
+
+ ADD c07, t4, c07
+ MUL b4, a6, t4
+ LD b4, -1 * SIZE(BO)
+ bgt L, $L12
+ .align 4
+
+$L15:
+ ADD c11, t1, c11
+ MUL b1, a1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L17
+#else
+ blbs TMP1, $L17
+#endif
+ .align 4
+
+ ADD c12, t2, c12
+ MUL b1, a2, t2
+ ADD c16, t3, c16
+ MUL b2, a2, t3
+
+ ADD c15, t4, c15
+ MUL b2, a1, t4
+ ADD c01, t1, c01
+ MUL b1, a3, t1
+
+ ADD c02, t2, c02
+ unop
+ MUL b1, a4, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD c06, t3, c06
+ MUL b2, a4, t3
+ ADD c05, t4, c05
+ MUL b4, a1, t4
+
+ ADD c03, t1, c03
+ unop
+ MUL b3, a1, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD c04, t2, c04
+ unop
+ MUL b3, a2, t2
+ unop
+
+ ADD c08, t3, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, 1 * SIZE(AO)
+
+ ADD c13, t4, c13
+ unop
+ MUL b2, a3, t4
+ LD b2, 1 * SIZE(BO)
+
+ ADD c09, t1, c09
+ unop
+ MUL b3, a3, t1
+ ldi AO, 4 * SIZE(AO)
+
+ ADD c10, t2, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD c14, t3, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, -1 * SIZE(AO)
+
+ ADD c07, t4, c07
+ unop
+ MUL b4, a3, t4
+ LD a3, -2 * SIZE(AO)
+
+ ADD c11, t1, c11
+ LD b4, 3 * SIZE(BO)
+ MUL b1, a1, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L17:
+ ADD c12, t2, c12
+ MUL b1, a2, t2
+ ADD c16, t3, c16
+ MUL b2, a2, t3
+
+ ADD c15, t4, c15
+ MUL b2, a1, t4
+ ADD c01, t1, c01
+ MUL b1, a3, t1
+
+ ADD c02, t2, c02
+ MUL b1, a4, t2
+ ADD c06, t3, c06
+ MUL b2, a4, t3
+
+ ADD c05, t4, c05
+ MUL b4, a1, t4
+ ADD c03, t1, c03
+ MUL b3, a1, t1
+
+ ADD c04, t2, c04
+ MUL b3, a2, t2
+ ADD c08, t3, c08
+ MUL b4, a2, t3
+
+ ADD c13, t4, c13
+ MUL b2, a3, t4
+ ADD c09, t1, c09
+ MUL b3, a3, t1
+
+ ADD c10, t2, c10
+ MUL b3, a4, t2
+ ADD c14, t3, c14
+ MUL b4, a4, t3
+
+ ADD c07, t4, c07
+ ldi AO, 4 * SIZE(AO)
+ MUL b4, a3, t4
+ ldi BO, 4 * SIZE(BO)
+
+ ADD c11, t1, c11
+ ADD c12, t2, c12
+ ADD c16, t3, c16
+ ADD c15, t4, c15
+ .align 4
+
+$L18:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 4, TMP1
+#else
+ subl KK, 4, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -4 * SIZE(AO)
+ ldi BO, -4 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ LD b1, 4 * SIZE(BO)
+ LD b2, 5 * SIZE(BO)
+ LD b3, 6 * SIZE(BO)
+ LD b4, 7 * SIZE(BO)
+
+ SUB a1, c01, c01
+ SUB a2, c05, c05
+ SUB a3, c09, c09
+ SUB a4, c13, c13
+
+ SUB b1, c02, c02
+ SUB b2, c06, c06
+ SUB b3, c10, c10
+ SUB b4, c14, c14
+
+ LD a1, 8 * SIZE(BO)
+ LD a2, 9 * SIZE(BO)
+ LD a3, 10 * SIZE(BO)
+ LD a4, 11 * SIZE(BO)
+
+ LD b1, 12 * SIZE(BO)
+ LD b2, 13 * SIZE(BO)
+ LD b3, 14 * SIZE(BO)
+ LD b4, 15 * SIZE(BO)
+
+ SUB a1, c03, c03
+ SUB a2, c07, c07
+ SUB a3, c11, c11
+ SUB a4, c15, c15
+
+ SUB b1, c04, c04
+ SUB b2, c08, c08
+ SUB b3, c12, c12
+ SUB b4, c16, c16
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 4 * SIZE(AO)
+ LD b2, 5 * SIZE(AO)
+ LD b3, 6 * SIZE(AO)
+ LD b4, 7 * SIZE(AO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+ SUB a3, c03, c03
+ SUB a4, c04, c04
+
+ SUB b1, c05, c05
+ SUB b2, c06, c06
+ SUB b3, c07, c07
+ SUB b4, c08, c08
+
+ LD a1, 8 * SIZE(AO)
+ LD a2, 9 * SIZE(AO)
+ LD a3, 10 * SIZE(AO)
+ LD a4, 11 * SIZE(AO)
+
+ LD b1, 12 * SIZE(AO)
+ LD b2, 13 * SIZE(AO)
+ LD b3, 14 * SIZE(AO)
+ LD b4, 15 * SIZE(AO)
+
+ SUB a1, c09, c09
+ SUB a2, c10, c10
+ SUB a3, c11, c11
+ SUB a4, c12, c12
+
+ SUB b1, c13, c13
+ SUB b2, c14, c14
+ SUB b3, c15, c15
+ SUB b4, c16, c16
+#endif
+
+#ifdef LN
+ LD a1, 15 * SIZE(AO)
+ LD a2, 14 * SIZE(AO)
+ LD a3, 13 * SIZE(AO)
+ LD a4, 12 * SIZE(AO)
+
+ MUL a1, c04, c04
+ MUL a1, c08, c08
+ MUL a1, c12, c12
+ MUL a1, c16, c16
+
+ MUL a2, c04, t1
+ MUL a2, c08, t2
+ MUL a2, c12, t3
+ MUL a2, c16, t4
+
+ SUB c03, t1, c03
+ SUB c07, t2, c07
+ SUB c11, t3, c11
+ SUB c15, t4, c15
+
+ MUL a3, c04, t1
+ MUL a3, c08, t2
+ MUL a3, c12, t3
+ MUL a3, c16, t4
+
+ SUB c02, t1, c02
+ SUB c06, t2, c06
+ SUB c10, t3, c10
+ SUB c14, t4, c14
+
+ MUL a4, c04, t1
+ MUL a4, c08, t2
+ MUL a4, c12, t3
+ MUL a4, c16, t4
+
+ SUB c01, t1, c01
+ SUB c05, t2, c05
+ SUB c09, t3, c09
+ SUB c13, t4, c13
+
+ LD b1, 10 * SIZE(AO)
+ LD b2, 9 * SIZE(AO)
+ LD b3, 8 * SIZE(AO)
+
+ MUL b1, c03, c03
+ MUL b1, c07, c07
+ MUL b1, c11, c11
+ MUL b1, c15, c15
+
+ MUL b2, c03, t1
+ MUL b2, c07, t2
+ MUL b2, c11, t3
+ MUL b2, c15, t4
+
+ SUB c02, t1, c02
+ SUB c06, t2, c06
+ SUB c10, t3, c10
+ SUB c14, t4, c14
+
+ MUL b3, c03, t1
+ MUL b3, c07, t2
+ MUL b3, c11, t3
+ MUL b3, c15, t4
+
+ SUB c01, t1, c01
+ SUB c05, t2, c05
+ SUB c09, t3, c09
+ SUB c13, t4, c13
+
+ LD a1, 5 * SIZE(AO)
+ LD a2, 4 * SIZE(AO)
+ LD a3, 0 * SIZE(AO)
+
+ MUL a1, c02, c02
+ MUL a1, c06, c06
+ MUL a1, c10, c10
+ MUL a1, c14, c14
+
+ MUL a2, c02, t1
+ MUL a2, c06, t2
+ MUL a2, c10, t3
+ MUL a2, c14, t4
+
+ SUB c01, t1, c01
+ SUB c05, t2, c05
+ SUB c09, t3, c09
+ SUB c13, t4, c13
+
+ MUL a3, c01, c01
+ MUL a3, c05, c05
+ MUL a3, c09, c09
+ MUL a3, c13, c13
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ MUL a1, c01, c01
+ MUL a1, c05, c05
+ MUL a1, c09, c09
+ MUL a1, c13, c13
+
+ MUL a2, c01, t1
+ MUL a2, c05, t2
+ MUL a2, c09, t3
+ MUL a2, c13, t4
+
+ SUB c02, t1, c02
+ SUB c06, t2, c06
+ SUB c10, t3, c10
+ SUB c14, t4, c14
+
+ MUL a3, c01, t1
+ MUL a3, c05, t2
+ MUL a3, c09, t3
+ MUL a3, c13, t4
+
+ SUB c03, t1, c03
+ SUB c07, t2, c07
+ SUB c11, t3, c11
+ SUB c15, t4, c15
+
+ MUL a4, c01, t1
+ MUL a4, c05, t2
+ MUL a4, c09, t3
+ MUL a4, c13, t4
+
+ SUB c04, t1, c04
+ SUB c08, t2, c08
+ SUB c12, t3, c12
+ SUB c16, t4, c16
+
+ LD b1, 5 * SIZE(AO)
+ LD b2, 6 * SIZE(AO)
+ LD b3, 7 * SIZE(AO)
+
+ MUL b1, c02, c02
+ MUL b1, c06, c06
+ MUL b1, c10, c10
+ MUL b1, c14, c14
+
+ MUL b2, c02, t1
+ MUL b2, c06, t2
+ MUL b2, c10, t3
+ MUL b2, c14, t4
+
+ SUB c03, t1, c03
+ SUB c07, t2, c07
+ SUB c11, t3, c11
+ SUB c15, t4, c15
+
+ MUL b3, c02, t1
+ MUL b3, c06, t2
+ MUL b3, c10, t3
+ MUL b3, c14, t4
+
+ SUB c04, t1, c04
+ SUB c08, t2, c08
+ SUB c12, t3, c12
+ SUB c16, t4, c16
+
+ LD a1, 10 * SIZE(AO)
+ LD a2, 11 * SIZE(AO)
+ LD a3, 15 * SIZE(AO)
+
+ MUL a1, c03, c03
+ MUL a1, c07, c07
+ MUL a1, c11, c11
+ MUL a1, c15, c15
+
+ MUL a2, c03, t1
+ MUL a2, c07, t2
+ MUL a2, c11, t3
+ MUL a2, c15, t4
+
+ SUB c04, t1, c04
+ SUB c08, t2, c08
+ SUB c12, t3, c12
+ SUB c16, t4, c16
+
+ MUL a3, c04, c04
+ MUL a3, c08, c08
+ MUL a3, c12, c12
+ MUL a3, c16, c16
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+ MUL a1, c03, c03
+ MUL a1, c04, c04
+
+ MUL a2, c01, t1
+ MUL a2, c02, t2
+ MUL a2, c03, t3
+ MUL a2, c04, t4
+
+ SUB c05, t1, c05
+ SUB c06, t2, c06
+ SUB c07, t3, c07
+ SUB c08, t4, c08
+
+ MUL a3, c01, t1
+ MUL a3, c02, t2
+ MUL a3, c03, t3
+ MUL a3, c04, t4
+
+ SUB c09, t1, c09
+ SUB c10, t2, c10
+ SUB c11, t3, c11
+ SUB c12, t4, c12
+
+ MUL a4, c01, t1
+ MUL a4, c02, t2
+ MUL a4, c03, t3
+ MUL a4, c04, t4
+
+ SUB c13, t1, c13
+ SUB c14, t2, c14
+ SUB c15, t3, c15
+ SUB c16, t4, c16
+
+ LD b1, 5 * SIZE(BO)
+ LD b2, 6 * SIZE(BO)
+ LD b3, 7 * SIZE(BO)
+
+ MUL b1, c05, c05
+ MUL b1, c06, c06
+ MUL b1, c07, c07
+ MUL b1, c08, c08
+
+ MUL b2, c05, t1
+ MUL b2, c06, t2
+ MUL b2, c07, t3
+ MUL b2, c08, t4
+
+ SUB c09, t1, c09
+ SUB c10, t2, c10
+ SUB c11, t3, c11
+ SUB c12, t4, c12
+
+ MUL b3, c05, t1
+ MUL b3, c06, t2
+ MUL b3, c07, t3
+ MUL b3, c08, t4
+
+ SUB c13, t1, c13
+ SUB c14, t2, c14
+ SUB c15, t3, c15
+ SUB c16, t4, c16
+
+ LD a1, 10 * SIZE(BO)
+ LD a2, 11 * SIZE(BO)
+ LD a3, 15 * SIZE(BO)
+
+ MUL a1, c09, c09
+ MUL a1, c10, c10
+ MUL a1, c11, c11
+ MUL a1, c12, c12
+
+ MUL a2, c09, t1
+ MUL a2, c10, t2
+ MUL a2, c11, t3
+ MUL a2, c12, t4
+
+ SUB c13, t1, c13
+ SUB c14, t2, c14
+ SUB c15, t3, c15
+ SUB c16, t4, c16
+
+ MUL a3, c13, c13
+ MUL a3, c14, c14
+ MUL a3, c15, c15
+ MUL a3, c16, c16
+#endif
+
+#ifdef RT
+ LD a1, 15 * SIZE(BO)
+ LD a2, 14 * SIZE(BO)
+ LD a3, 13 * SIZE(BO)
+ LD a4, 12 * SIZE(BO)
+
+ MUL a1, c13, c13
+ MUL a1, c14, c14
+ MUL a1, c15, c15
+ MUL a1, c16, c16
+
+ MUL a2, c13, t1
+ MUL a2, c14, t2
+ MUL a2, c15, t3
+ MUL a2, c16, t4
+
+ SUB c09, t1, c09
+ SUB c10, t2, c10
+ SUB c11, t3, c11
+ SUB c12, t4, c12
+
+ MUL a3, c13, t1
+ MUL a3, c14, t2
+ MUL a3, c15, t3
+ MUL a3, c16, t4
+
+ SUB c05, t1, c05
+ SUB c06, t2, c06
+ SUB c07, t3, c07
+ SUB c08, t4, c08
+
+ MUL a4, c13, t1
+ MUL a4, c14, t2
+ MUL a4, c15, t3
+ MUL a4, c16, t4
+
+ SUB c01, t1, c01
+ SUB c02, t2, c02
+ SUB c03, t3, c03
+ SUB c04, t4, c04
+
+ LD b1, 10 * SIZE(BO)
+ LD b2, 9 * SIZE(BO)
+ LD b3, 8 * SIZE(BO)
+
+ MUL b1, c09, c09
+ MUL b1, c10, c10
+ MUL b1, c11, c11
+ MUL b1, c12, c12
+
+ MUL b2, c09, t1
+ MUL b2, c10, t2
+ MUL b2, c11, t3
+ MUL b2, c12, t4
+
+ SUB c05, t1, c05
+ SUB c06, t2, c06
+ SUB c07, t3, c07
+ SUB c08, t4, c08
+
+ MUL b3, c09, t1
+ MUL b3, c10, t2
+ MUL b3, c11, t3
+ MUL b3, c12, t4
+
+ SUB c01, t1, c01
+ SUB c02, t2, c02
+ SUB c03, t3, c03
+ SUB c04, t4, c04
+
+ LD a1, 5 * SIZE(BO)
+ LD a2, 4 * SIZE(BO)
+ LD a3, 0 * SIZE(BO)
+
+ MUL a1, c05, c05
+ MUL a1, c06, c06
+ MUL a1, c07, c07
+ MUL a1, c08, c08
+
+ MUL a2, c05, t1
+ MUL a2, c06, t2
+ MUL a2, c07, t3
+ MUL a2, c08, t4
+
+ SUB c01, t1, c01
+ SUB c02, t2, c02
+ SUB c03, t3, c03
+ SUB c04, t4, c04
+
+ MUL a3, c01, c01
+ MUL a3, c02, c02
+ MUL a3, c03, c03
+ MUL a3, c04, c04
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c05, 1 * SIZE(BO)
+ ST c09, 2 * SIZE(BO)
+ ST c13, 3 * SIZE(BO)
+
+ ST c02, 4 * SIZE(BO)
+ ST c06, 5 * SIZE(BO)
+ ST c10, 6 * SIZE(BO)
+ ST c14, 7 * SIZE(BO)
+
+ ST c03, 8 * SIZE(BO)
+ ST c07, 9 * SIZE(BO)
+ ST c11, 10 * SIZE(BO)
+ ST c15, 11 * SIZE(BO)
+
+ ST c04, 12 * SIZE(BO)
+ ST c08, 13 * SIZE(BO)
+ ST c12, 14 * SIZE(BO)
+ ST c16, 15 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c03, 2 * SIZE(AO)
+ ST c04, 3 * SIZE(AO)
+
+ ST c05, 4 * SIZE(AO)
+ ST c06, 5 * SIZE(AO)
+ ST c07, 6 * SIZE(AO)
+ ST c08, 7 * SIZE(AO)
+
+ ST c09, 8 * SIZE(AO)
+ ST c10, 9 * SIZE(AO)
+ ST c11, 10 * SIZE(AO)
+ ST c12, 11 * SIZE(AO)
+
+ ST c13, 12 * SIZE(AO)
+ ST c14, 13 * SIZE(AO)
+ ST c15, 14 * SIZE(AO)
+ ST c16, 15 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -4 * SIZE(C1)
+ ldi C2, -4 * SIZE(C2)
+ ldi C3, -4 * SIZE(C3)
+ ldi C4, -4 * SIZE(C4)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c03, 2 * SIZE(C1)
+ ST c04, 3 * SIZE(C1)
+
+ ST c05, 0 * SIZE(C2)
+ ST c06, 1 * SIZE(C2)
+ ST c07, 2 * SIZE(C2)
+ ST c08, 3 * SIZE(C2)
+
+ ST c09, 0 * SIZE(C3)
+ ST c10, 1 * SIZE(C3)
+ ST c11, 2 * SIZE(C3)
+ ST c12, 3 * SIZE(C3)
+
+ ST c13, 0 * SIZE(C4)
+ ST c14, 1 * SIZE(C4)
+ ST c15, 2 * SIZE(C4)
+ ST c16, 3 * SIZE(C4)
+
+#ifndef LN
+ ldi C1, 4 * SIZE(C1)
+ ldi C2, 4 * SIZE(C2)
+ ldi C3, 4 * SIZE(C3)
+ ldi C4, 4 * SIZE(C4)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 2 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 2, TMP1
+ addl AO, TMP1, AO
+ addl BO, TMP1, BO
+#endif
+
+#ifdef LT
+ addl KK, 4, KK
+#endif
+
+#ifdef LN
+ subl KK, 4, KK
+#endif
+
+ ldi I, -1(I)
+
+ bgt I, $L11
+ .align 4
+
+$L20:
+ and M, 2, I
+ ble I, $L30
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c10
+ LD a4, 3 * SIZE(AO)
+ fclr c14
+
+ LD b1, 0 * SIZE(B)
+ ldi L, -2(KK)
+ LD b2, 1 * SIZE(B)
+ ldi AO, 2 * SIZE(AO)
+
+ LD b3, 2 * SIZE(B)
+ fclr c01
+ LD b4, 3 * SIZE(B)
+ fclr c05
+
+ ldi BO, 4 * SIZE(B)
+ fclr c02
+ fclr c06
+ ble KK, $L28
+
+ ble L, $L25
+
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 1, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 2, TMP2
+ addl B, TMP2, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c10
+ LD a4, 3 * SIZE(AO)
+ fclr c14
+
+ LD b1, 0 * SIZE(BO)
+ ldi L, -2(TMP1)
+ LD b2, 1 * SIZE(BO)
+ ldi AO, 2 * SIZE(AO)
+
+ LD b3, 2 * SIZE(BO)
+ fclr c01
+ LD b4, 3 * SIZE(BO)
+ fclr c05
+
+ ldi BO, 4 * SIZE(BO)
+ fclr c02
+ fclr c06
+ ble TMP1, $L28
+
+ ble L, $L25
+#endif
+ .align 4
+
+$L22:
+ ADD c09, t1, c09
+ unop
+ MUL a1, b1, t1
+ unop
+
+ ADD c10, t2, c10
+ unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD c13, t3, c13
+ unop
+ MUL a1, b2, t3
+ ldi BO, 8 * SIZE(BO)
+
+ ADD c14, t4, c14
+ unop
+ MUL a2, b2, t4
+ LD b2, -7 * SIZE(BO)
+
+ ADD c01, t1, c01
+ unop
+ MUL a1, b3, t1
+ unop
+
+ ADD c02, t2, c02
+ unop
+ MUL a2, b3, t2
+ LD b3, -6 * SIZE(BO)
+
+ ADD c05, t3, c05
+ unop
+ MUL a1, b4, t3
+ LD a1, 2 * SIZE(AO)
+
+ ADD c06, t4, c06
+ MUL a2, b4, t4
+ LD b5, -5 * SIZE(BO)
+
+ ADD c09, t1, c09
+ unop
+ MUL a3, b1, t1
+ LD a2, 3 * SIZE(AO)
+
+ ADD c10, t2, c10
+ unop
+ MUL a4, b1, t2
+ LD b1, -4 * SIZE(BO)
+
+ ADD c13, t3, c13
+ unop
+ MUL a3, b2, t3
+ ldi AO, 4 * SIZE(AO)
+
+ ADD c14, t4, c14
+ MUL a4, b2, t4
+ LD b2, -3 * SIZE(BO)
+
+ ADD c01, t1, c01
+ ldi L, -2(L)
+ MUL a3, b3, t1
+ LD b4, -1 * SIZE(BO)
+
+ ADD c02, t2, c02
+ unop
+ MUL a4, b3, t2
+ LD b3, -2 * SIZE(BO)
+
+ ADD c05, t3, c05
+ unop
+ MUL a3, b5, t3
+ LD a3, 0 * SIZE(AO)
+
+ ADD c06, t4, c06
+ MUL a4, b5, t4
+ LD a4, 1 * SIZE(AO)
+ bgt L, $L22
+ .align 4
+
+$L25:
+ ADD c09, t1, c09
+ MUL a1, b1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L27
+#else
+ blbs TMP1, $L27
+#endif
+
+ ADD c10, t2, c10
+ unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD c13, t3, c13
+ unop
+ MUL a1, b2, t3
+ unop
+
+ ADD c14, t4, c14
+ unop
+ MUL a2, b2, t4
+ LD b2, 1 * SIZE(BO)
+
+ ADD c01, t1, c01
+ unop
+ MUL a1, b3, t1
+ ldi AO, 2 * SIZE(AO)
+
+ ADD c02, t2, c02
+ unop
+ MUL a2, b3, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD c05, t3, c05
+ unop
+ MUL a1, b4, t3
+ LD a1, -2 * SIZE(AO)
+
+ ADD c06, t4, c06
+ unop
+ MUL a2, b4, t4
+ LD a2, -1 * SIZE(AO)
+
+ ADD c09, t1, c09
+ LD b4, 3 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L27:
+ ADD c10, t2, c10
+ MUL a2, b1, t2
+ ADD c13, t3, c13
+ MUL a1, b2, t3
+
+ ADD c14, t4, c14
+ MUL a2, b2, t4
+ ADD c01, t1, c01
+ MUL a1, b3, t1
+
+ ADD c02, t2, c02
+ MUL a2, b3, t2
+ ADD c05, t3, c05
+ MUL a1, b4, t3
+
+ ADD c06, t4, c06
+ ldi AO, 2 * SIZE(AO)
+ MUL a2, b4, t4
+ ldi BO, 4 * SIZE(BO)
+
+ ADD c09, t1, c09
+ ADD c10, t2, c10
+ ADD c13, t3, c13
+ ADD c14, t4, c14
+ .align 4
+
+$L28:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 2, TMP1
+#else
+ subl KK, 4, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -2 * SIZE(AO)
+ ldi BO, -4 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ LD b1, 4 * SIZE(BO)
+ LD b2, 5 * SIZE(BO)
+ LD b3, 6 * SIZE(BO)
+ LD b4, 7 * SIZE(BO)
+
+ SUB a1, c01, c01
+ SUB a2, c05, c05
+ SUB a3, c09, c09
+ SUB a4, c13, c13
+
+ SUB b1, c02, c02
+ SUB b2, c06, c06
+ SUB b3, c10, c10
+ SUB b4, c14, c14
+
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 4 * SIZE(AO)
+ LD b2, 5 * SIZE(AO)
+ LD b3, 6 * SIZE(AO)
+ LD b4, 7 * SIZE(AO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+ SUB a3, c05, c05
+ SUB a4, c06, c06
+
+ SUB b1, c09, c09
+ SUB b2, c10, c10
+ SUB b3, c13, c13
+ SUB b4, c14, c14
+#endif
+
+#ifdef LN
+ LD a1, 3 * SIZE(AO)
+ LD a2, 2 * SIZE(AO)
+ LD a3, 0 * SIZE(AO)
+
+ MUL a1, c02, c02
+ MUL a1, c06, c06
+ MUL a1, c10, c10
+ MUL a1, c14, c14
+
+ MUL a2, c02, t1
+ MUL a2, c06, t2
+ MUL a2, c10, t3
+ MUL a2, c14, t4
+
+ SUB c01, t1, c01
+ SUB c05, t2, c05
+ SUB c09, t3, c09
+ SUB c13, t4, c13
+
+ MUL a3, c01, c01
+ MUL a3, c05, c05
+ MUL a3, c09, c09
+ MUL a3, c13, c13
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 3 * SIZE(AO)
+
+ MUL a1, c01, c01
+ MUL a1, c05, c05
+ MUL a1, c09, c09
+ MUL a1, c13, c13
+
+ MUL a2, c01, t1
+ MUL a2, c05, t2
+ MUL a2, c09, t3
+ MUL a2, c13, t4
+
+ SUB c02, t1, c02
+ SUB c06, t2, c06
+ SUB c10, t3, c10
+ SUB c14, t4, c14
+
+ MUL a3, c02, c02
+ MUL a3, c06, c06
+ MUL a3, c10, c10
+ MUL a3, c14, c14
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+
+ MUL a2, c01, t1
+ MUL a2, c02, t2
+
+ SUB c05, t1, c05
+ SUB c06, t2, c06
+
+ MUL a3, c01, t1
+ MUL a3, c02, t2
+
+ SUB c09, t1, c09
+ SUB c10, t2, c10
+
+ MUL a4, c01, t1
+ MUL a4, c02, t2
+
+ SUB c13, t1, c13
+ SUB c14, t2, c14
+
+ LD b1, 5 * SIZE(BO)
+ LD b2, 6 * SIZE(BO)
+ LD b3, 7 * SIZE(BO)
+
+ MUL b1, c05, c05
+ MUL b1, c06, c06
+
+ MUL b2, c05, t1
+ MUL b2, c06, t2
+
+ SUB c09, t1, c09
+ SUB c10, t2, c10
+
+ MUL b3, c05, t1
+ MUL b3, c06, t2
+
+ SUB c13, t1, c13
+ SUB c14, t2, c14
+
+ LD a1, 10 * SIZE(BO)
+ LD a2, 11 * SIZE(BO)
+ LD a3, 15 * SIZE(BO)
+
+ MUL a1, c09, c09
+ MUL a1, c10, c10
+
+ MUL a2, c09, t1
+ MUL a2, c10, t2
+
+ SUB c13, t1, c13
+ SUB c14, t2, c14
+
+ MUL a3, c13, c13
+ MUL a3, c14, c14
+#endif
+
+#ifdef RT
+ LD a1, 15 * SIZE(BO)
+ LD a2, 14 * SIZE(BO)
+ LD a3, 13 * SIZE(BO)
+ LD a4, 12 * SIZE(BO)
+
+ MUL a1, c13, c13
+ MUL a1, c14, c14
+
+ MUL a2, c13, t1
+ MUL a2, c14, t2
+
+ SUB c09, t1, c09
+ SUB c10, t2, c10
+
+ MUL a3, c13, t1
+ MUL a3, c14, t2
+
+ SUB c05, t1, c05
+ SUB c06, t2, c06
+
+ MUL a4, c13, t1
+ MUL a4, c14, t2
+
+ SUB c01, t1, c01
+ SUB c02, t2, c02
+
+ LD b1, 10 * SIZE(BO)
+ LD b2, 9 * SIZE(BO)
+ LD b3, 8 * SIZE(BO)
+
+ MUL b1, c09, c09
+ MUL b1, c10, c10
+
+ MUL b2, c09, t1
+ MUL b2, c10, t2
+
+ SUB c05, t1, c05
+ SUB c06, t2, c06
+
+ MUL b3, c09, t1
+ MUL b3, c10, t2
+
+ SUB c01, t1, c01
+ SUB c02, t2, c02
+
+ LD a1, 5 * SIZE(BO)
+ LD a2, 4 * SIZE(BO)
+ LD a3, 0 * SIZE(BO)
+
+ MUL a1, c05, c05
+ MUL a1, c06, c06
+
+ MUL a2, c05, t1
+ MUL a2, c06, t2
+
+ SUB c01, t1, c01
+ SUB c02, t2, c02
+
+ MUL a3, c01, c01
+ MUL a3, c02, c02
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c05, 1 * SIZE(BO)
+ ST c09, 2 * SIZE(BO)
+ ST c13, 3 * SIZE(BO)
+
+ ST c02, 4 * SIZE(BO)
+ ST c06, 5 * SIZE(BO)
+ ST c10, 6 * SIZE(BO)
+ ST c14, 7 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c05, 2 * SIZE(AO)
+ ST c06, 3 * SIZE(AO)
+
+ ST c09, 4 * SIZE(AO)
+ ST c10, 5 * SIZE(AO)
+ ST c13, 6 * SIZE(AO)
+ ST c14, 7 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -2 * SIZE(C1)
+ ldi C2, -2 * SIZE(C2)
+ ldi C3, -2 * SIZE(C3)
+ ldi C4, -2 * SIZE(C4)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c05, 0 * SIZE(C2)
+ ST c06, 1 * SIZE(C2)
+
+ ST c09, 0 * SIZE(C3)
+ ST c10, 1 * SIZE(C3)
+ ST c13, 0 * SIZE(C4)
+ ST c14, 1 * SIZE(C4)
+
+#ifndef LN
+ ldi C1, 2 * SIZE(C1)
+ ldi C2, 2 * SIZE(C2)
+ ldi C3, 2 * SIZE(C3)
+ ldi C4, 2 * SIZE(C4)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, 1 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 1, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 2, KK
+#endif
+
+#ifdef LN
+ subl KK, 2, KK
+#endif
+ .align 4
+
+$L30:
+ and M, 1, I
+ ble I, $L39
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+
+ LD b1, 0 * SIZE(B)
+ ldi L, -2(KK)
+ LD b2, 1 * SIZE(B)
+ ldi AO, 1 * SIZE(AO)
+
+ LD b3, 2 * SIZE(B)
+ fclr c09
+ LD b4, 3 * SIZE(B)
+ fclr c13
+
+ ldi BO, 4 * SIZE(B)
+ ble KK, $L38
+
+ ble L, $L35
+#else
+#ifdef LN
+ sll K, BASE_SHIFT + 0, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, BASE_SHIFT + 0, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, BASE_SHIFT + 2, TMP2
+ addl B, TMP2, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c01
+ LD a2, 1 * SIZE(AO)
+ fclr c05
+
+ LD b1, 0 * SIZE(BO)
+ ldi L, -2(TMP1)
+ LD b2, 1 * SIZE(BO)
+ ldi AO, 1 * SIZE(AO)
+
+ LD b3, 2 * SIZE(BO)
+ fclr c09
+ LD b4, 3 * SIZE(BO)
+ fclr c13
+
+ ldi BO, 4 * SIZE(BO)
+ ble TMP1, $L38
+
+ ble L, $L35
+#endif
+ .align 4
+
+$L32:
+ ADD c01, t1, c01
+ ldi L, -2(L)
+ MUL a1, b1, t1
+ LD b1, 0 * SIZE(BO)
+
+ ADD c05, t2, c05
+ ldi AO, 2 * SIZE(AO)
+ MUL a1, b2, t2
+ LD b2, 1 * SIZE(BO)
+
+ ADD c09, t3, c09
+ LD b5, 3 * SIZE(BO)
+ MUL a1, b3, t3
+ LD b3, 2 * SIZE(BO)
+
+ ADD c13, t4, c13
+ MUL a1, b4, t4
+ LD a1, -1 * SIZE(AO)
+
+ ADD c01, t1, c01
+ MUL a2, b1, t1
+ LD b1, 4 * SIZE(BO)
+ ldi BO, 8 * SIZE(BO)
+
+ ADD c05, t2, c05
+ MUL a2, b2, t2
+ LD b2, -3 * SIZE(BO)
+
+ ADD c09, t3, c09
+ LD b4, -1 * SIZE(BO)
+ MUL a2, b3, t3
+ LD b3, -2 * SIZE(BO)
+
+ ADD c13, t4, c13
+ MUL a2, b5, t4
+ LD a2, 0 * SIZE(AO)
+ bgt L, $L32
+ .align 4
+
+$L35:
+ ADD c01, t1, c01
+ MUL a1, b1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L37
+#else
+ blbs TMP1, $L37
+#endif
+ .align 4
+
+ ADD c05, t2, c05
+ LD b1, 0 * SIZE(BO)
+ MUL a1, b2, t2
+ LD b2, 1 * SIZE(BO)
+
+ ADD c09, t3, c09
+ MUL a1, b3, t3
+ LD b3, 2 * SIZE(BO)
+
+ ADD c13, t4, c13
+ MUL a1, b4, t4
+ LD a1, 0 * SIZE(AO)
+ ldi AO, 1 * SIZE(AO)
+
+ ADD c01, t1, c01
+ LD b4, 3 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L37:
+ ADD c05, t2, c05
+ MUL a1, b2, t2
+ ADD c09, t3, c09
+ MUL a1, b3, t3
+
+ ADD c13, t4, c13
+ ldi AO, 1 * SIZE(AO)
+ MUL a1, b4, t4
+ ldi BO, 4 * SIZE(BO)
+
+ ADD c01, t1, c01
+ ADD c05, t2, c05
+ ADD c09, t3, c09
+ ADD c13, t4, c13
+
+$L38:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 1, TMP1
+#else
+ subl KK, 4, TMP1
+#endif
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -1 * SIZE(AO)
+ ldi BO, -4 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ SUB a1, c01, c01
+ SUB a2, c05, c05
+ SUB a3, c09, c09
+ SUB a4, c13, c13
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ SUB a1, c01, c01
+ SUB a2, c05, c05
+ SUB a3, c09, c09
+ SUB a4, c13, c13
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(AO)
+
+ MUL a1, c01, c01
+ MUL a1, c05, c05
+ MUL a1, c09, c09
+ MUL a1, c13, c13
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ MUL a1, c01, c01
+ MUL a2, c01, t1
+ SUB c05, t1, c05
+ MUL a3, c01, t1
+ SUB c09, t1, c09
+ MUL a4, c01, t1
+ SUB c13, t1, c13
+
+ LD b1, 5 * SIZE(BO)
+ LD b2, 6 * SIZE(BO)
+ LD b3, 7 * SIZE(BO)
+
+ MUL b1, c05, c05
+ MUL b2, c05, t1
+ SUB c09, t1, c09
+ MUL b3, c05, t1
+ SUB c13, t1, c13
+
+ LD a1, 10 * SIZE(BO)
+ LD a2, 11 * SIZE(BO)
+ LD a3, 15 * SIZE(BO)
+
+ MUL a1, c09, c09
+ MUL a2, c09, t1
+ SUB c13, t1, c13
+ MUL a3, c13, c13
+#endif
+
+#ifdef RT
+ LD a1, 15 * SIZE(BO)
+ LD a2, 14 * SIZE(BO)
+ LD a3, 13 * SIZE(BO)
+ LD a4, 12 * SIZE(BO)
+
+ MUL a1, c13, c13
+ MUL a2, c13, t1
+ SUB c09, t1, c09
+ MUL a3, c13, t1
+ SUB c05, t1, c05
+ MUL a4, c13, t1
+ SUB c01, t1, c01
+
+ LD b1, 10 * SIZE(BO)
+ LD b2, 9 * SIZE(BO)
+ LD b3, 8 * SIZE(BO)
+
+ MUL b1, c09, c09
+ MUL b2, c09, t1
+ SUB c05, t1, c05
+ MUL b3, c09, t1
+ SUB c01, t1, c01
+
+ LD a1, 5 * SIZE(BO)
+ LD a2, 4 * SIZE(BO)
+ LD a3, 0 * SIZE(BO)
+
+ MUL a1, c05, c05
+ MUL a2, c05, t1
+ SUB c01, t1, c01
+ MUL a3, c01, c01
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c05, 1 * SIZE(BO)
+ ST c09, 2 * SIZE(BO)
+ ST c13, 3 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c05, 1 * SIZE(AO)
+ ST c09, 2 * SIZE(AO)
+ ST c13, 3 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -1 * SIZE(C1)
+ ldi C2, -1 * SIZE(C2)
+ ldi C3, -1 * SIZE(C3)
+ ldi C4, -1 * SIZE(C4)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c05, 0 * SIZE(C2)
+ ST c09, 0 * SIZE(C3)
+ ST c13, 0 * SIZE(C4)
+
+#ifdef RT
+ sll K, 0 + BASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, BASE_SHIFT + 0, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, BASE_SHIFT + 2, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 1, KK
+#endif
+
+#ifdef LN
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L39:
+#ifdef LN
+ sll K, 2 + BASE_SHIFT, TMP1
+ addl B, TMP1, B
+#endif
+
+#if defined(LT) || defined(RN)
+ mov BO, B
+#endif
+
+#ifdef RN
+ addl KK, 4, KK
+#endif
+
+#ifdef RT
+ subl KK, 4, KK
+#endif
+ ldi J, -1(J)
+ bgt J, $L01
+ .align 4
+
+$L999:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+ clr $0
+ ldi $sp, STACKSIZE($sp)
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/zamax.S b/kernel/sw_64/zamax.S
new file mode 100644
index 0000000..c453e9d
--- /dev/null
+++ b/kernel/sw_64/zamax.S
@@ -0,0 +1,302 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define N $16
+#define X $17
+#define INCX $18
+
+#ifndef USE_MIN
+#define CMPLT(a, b) fcmplt a, b
+#else
+#define CMPLT(a, b) fcmplt b, a
+#endif
+
+#define STACKSIZE 8 * 8
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, STACKSIZE, $26, 0
+
+ ldi $sp, -STACKSIZE($sp)
+
+ fstd $f2, 0($sp)
+ fclr $f16
+ cmplt $31, N, $2
+
+ fstd $f3, 8($sp)
+ fclr $f17
+ cmplt $31, INCX, $3
+ unop
+
+ fstd $f4, 16($sp)
+ fclr $f18
+ SXADDQ INCX, $31, INCX
+ unop
+
+ fstd $f5, 24($sp)
+ fclr $f19
+ and $2, $3, $0
+ unop
+
+ fstd $f6, 32($sp)
+ unop
+
+ fstd $f7, 40($sp)
+ fstd $f8, 48($sp)
+ fstd $f9, 56($sp)
+
+ fclr $f0
+ beq $0, $End # if (n <= 0) or (incx <= 0) return
+ .align 4
+
+ LD $f20, 0 * SIZE(X)
+ LD $f21, 1 * SIZE(X)
+ sra N, 2, $1
+ addl INCX, INCX, INCX
+
+ fabs $f20, $f20
+ fabs $f21, $f21
+ faddd $f20, $f21, $f0
+ ble $1, $L15
+ .align 4
+
+ ldi $1, -1($1)
+ unop
+ addl X, INCX, X
+ unop
+
+ LD $f22, 0 * SIZE(X)
+ fmov $f0, $f1
+ LD $f23, 1 * SIZE(X)
+ addl X, INCX, X
+
+ LD $f24, 0 * SIZE(X)
+ fmov $f0, $f2
+ LD $f25, 1 * SIZE(X)
+ addl X, INCX, X
+
+ LD $f26, 0 * SIZE(X)
+ fmov $f0, $f3
+ LD $f27, 1 * SIZE(X)
+ addl X, INCX, X
+
+ fabs $f20, $f8
+ fabs $f21, $f9
+ fabs $f22, $f10
+ fabs $f23, $f11
+
+ fabs $f24, $f12
+ fabs $f25, $f13
+ fabs $f26, $f14
+ fabs $f27, $f15
+
+ ble $1, $L14
+ .align 4
+
+ LD $f20, 0 * SIZE(X)
+ LD $f21, 1 * SIZE(X)
+ ldi $1, -1($1)
+ addl X, INCX, X
+
+ LD $f22, 0 * SIZE(X)
+ LD $f23, 1 * SIZE(X)
+ unop
+ addl X, INCX, X
+
+ LD $f24, 0 * SIZE(X)
+ LD $f25, 1 * SIZE(X)
+ unop
+ addl X, INCX, X
+
+ LD $f26, 0 * SIZE(X)
+ LD $f27, 1 * SIZE(X)
+ addl X, INCX, X
+ ble $1, $L13
+ .align 4
+
+$L12:
+ faddd $f8, $f9, $f16
+ unop
+ fabs $f20, $f8
+ fillcs 64 * SIZE(X)
+
+ faddd $f10, $f11, $f17
+ unop
+ fabs $f21, $f9
+ LD $f20, 0 * SIZE(X)
+
+ faddd $f12, $f13, $f18
+ LD $f21, 1 * SIZE(X)
+ fabs $f22, $f10
+ addl X, INCX, X
+
+ faddd $f14, $f15, $f19
+ LD $f22, 0 * SIZE(X)
+ fabs $f23, $f11
+ unop
+
+ CMPLT($f0, $f16), $f4
+ LD $f23, 1 * SIZE(X)
+ fabs $f24, $f12
+ addl X, INCX, X
+
+ CMPLT($f1, $f17), $f5
+ LD $f24, 0 * SIZE(X)
+ fabs $f25, $f13
+ unop
+
+ CMPLT($f2, $f18), $f6
+ LD $f25, 1 * SIZE(X)
+ fabs $f26, $f14
+ addl X, INCX, X
+
+ CMPLT($f3, $f19), $f7
+ LD $f26, 0 * SIZE(X)
+ fabs $f27, $f15
+ unop
+
+ fselne $f4, $f16, $f0, $f0
+ LD $f27, 1 * SIZE(X)
+ addl X, INCX, X
+ ldi $1, -1($1) # i --
+
+ fselne $f5, $f17, $f1, $f1
+ fselne $f6, $f18, $f2, $f2
+ fselne $f7, $f19, $f3, $f3
+ bgt $1,$L12
+ .align 4
+
+$L13:
+ faddd $f8, $f9, $f16
+ fabs $f20, $f8
+
+ faddd $f10, $f11, $f17
+ fabs $f21, $f9
+
+ faddd $f12, $f13, $f18
+ fabs $f22, $f10
+
+ faddd $f14, $f15, $f19
+ fabs $f23, $f11
+
+ CMPLT($f0, $f16), $f4
+ fabs $f24, $f12
+
+ CMPLT($f1, $f17), $f5
+ fabs $f25, $f13
+
+ CMPLT($f2, $f18), $f6
+ fabs $f26, $f14
+ CMPLT($f3, $f19), $f7
+ fabs $f27, $f15
+
+ fselne $f4, $f16, $f0, $f0
+ fselne $f5, $f17, $f1, $f1
+ fselne $f6, $f18, $f2, $f2
+ fselne $f7, $f19, $f3, $f3
+ .align 4
+
+$L14:
+ faddd $f8, $f9, $f16
+ faddd $f10, $f11, $f17
+ faddd $f12, $f13, $f18
+ faddd $f14, $f15, $f19
+
+ CMPLT($f0, $f16), $f4
+ CMPLT($f1, $f17), $f5
+ CMPLT($f2, $f18), $f6
+ CMPLT($f3, $f19), $f7
+
+ fselne $f4, $f16, $f0, $f0
+ fselne $f5, $f17, $f1, $f1
+ fselne $f6, $f18, $f2, $f2
+ fselne $f7, $f19, $f3, $f3
+
+ CMPLT($f0, $f1), $f16
+ CMPLT($f2, $f3), $f17
+
+ fselne $f16, $f1, $f0, $f0
+ fselne $f17, $f3, $f2, $f2
+
+ CMPLT($f0, $f2), $f16
+ fselne $f16, $f2, $f0, $f0
+ .align 4
+
+$L15:
+ and N, 3, $1
+ unop
+ unop
+ ble $1, $End
+ .align 4
+
+$L16:
+ LD $f20, 0 * SIZE(X)
+ LD $f21, 1 * SIZE(X)
+ unop
+ addl X, INCX, X
+
+ fabs $f20, $f29
+ fabs $f21, $f30
+ faddd $f29, $f30, $f20
+ fmov $f20,$f29
+
+ CMPLT($f0, $f29), $f16
+ fselne $f16, $f29, $f0, $f0
+
+ ldi $1, -1($1) # i --
+ bgt $1, $L16
+ .align 4
+
+$End:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+ ldi $sp, STACKSIZE($sp)
+ ret
+
+ EPILOGUE
diff --git a/kernel/sw_64/zamax.S.bak b/kernel/sw_64/zamax.S.bak
new file mode 100644
index 0000000..74b9331
--- /dev/null
+++ b/kernel/sw_64/zamax.S.bak
@@ -0,0 +1,301 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define N $16
+#define X $17
+#define INCX $18
+
+#ifndef USE_MIN
+#define CMPLT(a, b) fcmplt a, b
+#else
+#define CMPLT(a, b) fcmplt b, a
+#endif
+
+#define STACKSIZE 8 * 8
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, STACKSIZE, $26, 0
+
+ ldi $sp, -STACKSIZE($sp)
+
+ fstd $f2, 0($sp)
+ fclr $f16
+ cmplt $31, N, $2
+
+ fstd $f3, 8($sp)
+ fclr $f17
+ cmplt $31, INCX, $3
+ unop
+
+ fstd $f4, 16($sp)
+ fclr $f18
+ SXADDQ INCX, $31, INCX
+ unop
+
+ fstd $f5, 24($sp)
+ fclr $f19
+ and $2, $3, $0
+ unop
+
+ fstd $f6, 32($sp)
+ unop
+
+ fstd $f7, 40($sp)
+ fstd $f8, 48($sp)
+ fstd $f9, 56($sp)
+
+ fclr $f0
+ beq $0, $End # if (n <= 0) or (incx <= 0) return
+ .align 4
+
+ LD $f20, 0 * SIZE(X)
+ LD $f21, 1 * SIZE(X)
+ sra N, 2, $1
+ addl INCX, INCX, INCX
+
+ fabs $f20, $f20
+ fabs $f21, $f21
+ faddd $f20, $f21, $f0
+ ble $1, $L15
+ .align 4
+
+ ldi $1, -1($1)
+ unop
+ addl X, INCX, X
+ unop
+
+ LD $f22, 0 * SIZE(X)
+ fmov $f0, $f1
+ LD $f23, 1 * SIZE(X)
+ addl X, INCX, X
+
+ LD $f24, 0 * SIZE(X)
+ fmov $f0, $f2
+ LD $f25, 1 * SIZE(X)
+ addl X, INCX, X
+
+ LD $f26, 0 * SIZE(X)
+ fmov $f0, $f3
+ LD $f27, 1 * SIZE(X)
+ addl X, INCX, X
+
+ fabs $f20, $f8
+ fabs $f21, $f9
+ fabs $f22, $f10
+ fabs $f23, $f11
+
+ fabs $f24, $f12
+ fabs $f25, $f13
+ fabs $f26, $f14
+ fabs $f27, $f15
+
+ ble $1, $L14
+ .align 4
+
+ LD $f20, 0 * SIZE(X)
+ LD $f21, 1 * SIZE(X)
+ ldi $1, -1($1)
+ addl X, INCX, X
+
+ LD $f22, 0 * SIZE(X)
+ LD $f23, 1 * SIZE(X)
+ unop
+ addl X, INCX, X
+
+ LD $f24, 0 * SIZE(X)
+ LD $f25, 1 * SIZE(X)
+ unop
+ addl X, INCX, X
+
+ LD $f26, 0 * SIZE(X)
+ LD $f27, 1 * SIZE(X)
+ addl X, INCX, X
+ ble $1, $L13
+ .align 4
+
+$L12:
+ faddd $f8, $f9, $f16
+ unop
+ fabs $f20, $f8
+ fillcs 64 * SIZE(X)
+
+ faddd $f10, $f11, $f17
+ unop
+ fabs $f21, $f9
+ LD $f20, 0 * SIZE(X)
+
+ faddd $f12, $f13, $f18
+ LD $f21, 1 * SIZE(X)
+ fabs $f22, $f10
+ addl X, INCX, X
+
+ faddd $f14, $f15, $f19
+ LD $f22, 0 * SIZE(X)
+ fabs $f23, $f11
+ unop
+
+ CMPLT($f0, $f16), $f4
+ LD $f23, 1 * SIZE(X)
+ fabs $f24, $f12
+ addl X, INCX, X
+
+ CMPLT($f1, $f17), $f5
+ LD $f24, 0 * SIZE(X)
+ fabs $f25, $f13
+ unop
+
+ CMPLT($f2, $f18), $f6
+ LD $f25, 1 * SIZE(X)
+ fabs $f26, $f14
+ addl X, INCX, X
+
+ CMPLT($f3, $f19), $f7
+ LD $f26, 0 * SIZE(X)
+ fabs $f27, $f15
+ unop
+
+	fselne	$f4, $f16, $f0, $f0
+ LD $f27, 1 * SIZE(X)
+ addl X, INCX, X
+ ldi $1, -1($1) # i --
+
+	fselne	$f5, $f17, $f1, $f1
+	fselne	$f6, $f18, $f2, $f2
+	fselne	$f7, $f19, $f3, $f3
+ bgt $1,$L12
+ .align 4
+
+$L13:
+ faddd $f8, $f9, $f16
+ fabs $f20, $f8
+
+ faddd $f10, $f11, $f17
+ fabs $f21, $f9
+
+ faddd $f12, $f13, $f18
+ fabs $f22, $f10
+
+ faddd $f14, $f15, $f19
+ fabs $f23, $f11
+
+ CMPLT($f0, $f16), $f4
+ fabs $f24, $f12
+
+ CMPLT($f1, $f17), $f5
+ fabs $f25, $f13
+
+ CMPLT($f2, $f18), $f6
+ fabs $f26, $f14
+ CMPLT($f3, $f19), $f7
+ fabs $f27, $f15
+
+	fselne	$f4, $f16, $f0, $f0
+	fselne	$f5, $f17, $f1, $f1
+	fselne	$f6, $f18, $f2, $f2
+	fselne	$f7, $f19, $f3, $f3
+ .align 4
+
+$L14:
+ faddd $f8, $f9, $f16
+ faddd $f10, $f11, $f17
+ faddd $f12, $f13, $f18
+ faddd $f14, $f15, $f19
+
+ CMPLT($f0, $f16), $f4
+ CMPLT($f1, $f17), $f5
+ CMPLT($f2, $f18), $f6
+ CMPLT($f3, $f19), $f7
+
+	fselne	$f4, $f16, $f0, $f0
+	fselne	$f5, $f17, $f1, $f1
+	fselne	$f6, $f18, $f2, $f2
+	fselne	$f7, $f19, $f3, $f3
+
+ CMPLT($f0, $f1), $f16
+ CMPLT($f2, $f3), $f17
+
+	fselne	$f16, $f1, $f0, $f0
+	fselne	$f17, $f3, $f2, $f2
+
+ CMPLT($f0, $f2), $f16
+	fselne	$f16, $f2, $f0, $f0
+ .align 4
+
+$L15:
+ and N, 3, $1
+ unop
+ unop
+ ble $1, $End
+ .align 4
+
+$L16:
+ LD $f20, 0 * SIZE(X)
+ LD $f21, 1 * SIZE(X)
+ unop
+ addl X, INCX, X
+
+ fabs $f20, $f29
+ fabs $f21, $f30
+ faddd $f29, $f30, $f29
+
+ CMPLT($f0, $f29), $f16
+	fselne	$f16, $f29, $f0, $f0
+
+ ldi $1, -1($1) # i --
+ bgt $1, $L16
+ .align 4
+
+$End:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+ ldi $sp, STACKSIZE($sp)
+ ret
+
+ EPILOGUE
diff --git a/kernel/sw_64/zasum.S b/kernel/sw_64/zasum.S
new file mode 100644
index 0000000..72e120c
--- /dev/null
+++ b/kernel/sw_64/zasum.S
@@ -0,0 +1,231 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define PREFETCHSIZE 88
+
+#define N $16
+#define X $17
+#define INCX $18
+#define I $19
+
+#define s0 $f0
+#define s1 $f1
+#define s2 $f10
+#define s3 $f11
+
+#define a0 $f12
+#define a1 $f13
+#define a2 $f14
+#define a3 $f15
+#define a4 $f16
+#define a5 $f17
+#define a6 $f18
+#define a7 $f19
+
+#define t0 $f20
+#define t1 $f21
+#define t2 $f22
+#define t3 $f23
+#define t4 $f24
+#define s4 $f27
+ PROLOGUE
+ PROFCODE
+
+ fclr s0
+ unop
+ fclr t0
+ addl INCX, INCX, INCX
+
+ fclr s1
+ unop
+ fclr t1
+ ble N, $L999
+
+ fclr s2
+ sra N, 2, I
+ fclr s3
+ ble I, $L15
+
+ LD a0, 0 * SIZE(X)
+ fclr t2
+ LD a1, 1 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ LD a2, 0 * SIZE(X)
+ fclr t3
+ LD a3, 1 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ LD a4, 0 * SIZE(X)
+ LD a5, 1 * SIZE(X)
+ SXADDQ INCX, X, X
+ ldi I, -1(I)
+
+ ble I, $L13
+ .align 4
+
+$L12:
+	ADD	s0, t0, s4	# accumulate into s4, then copy back to s0
+ fmov s4,s0
+ fillcs PREFETCHSIZE * SIZE(X)
+ fabs a0, t0
+ ldi I, -1(I)
+
+ ADD s1, t1, s4
+ fmov s4,s1
+ LD a6, 0 * SIZE(X)
+ fabs a1, t1
+ unop
+
+ ADD s2, t2, s4
+ fmov s4,s2
+ LD a7, 1 * SIZE(X)
+ fabs a2, t2
+ SXADDQ INCX, X, X
+
+ ADD s3, t3, s4
+ fmov s4,s3
+ LD a0, 0 * SIZE(X)
+ fabs a3, t3
+ unop
+
+ ADD s0, t0, s4
+ fmov s4,s0
+ LD a1, 1 * SIZE(X)
+ fabs a4, t0
+ SXADDQ INCX, X, X
+
+ ADD s1, t1, s4
+ fmov s4,s1
+ LD a2, 0 * SIZE(X)
+ fabs a5, t1
+ unop
+
+	ADD	s2, t2, s4
+ fmov s4,s2
+ LD a3, 1 * SIZE(X)
+ fabs a6, t2
+ SXADDQ INCX, X, X
+
+ ADD s3, t3, s4
+ fmov s4,s3
+ LD a4, 0 * SIZE(X)
+ fabs a7, t3
+ unop
+
+ LD a5, 1 * SIZE(X)
+ unop
+ SXADDQ INCX, X, X
+ bne I, $L12
+ .align 4
+
+$L13:
+ ADD s0, t0, s4
+ fmov s4,s0
+ LD a6, 0 * SIZE(X)
+ fabs a0, t0
+
+ ADD s1, t1, s4
+ fmov s4,s1
+ LD a7, 1 * SIZE(X)
+ fabs a1, t1
+ SXADDQ INCX, X, X
+
+ ADD s2, t2, s4
+ fmov s4,s2
+ fabs a2, t2
+ ADD s3, t3, s4
+ fmov s4,s3
+ fabs a3, t3
+
+ ADD s0, t0, s4
+ fmov s4,s0
+ fabs a4, t0
+ ADD s1, t1, s4
+ fmov s4,s1
+ fabs a5, t1
+ ADD s2, t2, s4
+ fmov s4,s2
+ fabs a6, t2
+ ADD s3, t3, s4
+ fmov s4,s3
+ fabs a7, t3
+
+ ADD s2, t2, s4
+ fmov s4,s2
+ ADD s3, t3, s4
+ fmov s4,s3
+
+ .align 4
+
+$L15:
+ ADD s0, s2, $f25
+ fmov $f25, s0
+ and N, 3, I
+ ADD s1, s3, $f25
+ fmov $f25, s1
+ ble I, $L999
+ .align 4
+
+$L17:
+ ADD s0, t0, $f25
+ fmov $f25, s0
+ LD a0, 0 * SIZE(X)
+ fabs a0, t0
+ ldi I, -1(I)
+
+ ADD s1, t1, $f25
+ fmov $f25, s1
+ LD a1, 1 * SIZE(X)
+ fabs a1, t1
+ SXADDQ INCX, X, X
+
+ bne I, $L17
+ .align 4
+
+$L999:
+ ADD s0, t0, $f25
+ ADD s1, t1, $f26
+
+ ADD $f25, $f26, s0
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/zasum.S.bak b/kernel/sw_64/zasum.S.bak
new file mode 100644
index 0000000..db79771
--- /dev/null
+++ b/kernel/sw_64/zasum.S.bak
@@ -0,0 +1,208 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define PREFETCHSIZE 88
+
+#define N $16
+#define X $17
+#define INCX $18
+#define I $19
+
+#define s0 $f0
+#define s1 $f1
+#define s2 $f10
+#define s3 $f11
+
+#define a0 $f12
+#define a1 $f13
+#define a2 $f14
+#define a3 $f15
+#define a4 $f16
+#define a5 $f17
+#define a6 $f18
+#define a7 $f19
+
+#define t0 $f20
+#define t1 $f21
+#define t2 $f22
+#define t3 $f23
+
+ PROLOGUE
+ PROFCODE
+
+ fclr s0
+ unop
+ fclr t0
+ addl INCX, INCX, INCX
+
+ fclr s1
+ unop
+ fclr t1
+ ble N, $L999
+
+ fclr s2
+ sra N, 2, I
+ fclr s3
+ ble I, $L15
+
+ LD a0, 0 * SIZE(X)
+ fclr t2
+ LD a1, 1 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ LD a2, 0 * SIZE(X)
+ fclr t3
+ LD a3, 1 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ LD a4, 0 * SIZE(X)
+ LD a5, 1 * SIZE(X)
+ SXADDQ INCX, X, X
+ ldi I, -1(I)
+
+ ble I, $L13
+ .align 4
+
+$L12:
+ ADD s0, t0, s0
+ fillcs PREFETCHSIZE * SIZE(X)
+ fabs a0, t0
+ ldi I, -1(I)
+
+ ADD s1, t1, s1
+ LD a6, 0 * SIZE(X)
+ fabs a1, t1
+ unop
+
+ ADD s2, t2, s2
+ LD a7, 1 * SIZE(X)
+ fabs a2, t2
+ SXADDQ INCX, X, X
+
+ ADD s3, t3, s3
+ LD a0, 0 * SIZE(X)
+ fabs a3, t3
+ unop
+
+ ADD s0, t0, s0
+ LD a1, 1 * SIZE(X)
+ fabs a4, t0
+ SXADDQ INCX, X, X
+
+ ADD s1, t1, s1
+ LD a2, 0 * SIZE(X)
+ fabs a5, t1
+ unop
+
+ ADD s2, t2, s2
+ LD a3, 1 * SIZE(X)
+ fabs a6, t2
+ SXADDQ INCX, X, X
+
+ ADD s3, t3, s3
+ LD a4, 0 * SIZE(X)
+ fabs a7, t3
+ unop
+
+ LD a5, 1 * SIZE(X)
+ unop
+ SXADDQ INCX, X, X
+ bne I, $L12
+ .align 4
+
+$L13:
+ ADD s0, t0, s0
+ LD a6, 0 * SIZE(X)
+ fabs a0, t0
+
+ ADD s1, t1, s1
+ LD a7, 1 * SIZE(X)
+ fabs a1, t1
+ SXADDQ INCX, X, X
+
+ ADD s2, t2, s2
+ fabs a2, t2
+ ADD s3, t3, s3
+ fabs a3, t3
+
+ ADD s0, t0, s0
+ fabs a4, t0
+ ADD s1, t1, s1
+ fabs a5, t1
+ ADD s2, t2, s2
+ fabs a6, t2
+ ADD s3, t3, s3
+ fabs a7, t3
+
+ ADD s2, t2, s2
+ ADD s3, t3, s3
+
+ .align 4
+
+$L15:
+ ADD s0, s2, s0
+ and N, 3, I
+ ADD s1, s3, s1
+ ble I, $L999
+ .align 4
+
+$L17:
+ ADD s0, t0, s0
+ LD a0, 0 * SIZE(X)
+ fabs a0, t0
+ ldi I, -1(I)
+
+ ADD s1, t1, s1
+ LD a1, 1 * SIZE(X)
+ fabs a1, t1
+ SXADDQ INCX, X, X
+
+ bne I, $L17
+ .align 4
+
+$L999:
+ ADD s0, t0, s0
+ ADD s1, t1, s1
+
+ ADD s0, s1, s0
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/zasum_simd.S b/kernel/sw_64/zasum_simd.S
new file mode 100644
index 0000000..5606fdf
--- /dev/null
+++ b/kernel/sw_64/zasum_simd.S
@@ -0,0 +1,385 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define PREFETCHSIZE 96
+
+#define N $16
+#define X $17
+#define INCX $18
+#define I $19
+
+#define s0 $f0
+#define s1 $f1
+#define s2 $f10
+#define s3 $f11
+
+#define a0 $f12
+#define a1 $f13
+#define a2 $f14
+#define a3 $f15
+#define a4 $f16
+#define a5 $f17
+#define a6 $f18
+#define a7 $f19
+
+#define t0 $f20
+#define t1 $f21
+#define t2 $f22
+#define t3 $f23
+
+#define t4 $f24
+#define t5 $f25
+#define t6 $f26
+#define t7 $f27
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, 16, $26, 0
+
+ fclr s0
+ unop
+ fclr t0
+ addl INCX, INCX, INCX
+
+ fclr s1
+ unop
+ fclr t1
+ ble N, $L999
+
+ cmpeq INCX, 2, $3
+ beq $3, $Sub
+ .align 4
+
+ and X, (VEC_LEN*SIZE-1), $6
+ bgt $6, $UnAlign_X_ACCESS
+ .align 4
+$Align_Access:
+
+/*
+  Unroll by 8 complex elements (8*2 = 16 reals) per iteration
+*/
+ sra N, 3, I
+ fclr s2
+ fclr s3
+ ble I, $Remain
+
+ VLD a0, 0*VEC_LEN*SIZE(X)
+ vcpys $f31, $f31, t0
+ VLD a1, 1*VEC_LEN*SIZE(X)
+ vcpys $f31, $f31, t1
+
+ VLD a2, 2*VEC_LEN*SIZE(X)
+ vcpys $f31, $f31, t2
+ VLD a3, 3*VEC_LEN*SIZE(X)
+ vcpys $f31, $f31, t3
+
+ subl I, 1, I
+ addl X, 16*SIZE, X
+ unop
+ ble I, $MainLoopEnd
+
+$MainLoop:
+ vcpys $f31, a0, a4
+ VLD a0, 0*VEC_LEN*SIZE(X)
+ vcpys $f31, a1, a5
+ VLD a1, 1*VEC_LEN*SIZE(X)
+
+ vcpys $f31, a2, a6
+ VLD a2, 2*VEC_LEN*SIZE(X)
+ vcpys $f31, a3, a7
+ VLD a3, 3*VEC_LEN*SIZE(X)
+
+ VADD t0, a4, t0
+ subl I, 1, I
+ VADD t1, a5, t1
+ fillcs PREFETCHSIZE * SIZE(X)
+
+ VADD t2, a6, t2
+ addl X, 16*SIZE, X
+ VADD t3, a7, t3
+ bgt I, $MainLoop
+
+$MainLoopEnd:
+ /*fabs*/
+	/* fabs: vcpys with $f31 (+0.0) clears the sign bits */
+ vcpys $f31, a0, a4
+ vcpys $f31, a1, a5
+ vcpys $f31, a2, a6
+ vcpys $f31, a3, a7
+
+ VADD t0, a4, t0
+ VADD t1, a5, t1
+ VADD t2, a6, t2
+ VADD t3, a7, t3
+
+ VADD t0, t1, t0
+ VADD t2, t3, t2
+ VADD t0, t2, t0
+ nop
+
+ vextf t0, 0, s0
+ vextf t0, 1, s1
+ vextf t0, 2, s2
+ vextf t0, 3, s3
+
+$Remain:
+ and N, 7, I
+ ADD s0, s2, s0
+ ADD s1, s3, s1
+ ble I, $End
+ .align 4
+
+$RemainLoop:
+
+ LD a0, 0 * SIZE(X)
+ LD a1, 1 * SIZE(X)
+ fabs a0, t0
+ addl X, 2*SIZE, X
+
+ fabs a1, t1
+ ldi I, -1(I)
+ ADD s0, t0, s0
+ ADD s1, t1, s1
+
+ bne I, $RemainLoop
+ .align 4
+$End:
+ ADD s0, s1, s0
+ ret
+ .align 4
+
+$UnAlign_X_ACCESS:
+ sra N, 3, I
+ fclr s2
+ fclr s3
+ ble I, $Remain
+
+ VLD_UL a0, 0*VEC_LEN*SIZE(X)
+ vcpys $f31, $f31, t0
+ VLD_UH t4, 1*VEC_LEN*SIZE(X)
+ vcpys $f31, $f31, t1
+
+ VLD_UL a1, 1*VEC_LEN*SIZE(X)
+ vcpys $f31, $f31, t2
+ VLD_UH t5, 2*VEC_LEN*SIZE(X)
+ vcpys $f31, $f31, t3
+
+ VLD_UL a2, 2*VEC_LEN*SIZE(X)
+ VLD_UH t6, 3*VEC_LEN*SIZE(X)
+ VLD_UL a3, 3*VEC_LEN*SIZE(X)
+ VLD_UH t7, 4*VEC_LEN*SIZE(X)
+
+ vbisw a0, t4, a0
+ subl I, 1, I
+ vbisw a1, t5, a1
+ addl X, 16*SIZE, X
+
+ vbisw a2, t6, a2
+ unop
+ vbisw a3, t7, a3
+ ble I, $MainLoopEnd
+
+$UnAlign_X_ACCESS_MainLoop:
+/* fabs: vcpys with $f31 (+0.0) clears the sign bits */
+ vcpys $f31, a0, a4
+ VLD_UL a0, 0*VEC_LEN*SIZE(X)
+ vcpys $f31, a1, a5
+ VLD_UH t4, 1*VEC_LEN*SIZE(X)
+
+ vcpys $f31, a2, a6
+ VLD_UL a1, 1*VEC_LEN*SIZE(X)
+ vcpys $f31, a3, a7
+ VLD_UH t5, 2*VEC_LEN*SIZE(X)
+
+ VADD t0, a4, t0
+ VLD_UL a2, 2*VEC_LEN*SIZE(X)
+ VADD t1, a5, t1
+ VLD_UH t6, 3*VEC_LEN*SIZE(X)
+
+ VADD t2, a6, t2
+ VLD_UL a3, 3*VEC_LEN*SIZE(X)
+ VADD t3, a7, t3
+ VLD_UH t7, 4*VEC_LEN*SIZE(X)
+
+
+ vbisw a0, t4, a0
+ subl I, 1, I
+ vbisw a1, t5, a1
+ fillcs PREFETCHSIZE * SIZE(X)
+
+ vbisw a2, t6, a2
+ addl X, 16*SIZE, X
+ vbisw a3, t7, a3
+ bgt I, $UnAlign_X_ACCESS_MainLoop
+
+ jmp $MainLoopEnd
+ .align 4
+
+
+$Sub:
+ fclr s2
+ sra N, 2, I
+ fclr s3
+ ble I, $L15
+
+ LD a0, 0 * SIZE(X)
+ fclr t2
+ LD a1, 1 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ LD a2, 0 * SIZE(X)
+ fclr t3
+ LD a3, 1 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ LD a4, 0 * SIZE(X)
+ LD a5, 1 * SIZE(X)
+ SXADDQ INCX, X, X
+ ldi I, -1(I)
+
+ ble I, $L13
+ .align 4
+
+$L12:
+ ADD s0, t0, s0
+ fillcs PREFETCHSIZE * SIZE(X)
+ fabs a0, t0
+ ldi I, -1(I)
+
+ ADD s1, t1, s1
+ LD a6, 0 * SIZE(X)
+ fabs a1, t1
+ unop
+
+ ADD s2, t2, s2
+ LD a7, 1 * SIZE(X)
+ fabs a2, t2
+ SXADDQ INCX, X, X
+
+ ADD s3, t3, s3
+ LD a0, 0 * SIZE(X)
+ fabs a3, t3
+ unop
+
+ ADD s0, t0, s0
+ LD a1, 1 * SIZE(X)
+ fabs a4, t0
+ SXADDQ INCX, X, X
+
+ ADD s1, t1, s1
+ LD a2, 0 * SIZE(X)
+ fabs a5, t1
+ unop
+
+ ADD s2, t2, s2
+ LD a3, 1 * SIZE(X)
+ fabs a6, t2
+ SXADDQ INCX, X, X
+
+ ADD s3, t3, s3
+ LD a4, 0 * SIZE(X)
+ fabs a7, t3
+ unop
+
+ LD a5, 1 * SIZE(X)
+ unop
+ SXADDQ INCX, X, X
+ bne I, $L12
+ .align 4
+
+$L13:
+ ADD s0, t0, s0
+ LD a6, 0 * SIZE(X)
+ fabs a0, t0
+
+ ADD s1, t1, s1
+ LD a7, 1 * SIZE(X)
+ fabs a1, t1
+ SXADDQ INCX, X, X
+
+ ADD s2, t2, s2
+ fabs a2, t2
+ ADD s3, t3, s3
+ fabs a3, t3
+
+ ADD s0, t0, s0
+ fabs a4, t0
+ ADD s1, t1, s1
+ fabs a5, t1
+ ADD s2, t2, s2
+ fabs a6, t2
+ ADD s3, t3, s3
+ fabs a7, t3
+
+ ADD s2, t2, s2
+ ADD s3, t3, s3
+
+ .align 4
+
+$L15:
+ ADD s0, s2, s0
+ and N, 3, I
+ ADD s1, s3, s1
+ ble I, $L999
+ .align 4
+
+$L17:
+ ADD s0, t0, s0
+ LD a0, 0 * SIZE(X)
+ fabs a0, t0
+ ldi I, -1(I)
+
+ ADD s1, t1, s1
+ LD a1, 1 * SIZE(X)
+ fabs a1, t1
+ SXADDQ INCX, X, X
+
+ bne I, $L17
+ .align 4
+
+$L999:
+ ADD s0, t0, s0
+ ADD s1, t1, s1
+
+ ADD s0, s1, s0
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/zaxpy.S b/kernel/sw_64/zaxpy.S
new file mode 100644
index 0000000..19b6398
--- /dev/null
+++ b/kernel/sw_64/zaxpy.S
@@ -0,0 +1,654 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define PREFETCHSIZE 40
+
+#ifndef CONJ
+#define ADD1 SUB
+#define ADD2 ADD
+#else
+#define ADD1 ADD
+#define ADD2 SUB
+#endif
+
+#define tmp $f9
+ PROLOGUE
+ PROFCODE
+ .frame $sp, 16, $26, 0
+
+ ldw $19, 0($sp)
+ fmov $f19, $f29
+ ldl $20, 8($sp)
+ fmov $f20, $f30
+
+ mov $21, $18
+ ldw $21, 16($sp)
+ ldi $sp, -64($sp)
+ nop
+
+ fstd $f2, 0($sp)
+ cmpeq $19, 1, $1
+ fstd $f3, 8($sp)
+ cmpeq $21, 1, $2
+
+ fstd $f4, 16($sp)
+ and $16, 3, $5
+ fstd $f5, 24($sp)
+ fstd $f6, 32($sp)
+
+ fstd $f7, 40($sp)
+ fstd $f8, 48($sp)
+ fstd tmp, 56($sp)
+#ifndef PROFILE
+ .prologue 0
+#else
+ .prologue 1
+#endif
+
+ and $1, $2, $1
+ ble $16, $End
+ sra $16, 2, $4
+ beq $1, $Sub
+
+ ble $4, $Remain
+ subl $4, 1, $4
+
+ LD $f0, 0*SIZE($18)
+ LD $f1, 1*SIZE($18)
+ LD $f2, 2*SIZE($18)
+ LD $f3, 3*SIZE($18)
+ LD $f4, 4*SIZE($18)
+ LD $f5, 5*SIZE($18)
+ LD $f6, 6*SIZE($18)
+ LD $f7, 7*SIZE($18)
+
+ LD $f8, 0*SIZE($20)
+ LD $f28, 1*SIZE($20)
+ LD $f10, 2*SIZE($20)
+ LD $f11, 3*SIZE($20)
+ LD $f12, 4*SIZE($20)
+ LD $f13, 5*SIZE($20)
+ LD $f14, 6*SIZE($20)
+ LD $f15, 7*SIZE($20)
+
+ addl $18, 8*SIZE, $18
+ ble $4, $MainLoopEnd
+ .align 4
+
+$MainLoop:
+ fillcs PREFETCHSIZE * SIZE($20)
+ fillcs PREFETCHSIZE * SIZE($18)
+
+ MUL $f29, $f0, $f20
+ fillcs 9*SIZE($18)
+ MUL $f30, $f1, $f21
+ unop
+
+ MUL $f30, $f0, $f22
+ LD $f0, 0*SIZE($18)
+ MUL $f29, $f1, $f23
+ LD $f1, 1*SIZE($18)
+
+ MUL $f29, $f2, $f24
+ unop
+ MUL $f30, $f3, $f25
+ nop
+
+ MUL $f30, $f2, $f26
+ LD $f2, 2*SIZE($18)
+ MUL $f29, $f3, $f27
+ LD $f3, 3*SIZE($18)
+
+ ADD1 $f20, $f21, $f16
+ MUL $f29, $f4, $f20
+ ADD2 $f22, $f23, $f17
+ MUL $f30, $f5, $f21
+
+ ADD1 $f24, $f25, $f18
+ unop
+ MUL $f30, $f4, $f22
+ LD $f4, 4*SIZE($18)
+
+ ADD2 $f26, $f27, $f19
+ addl $20, 8*SIZE, $20
+ MUL $f29, $f5, $f23
+ LD $f5, 5*SIZE($18)
+
+ ADD $f16, $f8, tmp
+ fmov tmp, $f16
+ LD $f8, 0*SIZE($20)
+ MUL $f29, $f6, $f24
+ unop
+
+ ADD $f17, $f28, tmp
+ fmov tmp, $f17
+ LD $f28, 1*SIZE($20)
+ MUL $f30, $f7, $f25
+ unop
+
+ ADD $f18, $f10, tmp
+ fmov tmp, $f18
+ LD $f10, 2*SIZE($20)
+ MUL $f30, $f6, $f26
+ LD $f6, 6*SIZE($18)
+
+ ADD $f19, $f11, tmp
+ fmov tmp, $f19
+ LD $f11, 3*SIZE($20)
+ MUL $f29, $f7, $f27
+ LD $f7, 7*SIZE($18)
+
+ ST $f16,-8*SIZE($20)
+ ADD1 $f20, $f21, $f16
+ ST $f17,-7*SIZE($20)
+ ADD2 $f22, $f23, $f17
+
+ ST $f18,-6*SIZE($20)
+ ADD1 $f24, $f25, $f18
+ ST $f19,-5*SIZE($20)
+ ADD2 $f26, $f27, $f19
+
+ ADD $f16, $f12, tmp
+ fmov tmp, $f16
+ LD $f12, 4*SIZE($20)
+ ADD $f17, $f13, tmp
+ fmov tmp, $f17
+ LD $f13, 5*SIZE($20)
+ ADD $f18, $f14, tmp
+ fmov tmp, $f18
+ LD $f14, 6*SIZE($20)
+ ADD $f19, $f15, tmp
+ fmov tmp, $f19
+ LD $f15, 7*SIZE($20)
+
+ ST $f16,-4*SIZE($20)
+ addl $18, 8*SIZE, $18
+ ST $f17,-3*SIZE($20)
+ subl $4, 1, $4
+
+ ST $f18,-2*SIZE($20)
+ nop
+ ST $f19,-1*SIZE($20)
+ bgt $4, $MainLoop
+ .align 4
+
+$MainLoopEnd:
+ MUL $f29, $f0, $f20
+ MUL $f30, $f1, $f21
+ MUL $f30, $f0, $f22
+ MUL $f29, $f1, $f23
+
+ MUL $f29, $f2, $f24
+ MUL $f30, $f3, $f25
+ MUL $f30, $f2, $f26
+ MUL $f29, $f3, $f27
+
+ ADD1 $f20, $f21, $f16
+ MUL $f29, $f4, $f20
+ ADD2 $f22, $f23, $f17
+ MUL $f30, $f5, $f21
+
+ ADD1 $f24, $f25, $f18
+ MUL $f30, $f4, $f22
+ ADD2 $f26, $f27, $f19
+ MUL $f29, $f5, $f23
+
+ ADD $f16, $f8, tmp
+ fmov tmp, $f16
+ MUL $f29, $f6, $f24
+ ADD $f17, $f28, tmp
+ fmov tmp, $f17
+ MUL $f30, $f7, $f25
+
+ ADD $f18, $f10, tmp
+ fmov tmp, $f18
+ MUL $f30, $f6, $f26
+ ADD $f19, $f11, tmp
+ fmov tmp, $f19
+ MUL $f29, $f7, $f27
+
+ ST $f16, 0*SIZE($20)
+ ADD1 $f20, $f21, $f16
+ ST $f17, 1*SIZE($20)
+ ADD2 $f22, $f23, $f17
+
+ ST $f18, 2*SIZE($20)
+ ADD1 $f24, $f25, $f18
+ ST $f19, 3*SIZE($20)
+ ADD2 $f26, $f27, $f19
+
+ ADD $f16, $f12, tmp
+ fmov tmp, $f16
+ ADD $f17, $f13, tmp
+ fmov tmp, $f17
+ ADD $f18, $f14, tmp
+ fmov tmp, $f18
+ ADD $f19, $f15, tmp
+ fmov tmp, $f19
+
+ ST $f16, 4*SIZE($20)
+ ST $f17, 5*SIZE($20)
+ ST $f18, 6*SIZE($20)
+ ST $f19, 7*SIZE($20)
+
+ unop
+ addl $20, 8*SIZE, $20
+ unop
+ ble $5, $End
+ .align 4
+
+$Remain:
+ subl $5, 1, $6
+ ble $5, $End
+ LD $f0, 0*SIZE($18)
+ LD $f1, 1*SIZE($18)
+
+ LD $f8, 0*SIZE($20)
+ LD $f28, 1*SIZE($20)
+ addl $18, 2*SIZE, $18
+ ble $6, $RemainLoopEnd
+ .align 4
+
+$RemainLoop:
+ MUL $f29, $f0, $f20
+ subl $6, 1, $6
+ MUL $f30, $f1, $f21
+ addl $20, 2*SIZE, $20
+
+ MUL $f30, $f0, $f22
+ LD $f0, 0*SIZE($18)
+ MUL $f29, $f1, $f23
+ LD $f1, 1*SIZE($18)
+
+ ADD1 $f20, $f21, $f16
+ ADD2 $f22, $f23, $f17
+ ADD $f16, $f8, tmp
+ fmov tmp, $f16
+ LD $f8, 0*SIZE($20)
+ ADD $f17, $f28, tmp
+ fmov tmp, $f17
+ LD $f28, 1*SIZE($20)
+
+ ST $f16,-2*SIZE($20)
+ addl $18, 2*SIZE, $18
+ ST $f17,-1*SIZE($20)
+ bgt $6, $RemainLoop
+ .align 4
+
+$RemainLoopEnd:
+ MUL $f29, $f0, $f20
+ MUL $f30, $f1, $f21
+ MUL $f30, $f0, $f22
+ MUL $f29, $f1, $f23
+
+ ADD1 $f20, $f21, $f16
+ ADD2 $f22, $f23, $f17
+ ADD $f16, $f8, tmp
+ fmov tmp, $f16
+ ADD $f17, $f28, tmp
+ fmov tmp, $f17
+
+ ST $f16, 0*SIZE($20)
+ nop
+ ST $f17, 1*SIZE($20)
+ nop
+ .align 4
+
+$End:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd tmp, 56($sp)
+ ldi $sp, 64($sp)
+ ret
+ .align 4
+
+$Sub:
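+/* Strided path, taken when INCX or INCY is not 1: the increments are doubled
+   for the complex stride and applied with SXADDQ after each complex element. */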
+ SXSUBL $16, SIZE, $22
+ addl $22, $22, $22 # Complex
+ .align 4
+
+ addl $19, $19, $19 # Complex
+ addl $21, $21, $21 # Complex
+
+ ble $4, $SubRemain
+ LD $f0, 0*SIZE($18)
+ LD $f1, 1*SIZE($18)
+ SXADDQ $19, $18, $18
+
+ LD $f2, 0*SIZE($18)
+ LD $f3, 1*SIZE($18)
+ SXADDQ $19, $18, $18
+
+ LD $f4, 0*SIZE($18)
+ LD $f5, 1*SIZE($18)
+ SXADDQ $19, $18, $18
+
+ LD $f6, 0*SIZE($18)
+ LD $f7, 1*SIZE($18)
+ SXADDQ $19, $18, $18
+
+ LD $f8, 0*SIZE($20)
+ LD $f28, 1*SIZE($20)
+ SXADDQ $21, $20, $24
+
+ LD $f10, 0*SIZE($24)
+ LD $f11, 1*SIZE($24)
+ SXADDQ $21, $24, $24
+
+ LD $f12, 0*SIZE($24)
+ LD $f13, 1*SIZE($24)
+ SXADDQ $21, $24, $24
+
+ LD $f14, 0*SIZE($24)
+ LD $f15, 1*SIZE($24)
+ SXADDQ $21, $24, $24
+
+ subl $4, 1, $4
+ ble $4, $SubMainLoopEnd
+ .align 4
+
+$SubMainLoop:
+ MUL $f29, $f0, $f20
+ unop
+ MUL $f30, $f1, $f21
+ unop
+
+ MUL $f30, $f0, $f22
+ LD $f0, 0*SIZE($18)
+ MUL $f29, $f1, $f23
+ LD $f1, 1*SIZE($18)
+
+ MUL $f29, $f2, $f24
+ SXADDQ $19, $18, $18
+ MUL $f30, $f3, $f25
+ unop
+
+ MUL $f30, $f2, $f26
+ LD $f2, 0*SIZE($18)
+ MUL $f29, $f3, $f27
+ LD $f3, 1*SIZE($18)
+
+ ADD1 $f20, $f21, $f16
+ SXADDQ $19, $18, $18
+ MUL $f29, $f4, $f20
+ unop
+
+ ADD2 $f22, $f23, $f17
+ unop
+ MUL $f30, $f5, $f21
+ unop
+
+ ADD1 $f24, $f25, $f18
+ unop
+ MUL $f30, $f4, $f22
+ LD $f4, 0*SIZE($18)
+
+ ADD2 $f26, $f27, $f19
+ unop
+ MUL $f29, $f5, $f23
+ LD $f5, 1*SIZE($18)
+
+ ADD $f16, $f8, tmp
+ fmov tmp, $f16
+ LD $f8, 0*SIZE($24)
+ MUL $f29, $f6, $f24
+ SXADDQ $19, $18, $18
+
+ ADD $f17, $f28, tmp
+ fmov tmp, $f17
+ LD $f28, 1*SIZE($24)
+ MUL $f30, $f7, $f25
+ SXADDQ $21, $24, $24
+
+ ADD $f18, $f10, tmp
+ fmov tmp, $f18
+ LD $f10, 0*SIZE($24)
+ MUL $f30, $f6, $f26
+ LD $f6, 0*SIZE($18)
+
+ ADD $f19, $f11, tmp
+ fmov tmp, $f19
+ LD $f11, 1*SIZE($24)
+ MUL $f29, $f7, $f27
+ LD $f7, 1*SIZE($18)
+
+ ST $f16, 0*SIZE($20)
+ SXADDQ $19, $18, $18
+ ADD1 $f20, $f21, $f16
+ unop
+
+ ST $f17, 1*SIZE($20)
+ SXADDQ $21, $20, $20
+ ADD2 $f22, $f23, $f17
+ unop
+
+ ST $f18, 0*SIZE($20)
+ SXADDQ $21, $24, $24
+ ADD1 $f24, $f25, $f18
+ unop
+
+ ST $f19, 1*SIZE($20)
+ unop
+ ADD2 $f26, $f27, $f19
+ SXADDQ $21, $20, $20
+
+ ADD $f16, $f12, tmp
+ fmov tmp, $f16
+ unop
+ LD $f12, 0*SIZE($24)
+ unop
+
+ ADD $f17, $f13, tmp
+ fmov tmp, $f17
+ unop
+ LD $f13, 1*SIZE($24)
+ SXADDQ $21, $24, $24
+
+ ADD $f18, $f14, tmp
+ fmov tmp, $f18
+ subl $4, 1, $4
+ LD $f14, 0*SIZE($24)
+ unop
+
+ ADD $f19, $f15, tmp
+ fmov tmp, $f19
+ unop
+ LD $f15, 1*SIZE($24)
+ SXADDQ $21, $24, $24
+
+ ST $f16, 0*SIZE($20)
+ ST $f17, 1*SIZE($20)
+ SXADDQ $21, $20, $20
+ unop
+
+ ST $f18, 0*SIZE($20)
+ ST $f19, 1*SIZE($20)
+ SXADDQ $21, $20, $20
+ bgt $4, $SubMainLoop
+ .align 4
+
+$SubMainLoopEnd:
+ MUL $f29, $f0, $f20
+ MUL $f30, $f1, $f21
+ MUL $f30, $f0, $f22
+ MUL $f29, $f1, $f23
+
+ MUL $f29, $f2, $f24
+ MUL $f30, $f3, $f25
+ MUL $f30, $f2, $f26
+ MUL $f29, $f3, $f27
+
+ ADD1 $f20, $f21, $f16
+ MUL $f29, $f4, $f20
+ ADD2 $f22, $f23, $f17
+ MUL $f30, $f5, $f21
+
+ ADD1 $f24, $f25, $f18
+ MUL $f30, $f4, $f22
+ ADD2 $f26, $f27, $f19
+ MUL $f29, $f5, $f23
+
+ ADD $f16, $f8, tmp
+ fmov tmp, $f16
+ MUL $f29, $f6, $f24
+ ADD $f17, $f28, tmp
+ fmov tmp, $f17
+ MUL $f30, $f7, $f25
+
+ ADD $f18, $f10, tmp
+ fmov tmp, $f18
+ MUL $f30, $f6, $f26
+ ADD $f19, $f11, tmp
+ fmov tmp, $f19
+ MUL $f29, $f7, $f27
+
+ ST $f16, 0*SIZE($20)
+ ADD1 $f20, $f21, $f16
+ ST $f17, 1*SIZE($20)
+ ADD2 $f22, $f23, $f17
+
+ SXADDQ $21, $20, $20
+ nop
+ ST $f18, 0*SIZE($20)
+ ADD1 $f24, $f25, $f18
+
+ ST $f19, 1*SIZE($20)
+ ADD2 $f26, $f27, $f19
+ SXADDQ $21, $20, $20
+ ADD $f16, $f12, tmp
+ fmov tmp, $f16
+
+ ADD $f17, $f13, tmp
+ fmov tmp, $f17
+ ADD $f18, $f14, tmp
+ fmov tmp, $f18
+ ADD $f19, $f15, tmp
+ fmov tmp, $f19
+
+ ST $f16, 0*SIZE($20)
+ ST $f17, 1*SIZE($20)
+ SXADDQ $21, $20, $20
+
+ ST $f18, 0*SIZE($20)
+ ST $f19, 1*SIZE($20)
+ SXADDQ $21, $20, $20
+ ble $5, $SubEnd
+ .align 4
+
+$SubRemain:
+ subl $5, 1, $6
+ ble $5, $SubEnd
+ LD $f0, 0*SIZE($18)
+ LD $f1, 1*SIZE($18)
+
+ LD $f8, 0*SIZE($20)
+ LD $f28, 1*SIZE($20)
+ SXADDQ $19, $18, $18
+ SXADDQ $21, $20, $24
+ ble $6, $SubRemainLoopEnd
+ .align 4
+
+$SubRemainLoop:
+ MUL $f29, $f0, $f20
+ MUL $f30, $f1, $f21
+ MUL $f30, $f0, $f22
+ LD $f0, 0*SIZE($18)
+
+ MUL $f29, $f1, $f23
+ LD $f1, 1*SIZE($18)
+ ADD1 $f20, $f21, $f16
+ SXADDQ $19, $18, $18
+
+ ADD2 $f22, $f23, $f17
+ nop
+ ADD $f16, $f8, tmp
+ fmov tmp, $f16
+ LD $f8, 0*SIZE($24)
+
+ ADD $f17, $f28, tmp
+ fmov tmp, $f17
+ LD $f28, 1*SIZE($24)
+ SXADDQ $21, $24, $24
+ subl $6, 1, $6
+
+ ST $f16, 0*SIZE($20)
+ ST $f17, 1*SIZE($20)
+ SXADDQ $21, $20, $20
+ bgt $6, $SubRemainLoop
+ .align 4
+
+$SubRemainLoopEnd:
+ MUL $f29, $f0, $f20
+ MUL $f30, $f1, $f21
+ MUL $f30, $f0, $f22
+ MUL $f29, $f1, $f23
+
+ ADD1 $f20, $f21, $f16
+ ADD2 $f22, $f23, $f17
+ ADD $f16, $f8, tmp
+ fmov tmp, $f16
+ ADD $f17, $f28, tmp
+ fmov tmp, $f17
+
+ ST $f16, 0*SIZE($20)
+ nop
+ ST $f17, 1*SIZE($20)
+ nop
+ .align 4
+
+$SubEnd:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd tmp, 56($sp)
+ ldi $sp, 64($sp)
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/zaxpy.S.bak b/kernel/sw_64/zaxpy.S.bak
new file mode 100644
index 0000000..c6cd44b
--- /dev/null
+++ b/kernel/sw_64/zaxpy.S.bak
@@ -0,0 +1,611 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define PREFETCHSIZE 40
+
+#ifndef CONJ
+#define ADD1 SUB
+#define ADD2 ADD
+#else
+#define ADD1 ADD
+#define ADD2 SUB
+#endif
+
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, 16, $26, 0
+
+ ldl $19, 0($sp)
+ fmov $f19, $f29
+ ldl $20, 8($sp)
+ fmov $f20, $f30
+
+ mov $21, $18
+ ldl $21, 16($sp)
+ ldi $sp, -64($sp)
+ nop
+
+ fstd $f2, 0($sp)
+ cmpeq $19, 1, $1
+ fstd $f3, 8($sp)
+ cmpeq $21, 1, $2
+
+ fstd $f4, 16($sp)
+ and $16, 3, $5
+ fstd $f5, 24($sp)
+ fstd $f6, 32($sp)
+
+ fstd $f7, 40($sp)
+ fstd $f8, 48($sp)
+#ifndef PROFILE
+ .prologue 0
+#else
+ .prologue 1
+#endif
+
+ and $1, $2, $1
+ ble $16, $End
+ sra $16, 2, $4
+ beq $1, $Sub
+
+ ble $4, $Remain
+ subl $4, 1, $4
+
+ LD $f0, 0*SIZE($18)
+ LD $f1, 1*SIZE($18)
+ LD $f2, 2*SIZE($18)
+ LD $f3, 3*SIZE($18)
+ LD $f4, 4*SIZE($18)
+ LD $f5, 5*SIZE($18)
+ LD $f6, 6*SIZE($18)
+ LD $f7, 7*SIZE($18)
+
+ LD $f8, 0*SIZE($20)
+ LD $f28, 1*SIZE($20)
+ LD $f10, 2*SIZE($20)
+ LD $f11, 3*SIZE($20)
+ LD $f12, 4*SIZE($20)
+ LD $f13, 5*SIZE($20)
+ LD $f14, 6*SIZE($20)
+ LD $f15, 7*SIZE($20)
+
+ addl $18, 8*SIZE, $18
+ ble $4, $MainLoopEnd
+ .align 4
+
+$MainLoop:
+ fillcs PREFETCHSIZE * SIZE($20)
+ fillcs PREFETCHSIZE * SIZE($18)
+
+ MUL $f29, $f0, $f20
+ fillcs 9*SIZE($18)
+ MUL $f30, $f1, $f21
+ unop
+
+ MUL $f30, $f0, $f22
+ LD $f0, 0*SIZE($18)
+ MUL $f29, $f1, $f23
+ LD $f1, 1*SIZE($18)
+
+ MUL $f29, $f2, $f24
+ unop
+ MUL $f30, $f3, $f25
+ nop
+
+ MUL $f30, $f2, $f26
+ LD $f2, 2*SIZE($18)
+ MUL $f29, $f3, $f27
+ LD $f3, 3*SIZE($18)
+
+ ADD1 $f20, $f21, $f16
+ MUL $f29, $f4, $f20
+ ADD2 $f22, $f23, $f17
+ MUL $f30, $f5, $f21
+
+ ADD1 $f24, $f25, $f18
+ unop
+ MUL $f30, $f4, $f22
+ LD $f4, 4*SIZE($18)
+
+ ADD2 $f26, $f27, $f19
+ addl $20, 8*SIZE, $20
+ MUL $f29, $f5, $f23
+ LD $f5, 5*SIZE($18)
+
+ ADD $f16, $f8, $f16
+ LD $f8, 0*SIZE($20)
+ MUL $f29, $f6, $f24
+ unop
+
+ ADD $f17, $f28, $f17
+ LD $f28, 1*SIZE($20)
+ MUL $f30, $f7, $f25
+ unop
+
+ ADD $f18, $f10, $f18
+ LD $f10, 2*SIZE($20)
+ MUL $f30, $f6, $f26
+ LD $f6, 6*SIZE($18)
+
+ ADD $f19, $f11, $f19
+ LD $f11, 3*SIZE($20)
+ MUL $f29, $f7, $f27
+ LD $f7, 7*SIZE($18)
+
+ ST $f16,-8*SIZE($20)
+ ADD1 $f20, $f21, $f16
+ ST $f17,-7*SIZE($20)
+ ADD2 $f22, $f23, $f17
+
+ ST $f18,-6*SIZE($20)
+ ADD1 $f24, $f25, $f18
+ ST $f19,-5*SIZE($20)
+ ADD2 $f26, $f27, $f19
+
+ ADD $f16, $f12, $f16
+ LD $f12, 4*SIZE($20)
+ ADD $f17, $f13, $f17
+ LD $f13, 5*SIZE($20)
+ ADD $f18, $f14, $f18
+ LD $f14, 6*SIZE($20)
+ ADD $f19, $f15, $f19
+ LD $f15, 7*SIZE($20)
+
+ ST $f16,-4*SIZE($20)
+ addl $18, 8*SIZE, $18
+ ST $f17,-3*SIZE($20)
+ subl $4, 1, $4
+
+ ST $f18,-2*SIZE($20)
+ nop
+ ST $f19,-1*SIZE($20)
+ bgt $4, $MainLoop
+ .align 4
+
+$MainLoopEnd:
+ MUL $f29, $f0, $f20
+ MUL $f30, $f1, $f21
+ MUL $f30, $f0, $f22
+ MUL $f29, $f1, $f23
+
+ MUL $f29, $f2, $f24
+ MUL $f30, $f3, $f25
+ MUL $f30, $f2, $f26
+ MUL $f29, $f3, $f27
+
+ ADD1 $f20, $f21, $f16
+ MUL $f29, $f4, $f20
+ ADD2 $f22, $f23, $f17
+ MUL $f30, $f5, $f21
+
+ ADD1 $f24, $f25, $f18
+ MUL $f30, $f4, $f22
+ ADD2 $f26, $f27, $f19
+ MUL $f29, $f5, $f23
+
+ ADD $f16, $f8, $f16
+ MUL $f29, $f6, $f24
+ ADD $f17, $f28, $f17
+ MUL $f30, $f7, $f25
+
+ ADD $f18, $f10, $f18
+ MUL $f30, $f6, $f26
+ ADD $f19, $f11, $f19
+ MUL $f29, $f7, $f27
+
+ ST $f16, 0*SIZE($20)
+ ADD1 $f20, $f21, $f16
+ ST $f17, 1*SIZE($20)
+ ADD2 $f22, $f23, $f17
+
+ ST $f18, 2*SIZE($20)
+ ADD1 $f24, $f25, $f18
+ ST $f19, 3*SIZE($20)
+ ADD2 $f26, $f27, $f19
+
+ ADD $f16, $f12, $f16
+ ADD $f17, $f13, $f17
+ ADD $f18, $f14, $f18
+ ADD $f19, $f15, $f19
+
+ ST $f16, 4*SIZE($20)
+ ST $f17, 5*SIZE($20)
+ ST $f18, 6*SIZE($20)
+ ST $f19, 7*SIZE($20)
+
+ unop
+ addl $20, 8*SIZE, $20
+ unop
+ ble $5, $End
+ .align 4
+
+$Remain:
+ subl $5, 1, $6
+ ble $5, $End
+ LD $f0, 0*SIZE($18)
+ LD $f1, 1*SIZE($18)
+
+ LD $f8, 0*SIZE($20)
+ LD $f28, 1*SIZE($20)
+ addl $18, 2*SIZE, $18
+ ble $6, $RemainLoopEnd
+ .align 4
+
+$RemainLoop:
+ MUL $f29, $f0, $f20
+ subl $6, 1, $6
+ MUL $f30, $f1, $f21
+ addl $20, 2*SIZE, $20
+
+ MUL $f30, $f0, $f22
+ LD $f0, 0*SIZE($18)
+ MUL $f29, $f1, $f23
+ LD $f1, 1*SIZE($18)
+
+ ADD1 $f20, $f21, $f16
+ ADD2 $f22, $f23, $f17
+ ADD $f16, $f8, $f16
+ LD $f8, 0*SIZE($20)
+ ADD $f17, $f28, $f17
+ LD $f28, 1*SIZE($20)
+
+ ST $f16,-2*SIZE($20)
+ addl $18, 2*SIZE, $18
+ ST $f17,-1*SIZE($20)
+ bgt $6, $RemainLoop
+ .align 4
+
+$RemainLoopEnd:
+ MUL $f29, $f0, $f20
+ MUL $f30, $f1, $f21
+ MUL $f30, $f0, $f22
+ MUL $f29, $f1, $f23
+
+ ADD1 $f20, $f21, $f16
+ ADD2 $f22, $f23, $f17
+ ADD $f16, $f8, $f16
+ ADD $f17, $f28, $f17
+
+ ST $f16, 0*SIZE($20)
+ nop
+ ST $f17, 1*SIZE($20)
+ nop
+ .align 4
+
+$End:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ ldi $sp, 64($sp)
+ ret
+ .align 4
+
+$Sub:
+ SXSUBL $16, SIZE, $22
+ addl $22, $22, $22 # Complex
+ .align 4
+
+ addl $19, $19, $19 # Complex
+ addl $21, $21, $21 # Complex
+
+ ble $4, $SubRemain
+ LD $f0, 0*SIZE($18)
+ LD $f1, 1*SIZE($18)
+ SXADDQ $19, $18, $18
+
+ LD $f2, 0*SIZE($18)
+ LD $f3, 1*SIZE($18)
+ SXADDQ $19, $18, $18
+
+ LD $f4, 0*SIZE($18)
+ LD $f5, 1*SIZE($18)
+ SXADDQ $19, $18, $18
+
+ LD $f6, 0*SIZE($18)
+ LD $f7, 1*SIZE($18)
+ SXADDQ $19, $18, $18
+
+ LD $f8, 0*SIZE($20)
+ LD $f28, 1*SIZE($20)
+ SXADDQ $21, $20, $24
+
+ LD $f10, 0*SIZE($24)
+ LD $f11, 1*SIZE($24)
+ SXADDQ $21, $24, $24
+
+ LD $f12, 0*SIZE($24)
+ LD $f13, 1*SIZE($24)
+ SXADDQ $21, $24, $24
+
+ LD $f14, 0*SIZE($24)
+ LD $f15, 1*SIZE($24)
+ SXADDQ $21, $24, $24
+
+ subl $4, 1, $4
+ ble $4, $SubMainLoopEnd
+ .align 4
+
+$SubMainLoop:
+ MUL $f29, $f0, $f20
+ unop
+ MUL $f30, $f1, $f21
+ unop
+
+ MUL $f30, $f0, $f22
+ LD $f0, 0*SIZE($18)
+ MUL $f29, $f1, $f23
+ LD $f1, 1*SIZE($18)
+
+ MUL $f29, $f2, $f24
+ SXADDQ $19, $18, $18
+ MUL $f30, $f3, $f25
+ unop
+
+ MUL $f30, $f2, $f26
+ LD $f2, 0*SIZE($18)
+ MUL $f29, $f3, $f27
+ LD $f3, 1*SIZE($18)
+
+ ADD1 $f20, $f21, $f16
+ SXADDQ $19, $18, $18
+ MUL $f29, $f4, $f20
+ unop
+
+ ADD2 $f22, $f23, $f17
+ unop
+ MUL $f30, $f5, $f21
+ unop
+
+ ADD1 $f24, $f25, $f18
+ unop
+ MUL $f30, $f4, $f22
+ LD $f4, 0*SIZE($18)
+
+ ADD2 $f26, $f27, $f19
+ unop
+ MUL $f29, $f5, $f23
+ LD $f5, 1*SIZE($18)
+
+ ADD $f16, $f8, $f16
+ LD $f8, 0*SIZE($24)
+ MUL $f29, $f6, $f24
+ SXADDQ $19, $18, $18
+
+ ADD $f17, $f28, $f17
+ LD $f28, 1*SIZE($24)
+ MUL $f30, $f7, $f25
+ SXADDQ $21, $24, $24
+
+ ADD $f18, $f10, $f18
+ LD $f10, 0*SIZE($24)
+ MUL $f30, $f6, $f26
+ LD $f6, 0*SIZE($18)
+
+ ADD $f19, $f11, $f19
+ LD $f11, 1*SIZE($24)
+ MUL $f29, $f7, $f27
+ LD $f7, 1*SIZE($18)
+
+ ST $f16, 0*SIZE($20)
+ SXADDQ $19, $18, $18
+ ADD1 $f20, $f21, $f16
+ unop
+
+ ST $f17, 1*SIZE($20)
+ SXADDQ $21, $20, $20
+ ADD2 $f22, $f23, $f17
+ unop
+
+ ST $f18, 0*SIZE($20)
+ SXADDQ $21, $24, $24
+ ADD1 $f24, $f25, $f18
+ unop
+
+ ST $f19, 1*SIZE($20)
+ unop
+ ADD2 $f26, $f27, $f19
+ SXADDQ $21, $20, $20
+
+ ADD $f16, $f12, $f16
+ unop
+ LD $f12, 0*SIZE($24)
+ unop
+
+ ADD $f17, $f13, $f17
+ unop
+ LD $f13, 1*SIZE($24)
+ SXADDQ $21, $24, $24
+
+ ADD $f18, $f14, $f18
+ subl $4, 1, $4
+ LD $f14, 0*SIZE($24)
+ unop
+
+ ADD $f19, $f15, $f19
+ unop
+ LD $f15, 1*SIZE($24)
+ SXADDQ $21, $24, $24
+
+ ST $f16, 0*SIZE($20)
+ ST $f17, 1*SIZE($20)
+ SXADDQ $21, $20, $20
+ unop
+
+ ST $f18, 0*SIZE($20)
+ ST $f19, 1*SIZE($20)
+ SXADDQ $21, $20, $20
+ bgt $4, $SubMainLoop
+ .align 4
+
+$SubMainLoopEnd:
+ MUL $f29, $f0, $f20
+ MUL $f30, $f1, $f21
+ MUL $f30, $f0, $f22
+ MUL $f29, $f1, $f23
+
+ MUL $f29, $f2, $f24
+ MUL $f30, $f3, $f25
+ MUL $f30, $f2, $f26
+ MUL $f29, $f3, $f27
+
+ ADD1 $f20, $f21, $f16
+ MUL $f29, $f4, $f20
+ ADD2 $f22, $f23, $f17
+ MUL $f30, $f5, $f21
+
+ ADD1 $f24, $f25, $f18
+ MUL $f30, $f4, $f22
+ ADD2 $f26, $f27, $f19
+ MUL $f29, $f5, $f23
+
+ ADD $f16, $f8, $f16
+ MUL $f29, $f6, $f24
+ ADD $f17, $f28, $f17
+ MUL $f30, $f7, $f25
+
+ ADD $f18, $f10, $f18
+ MUL $f30, $f6, $f26
+ ADD $f19, $f11, $f19
+ MUL $f29, $f7, $f27
+
+ ST $f16, 0*SIZE($20)
+ ADD1 $f20, $f21, $f16
+ ST $f17, 1*SIZE($20)
+ ADD2 $f22, $f23, $f17
+
+ SXADDQ $21, $20, $20
+ nop
+ ST $f18, 0*SIZE($20)
+ ADD1 $f24, $f25, $f18
+
+ ST $f19, 1*SIZE($20)
+ ADD2 $f26, $f27, $f19
+ SXADDQ $21, $20, $20
+ ADD $f16, $f12, $f16
+
+ ADD $f17, $f13, $f17
+ ADD $f18, $f14, $f18
+ ADD $f19, $f15, $f19
+
+ ST $f16, 0*SIZE($20)
+ ST $f17, 1*SIZE($20)
+ SXADDQ $21, $20, $20
+
+ ST $f18, 0*SIZE($20)
+ ST $f19, 1*SIZE($20)
+ SXADDQ $21, $20, $20
+ ble $5, $SubEnd
+ .align 4
+
+$SubRemain:
+ subl $5, 1, $6
+ ble $5, $SubEnd
+ LD $f0, 0*SIZE($18)
+ LD $f1, 1*SIZE($18)
+
+ LD $f8, 0*SIZE($20)
+ LD $f28, 1*SIZE($20)
+ SXADDQ $19, $18, $18
+ SXADDQ $21, $20, $24
+ ble $6, $SubRemainLoopEnd
+ .align 4
+
+$SubRemainLoop:
+ MUL $f29, $f0, $f20
+ MUL $f30, $f1, $f21
+ MUL $f30, $f0, $f22
+ LD $f0, 0*SIZE($18)
+
+ MUL $f29, $f1, $f23
+ LD $f1, 1*SIZE($18)
+ ADD1 $f20, $f21, $f16
+ SXADDQ $19, $18, $18
+
+ ADD2 $f22, $f23, $f17
+ nop
+ ADD $f16, $f8, $f16
+ LD $f8, 0*SIZE($24)
+
+ ADD $f17, $f28, $f17
+ LD $f28, 1*SIZE($24)
+ SXADDQ $21, $24, $24
+ subl $6, 1, $6
+
+ ST $f16, 0*SIZE($20)
+ ST $f17, 1*SIZE($20)
+ SXADDQ $21, $20, $20
+ bgt $6, $SubRemainLoop
+ .align 4
+
+$SubRemainLoopEnd:
+ MUL $f29, $f0, $f20
+ MUL $f30, $f1, $f21
+ MUL $f30, $f0, $f22
+ MUL $f29, $f1, $f23
+
+ ADD1 $f20, $f21, $f16
+ ADD2 $f22, $f23, $f17
+ ADD $f16, $f8, $f16
+ ADD $f17, $f28, $f17
+
+ ST $f16, 0*SIZE($20)
+ nop
+ ST $f17, 1*SIZE($20)
+ nop
+ .align 4
+
+$SubEnd:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ ldi $sp, 64($sp)
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/zaxpy_simd.S b/kernel/sw_64/zaxpy_simd.S
new file mode 100644
index 0000000..a823ebf
--- /dev/null
+++ b/kernel/sw_64/zaxpy_simd.S
@@ -0,0 +1,1485 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define PREFETCHSIZE 128
+
+#ifndef CONJ
+#define ADD1 SUB
+#define ADD2 ADD
+
+#define VADD1 VSUB
+#define VADD2 VADD
+#define VMAD1 VNMAD
+#define VMAD2 VMAD
+
+#else
+#define ADD1 ADD
+#define ADD2 SUB
+
+#define VADD1 VADD
+#define VADD2 VSUB
+#define VMAD1 VMAD
+#define VMAD2 VNMAD
+
+#endif
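+/* VMAD1/VMAD2 are the fused multiply-add counterparts of ADD1/ADD2, so the
+   SIMD path forms the same real part (alpha_r*x_r - alpha_i*x_i) and
+   imaginary part (alpha_i*x_r + alpha_r*x_i) as the scalar kernel. */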
+
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, 64, $26, 0
+
+ ldl $19, 0($sp)
+ fmov $f19, $f29
+ ldl $20, 8($sp)
+ fmov $f20, $f30
+
+ mov $21, $18
+ ldl $21, 16($sp)
+ ldi $sp, -64($sp)
+ nop
+
+ fstd $f2, 0($sp)
+ cmpeq $19, 1, $1
+ fstd $f3, 8($sp)
+ cmpeq $21, 1, $2
+
+ fstd $f4, 16($sp)
+ fstd $f5, 24($sp)
+ fstd $f6, 32($sp)
+ nop
+
+ fstd $f7, 40($sp)
+ fstd $f8, 48($sp)
+#ifndef PROFILE
+ .prologue 0
+#else
+ .prologue 1
+#endif
+/*
+ unroll by 8: each iteration processes 8 complex = 16 float/double elements
+*/
+ and $1, $2, $1
+ ble $16, $End
+ sra $16, 3, $4
+ and $16, 7, $5
+
+ beq $1, $Sub
+ ble $4, $Remain
+ subl $4, 1, $4
+ nop
+/* extend alpha_r and alpha_i to full vectors (broadcast into every lane) */
+
+ vcpyf $f29, $f29
+ vcpyf $f30, $f30
+
+/**
+ alignment check:
+ test whether the addresses of Y and X are vector-aligned
+**/
+ and $20, (VEC_LEN*SIZE-1), $6
+ bgt $6, $UnAlign_Y_ACCESS
+
+ and $18, (VEC_LEN*SIZE-1), $7
+ nop
+ nop
+ bgt $7, $UnAlign_X_ACCESS
+
+ .align 4
+
+ VLD $f0, 0*VEC_LEN*SIZE($18)
+ VLD $f1, 1*VEC_LEN*SIZE($18)
+ VLD $f2, 2*VEC_LEN*SIZE($18)
+ VLD $f3, 3*VEC_LEN*SIZE($18)
+
+/*
+ LD $f0, 0*SIZE($18)
+ LD $f1, 1*SIZE($18)
+ LD $f2, 2*SIZE($18)
+ LD $f3, 3*SIZE($18)
+
+ LD $f4, 4*SIZE($18)
+ LD $f5, 5*SIZE($18)
+ LD $f6, 6*SIZE($18)
+ LD $f7, 7*SIZE($18)
+*/
+
+ VLD $f8, 0*VEC_LEN*SIZE($20)
+ VLD $f28, 1*VEC_LEN*SIZE($20)
+ VLD $f10, 2*VEC_LEN*SIZE($20)
+ VLD $f11, 3*VEC_LEN*SIZE($20)
+
+/*
+ LD $f8, 0*SIZE($20)
+ LD $f28, 1*SIZE($20)
+ LD $f10, 2*SIZE($20)
+ LD $f11, 3*SIZE($20)
+ LD $f12, 4*SIZE($20)
+ LD $f13, 5*SIZE($20)
+ LD $f14, 6*SIZE($20)
+ LD $f15, 7*SIZE($20)
+*/
+ addl $18, 16*SIZE, $18
+ ble $4, $MainLoopEnd
+ .align 4
+
+$MainLoop:
+/*
+ fillcs PREFETCHSIZE * SIZE($20)
+ fillcs PREFETCHSIZE * SIZE($18)
+*/
+ fillcs PREFETCHSIZE * SIZE($20)
+ fillcs PREFETCHSIZE * SIZE($18)
+
+/* split the complex vectors into a real vector ($f0) and an imaginary vector ($f1) */
+ vextf $f0, 1, $f4
+ vextf $f0, 3, $f5
+ vextf $f1, 0, $f6
+ vextf $f1, 2, $f7
+
+ vextf $f2, 1, $f12
+ vextf $f2, 3, $f13
+ vextf $f3, 0, $f14
+ vextf $f3, 2, $f15
+
+ vinsf $f4, $f1, 0, $f1
+ vinsf $f5, $f1, 2, $f1
+ vinsf $f6, $f0, 1, $f0
+ vinsf $f7, $f0, 3, $f0
+
+ vinsf $f12, $f3, 0, $f3
+ vinsf $f13, $f3, 2, $f3
+ vinsf $f14, $f2, 1, $f2
+ vinsf $f15, $f2, 3, $f2
+
+/*Compute*/
+ VMUL $f29, $f0, $f20
+ VMUL $f30, $f0, $f21
+ VMUL $f29, $f2, $f22
+ VMUL $f30, $f2, $f23
+
+
+ VMAD1 $f30, $f1, $f20, $f16
+ VMAD2 $f29, $f1, $f21, $f17
+ VMAD1 $f30, $f3, $f22, $f18
+ VMAD2 $f29, $f3, $f23, $f19
+
+ VLD $f0, 0*VEC_LEN*SIZE($18)
+ VLD $f1, 1*VEC_LEN*SIZE($18)
+ VLD $f2, 2*VEC_LEN*SIZE($18)
+ VLD $f3, 3*VEC_LEN*SIZE($18)
+
+/* recombine the real and imaginary vectors into the interleaved complex layout */
+ vextf $f16, 1, $f24
+ vextf $f16, 3, $f25
+ vextf $f17, 0, $f26
+ vextf $f17, 2, $f27
+
+ vextf $f18, 1, $f12
+ vextf $f18, 3, $f13
+ vextf $f19, 0, $f14
+ vextf $f19, 2, $f15
+
+ vinsf $f24, $f17, 0, $f17
+ addl $20, 16*SIZE, $20
+ vinsf $f25, $f17, 2, $f17
+ addl $18, 16*SIZE, $18
+
+ vinsf $f26, $f16, 1, $f16
+ subl $4, 1, $4
+ vinsf $f27, $f16, 3, $f16
+ nop
+
+ vinsf $f12, $f19, 0, $f19
+ vinsf $f13, $f19, 2, $f19
+ vinsf $f14, $f18, 1, $f18
+ vinsf $f15, $f18, 3, $f18
+
+ VADD $f16, $f8, $f16
+ VLD $f8, 0*VEC_LEN*SIZE($20)
+ VADD $f17, $f28, $f17
+ VLD $f28, 1*VEC_LEN*SIZE($20)
+
+ VADD $f18, $f10, $f18
+ VLD $f10, 2*VEC_LEN*SIZE($20)
+ VADD $f19, $f11, $f19
+ VLD $f11, 3*VEC_LEN*SIZE($20)
+
+ VST $f16, -4*VEC_LEN*SIZE($20)
+ VST $f17, -3*VEC_LEN*SIZE($20)
+ VST $f18, -2*VEC_LEN*SIZE($20)
+ VST $f19, -1*VEC_LEN*SIZE($20)
+
+/*
+ MUL $f29, $f0, $f20
+ fillcs 9*SIZE($18)
+ MUL $f30, $f1, $f21
+ unop
+
+ MUL $f30, $f0, $f22
+ LD $f0, 0*SIZE($18)
+ MUL $f29, $f1, $f23
+ LD $f1, 1*SIZE($18)
+
+ MUL $f29, $f2, $f24
+ unop
+ MUL $f30, $f3, $f25
+ nop
+
+ MUL $f30, $f2, $f26
+ LD $f2, 2*SIZE($18)
+ MUL $f29, $f3, $f27
+ LD $f3, 3*SIZE($18)
+
+ ADD1 $f20, $f21, $f16
+ MUL $f29, $f4, $f20
+ ADD2 $f22, $f23, $f17
+ MUL $f30, $f5, $f21
+
+ ADD1 $f24, $f25, $f18
+ unop
+ MUL $f30, $f4, $f22
+ LD $f4, 4*SIZE($18)
+
+ ADD2 $f26, $f27, $f19
+ addl $20, 8*SIZE, $20
+ MUL $f29, $f5, $f23
+ LD $f5, 5*SIZE($18)
+
+ ADD $f16, $f8, $f16
+ LD $f8, 0*SIZE($20)
+ MUL $f29, $f6, $f24
+ unop
+
+ ADD $f17, $f28, $f17
+ LD $f28, 1*SIZE($20)
+ MUL $f30, $f7, $f25
+ unop
+
+ ADD $f18, $f10, $f18
+ LD $f10, 2*SIZE($20)
+ MUL $f30, $f6, $f26
+ LD $f6, 6*SIZE($18)
+
+ ADD $f19, $f11, $f19
+ LD $f11, 3*SIZE($20)
+ MUL $f29, $f7, $f27
+ LD $f7, 7*SIZE($18)
+
+ ST $f16,-8*SIZE($20)
+ ADD1 $f20, $f21, $f16
+ ST $f17,-7*SIZE($20)
+ ADD2 $f22, $f23, $f17
+
+ ST $f18,-6*SIZE($20)
+ ADD1 $f24, $f25, $f18
+ ST $f19,-5*SIZE($20)
+ ADD2 $f26, $f27, $f19
+
+ ADD $f16, $f12, $f16
+ LD $f12, 4*SIZE($20)
+ ADD $f17, $f13, $f17
+ LD $f13, 5*SIZE($20)
+ ADD $f18, $f14, $f18
+ LD $f14, 6*SIZE($20)
+ ADD $f19, $f15, $f19
+ LD $f15, 7*SIZE($20)
+
+ ST $f16,-4*SIZE($20)
+
+ ST $f17,-3*SIZE($20)
+
+
+ ST $f18,-2*SIZE($20)
+ nop
+ ST $f19,-1*SIZE($20)
+*/
+ bgt $4, $MainLoop
+ .align 4
+
+$MainLoopEnd:
+
+/* split the complex vectors into a real vector ($f0) and an imaginary vector ($f1) */
+ vextf $f0, 1, $f4
+ vextf $f0, 3, $f5
+ vextf $f1, 0, $f6
+ vextf $f1, 2, $f7
+
+ vextf $f2, 1, $f12
+ vextf $f2, 3, $f13
+ vextf $f3, 0, $f14
+ vextf $f3, 2, $f15
+
+ vinsf $f4, $f1, 0, $f1
+ vinsf $f5, $f1, 2, $f1
+ vinsf $f6, $f0, 1, $f0
+ vinsf $f7, $f0, 3, $f0
+
+ vinsf $f12, $f3, 0, $f3
+ vinsf $f13, $f3, 2, $f3
+ vinsf $f14, $f2, 1, $f2
+ vinsf $f15, $f2, 3, $f2
+
+ VMUL $f29, $f0, $f20
+ VMUL $f30, $f0, $f21
+ VMUL $f29, $f2, $f22
+ VMUL $f30, $f2, $f23
+
+ VMAD1 $f30, $f1, $f20, $f16
+ VMAD2 $f29, $f1, $f21, $f17
+ VMAD1 $f30, $f3, $f22, $f18
+ VMAD2 $f29, $f3, $f23, $f19
+
+/* recombine the real (f16,f18) and imaginary (f17,f19) vectors into the interleaved complex layout */
+ vextf $f16, 1, $f24
+ vextf $f16, 3, $f25
+ vextf $f17, 0, $f26
+ vextf $f17, 2, $f27
+
+ vextf $f18, 1, $f12
+ vextf $f18, 3, $f13
+ vextf $f19, 0, $f14
+ vextf $f19, 2, $f15
+
+ vinsf $f24, $f17, 0, $f17
+ vinsf $f25, $f17, 2, $f17
+ vinsf $f26, $f16, 1, $f16
+ vinsf $f27, $f16, 3, $f16
+
+ vinsf $f12, $f19, 0, $f19
+ vinsf $f13, $f19, 2, $f19
+ vinsf $f14, $f18, 1, $f18
+ vinsf $f15, $f18, 3, $f18
+
+ VADD $f16, $f8, $f16
+ VADD $f17, $f28, $f17
+ VADD $f18, $f10, $f18
+ VADD $f19, $f11, $f19
+
+ VST $f16, 0*VEC_LEN*SIZE($20)
+ VST $f17, 1*VEC_LEN*SIZE($20)
+ VST $f18, 2*VEC_LEN*SIZE($20)
+ VST $f19, 3*VEC_LEN*SIZE($20)
+
+ addl $20, 16*SIZE, $20
+ ble $5, $End
+
+/* MUL $f29, $f0, $f20
+ MUL $f30, $f1, $f21
+ MUL $f30, $f0, $f22
+ MUL $f29, $f1, $f23
+
+ MUL $f29, $f2, $f24
+ MUL $f30, $f3, $f25
+ MUL $f30, $f2, $f26
+ MUL $f29, $f3, $f27
+
+ ADD1 $f20, $f21, $f16
+ MUL $f29, $f4, $f20
+ ADD2 $f22, $f23, $f17
+ MUL $f30, $f5, $f21
+
+ ADD1 $f24, $f25, $f18
+ MUL $f30, $f4, $f22
+ ADD2 $f26, $f27, $f19
+ MUL $f29, $f5, $f23
+
+ ADD $f16, $f8, $f16
+ MUL $f29, $f6, $f24
+ ADD $f17, $f28, $f17
+ MUL $f30, $f7, $f25
+
+ ADD $f18, $f10, $f18
+ MUL $f30, $f6, $f26
+ ADD $f19, $f11, $f19
+ MUL $f29, $f7, $f27
+
+ ST $f16, 0*SIZE($20)
+ ADD1 $f20, $f21, $f16
+ ST $f17, 1*SIZE($20)
+ ADD2 $f22, $f23, $f17
+
+ ST $f18, 2*SIZE($20)
+ ADD1 $f24, $f25, $f18
+ ST $f19, 3*SIZE($20)
+ ADD2 $f26, $f27, $f19
+
+ ADD $f16, $f12, $f16
+ ADD $f17, $f13, $f17
+ ADD $f18, $f14, $f18
+ ADD $f19, $f15, $f19
+
+ ST $f16, 4*SIZE($20)
+ ST $f17, 5*SIZE($20)
+ ST $f18, 6*SIZE($20)
+ ST $f19, 7*SIZE($20)
+
+ unop
+ unop
+*/
+ .align 4
+
+$Remain:
+ subl $5, 1, $6
+ ble $5, $End
+ LD $f0, 0*SIZE($18)
+ LD $f1, 1*SIZE($18)
+
+ LD $f8, 0*SIZE($20)
+ LD $f28, 1*SIZE($20)
+ addl $18, 2*SIZE, $18
+ ble $6, $RemainLoopEnd
+ .align 4
+
+$RemainLoop:
+ MUL $f29, $f0, $f20
+ subl $6, 1, $6
+ MUL $f30, $f1, $f21
+ addl $20, 2*SIZE, $20
+
+ MUL $f30, $f0, $f22
+ LD $f0, 0*SIZE($18)
+ MUL $f29, $f1, $f23
+ LD $f1, 1*SIZE($18)
+
+ ADD1 $f20, $f21, $f16
+ ADD2 $f22, $f23, $f17
+ ADD $f16, $f8, $f16
+ LD $f8, 0*SIZE($20)
+ ADD $f17, $f28, $f17
+ LD $f28, 1*SIZE($20)
+
+ ST $f16,-2*SIZE($20)
+ addl $18, 2*SIZE, $18
+ ST $f17,-1*SIZE($20)
+ bgt $6, $RemainLoop
+ .align 4
+
+$RemainLoopEnd:
+ MUL $f29, $f0, $f20
+ MUL $f30, $f1, $f21
+ MUL $f30, $f0, $f22
+ MUL $f29, $f1, $f23
+
+ ADD1 $f20, $f21, $f16
+ ADD2 $f22, $f23, $f17
+ ADD $f16, $f8, $f16
+ ADD $f17, $f28, $f17
+
+ ST $f16, 0*SIZE($20)
+ nop
+ ST $f17, 1*SIZE($20)
+ nop
+ .align 4
+
+$End:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ ldi $sp, 64($sp)
+ ret
+ .align 4
+
+$UnAlign_Y_ACCESS:
+ and $18, (VEC_LEN*SIZE-1), $7
+ nop
+ nop
+ bgt $7, $UnAlign_XY_ACCESS
+ .align 4
+/*
+ Unaligned access to Y, aligned access to X
+*/
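+/* Each unaligned vector is fetched as a low/high pair (VLD_UL/VLD_UH) and
+   the two parts are merged with vbisw before use; unaligned stores go out
+   the same way through VST_UL/VST_UH pairs. */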
+
+ VLD_UL $f8, 0*VEC_LEN*SIZE($20)
+ VLD_UH $f12, 1*VEC_LEN*SIZE($20)
+
+ VLD_UL $f28, 1*VEC_LEN*SIZE($20)
+ VLD_UH $f13, 2*VEC_LEN*SIZE($20)
+
+ VLD_UL $f10, 2*VEC_LEN*SIZE($20)
+ VLD_UH $f14, 3*VEC_LEN*SIZE($20)
+
+ VLD_UL $f11, 3*VEC_LEN*SIZE($20)
+ VLD_UH $f15, 4*VEC_LEN*SIZE($20)
+
+ VLD $f0, 0*VEC_LEN*SIZE($18)
+ VLD $f1, 1*VEC_LEN*SIZE($18)
+ VLD $f2, 2*VEC_LEN*SIZE($18)
+ VLD $f3, 3*VEC_LEN*SIZE($18)
+
+ vbisw $f8, $f12, $f8
+ vbisw $f28, $f13, $f28
+ vbisw $f10, $f14, $f10
+ vbisw $f11, $f15, $f11
+
+ addl $18, 16*SIZE, $18
+ ble $4, $UnAlign_Y_MainLoopEnd
+ .align 4
+$UnAlign_Y_MainLoop:
+ fillcs PREFETCHSIZE * SIZE($20)
+ fillcs PREFETCHSIZE * SIZE($18)
+
+/* split the complex vectors into a real vector ($f0) and an imaginary vector ($f1) */
+ vextf $f0, 1, $f4
+ vextf $f0, 3, $f5
+ vextf $f1, 0, $f6
+ vextf $f1, 2, $f7
+
+ vextf $f2, 1, $f12
+ vextf $f2, 3, $f13
+ vextf $f3, 0, $f14
+ vextf $f3, 2, $f15
+
+ vinsf $f4, $f1, 0, $f1
+ vinsf $f5, $f1, 2, $f1
+ vinsf $f6, $f0, 1, $f0
+ vinsf $f7, $f0, 3, $f0
+
+ vinsf $f12, $f3, 0, $f3
+ vinsf $f13, $f3, 2, $f3
+ vinsf $f14, $f2, 1, $f2
+ vinsf $f15, $f2, 3, $f2
+
+/*Compute*/
+ VMUL $f29, $f0, $f20
+ VMUL $f30, $f0, $f21
+ VMUL $f29, $f2, $f22
+ VMUL $f30, $f2, $f23
+
+
+ VMAD1 $f30, $f1, $f20, $f16
+ VMAD2 $f29, $f1, $f21, $f17
+ VMAD1 $f30, $f3, $f22, $f18
+ VMAD2 $f29, $f3, $f23, $f19
+
+ VLD $f0, 0*VEC_LEN*SIZE($18)
+ VLD $f1, 1*VEC_LEN*SIZE($18)
+ VLD $f2, 2*VEC_LEN*SIZE($18)
+ VLD $f3, 3*VEC_LEN*SIZE($18)
+
+
+/* recombine the real and imaginary vectors into the interleaved complex layout */
+ vextf $f16, 1, $f24
+ vextf $f16, 3, $f25
+ vextf $f17, 0, $f26
+ vextf $f17, 2, $f27
+
+ vextf $f18, 1, $f12
+ vextf $f18, 3, $f13
+ vextf $f19, 0, $f14
+ vextf $f19, 2, $f15
+
+ vinsf $f24, $f17, 0, $f17
+ addl $20, 16*SIZE, $20
+ vinsf $f25, $f17, 2, $f17
+ addl $18, 16*SIZE, $18
+
+ vinsf $f26, $f16, 1, $f16
+ subl $4, 1, $4
+ vinsf $f27, $f16, 3, $f16
+ nop
+
+ vinsf $f12, $f19, 0, $f19
+ vinsf $f13, $f19, 2, $f19
+ vinsf $f14, $f18, 1, $f18
+ vinsf $f15, $f18, 3, $f18
+
+ VADD $f16, $f8, $f16
+ VLD_UL $f8, 0*VEC_LEN*SIZE($20)
+ VLD_UH $f12, 1*VEC_LEN*SIZE($20)
+
+ VADD $f17, $f28, $f17
+ VLD_UL $f28, 1*VEC_LEN*SIZE($20)
+ VLD_UH $f13, 2*VEC_LEN*SIZE($20)
+
+
+ VADD $f18, $f10, $f18
+ VLD_UL $f10, 2*VEC_LEN*SIZE($20)
+ VLD_UH $f14, 3*VEC_LEN*SIZE($20)
+
+ VADD $f19, $f11, $f19
+ VLD_UL $f11, 3*VEC_LEN*SIZE($20)
+ VLD_UH $f15, 4*VEC_LEN*SIZE($20)
+
+
+ vbisw $f8, $f12, $f8
+ VST_UL $f16, -4*VEC_LEN*SIZE($20)
+ VST_UH $f16, -3*VEC_LEN*SIZE($20)
+
+ vbisw $f28, $f13, $f28
+ VST_UL $f17, -3*VEC_LEN*SIZE($20)
+ VST_UH $f17, -2*VEC_LEN*SIZE($20)
+
+ vbisw $f10, $f14, $f10
+ VST_UL $f18, -2*VEC_LEN*SIZE($20)
+ VST_UH $f18, -1*VEC_LEN*SIZE($20)
+
+ vbisw $f11, $f15, $f11
+ VST_UL $f19, -1*VEC_LEN*SIZE($20)
+ VST_UH $f19, 0*VEC_LEN*SIZE($20)
+
+ bgt $4, $UnAlign_Y_MainLoop
+
+$UnAlign_Y_MainLoopEnd:
+/* split the complex vectors into a real vector ($f0) and an imaginary vector ($f1) */
+ vextf $f0, 1, $f4
+ vextf $f0, 3, $f5
+ vextf $f1, 0, $f6
+ vextf $f1, 2, $f7
+
+ vextf $f2, 1, $f12
+ vextf $f2, 3, $f13
+ vextf $f3, 0, $f14
+ vextf $f3, 2, $f15
+
+ vinsf $f4, $f1, 0, $f1
+ vinsf $f5, $f1, 2, $f1
+ vinsf $f6, $f0, 1, $f0
+ vinsf $f7, $f0, 3, $f0
+
+ vinsf $f12, $f3, 0, $f3
+ vinsf $f13, $f3, 2, $f3
+ vinsf $f14, $f2, 1, $f2
+ vinsf $f15, $f2, 3, $f2
+
+ VMUL $f29, $f0, $f20
+ VMUL $f30, $f0, $f21
+ VMUL $f29, $f2, $f22
+ VMUL $f30, $f2, $f23
+
+ VMAD1 $f30, $f1, $f20, $f16
+ VMAD2 $f29, $f1, $f21, $f17
+ VMAD1 $f30, $f3, $f22, $f18
+ VMAD2 $f29, $f3, $f23, $f19
+
+/* recombine the real (f16,f18) and imaginary (f17,f19) vectors into the interleaved complex layout */
+ vextf $f16, 1, $f24
+ vextf $f16, 3, $f25
+ vextf $f17, 0, $f26
+ vextf $f17, 2, $f27
+
+ vextf $f18, 1, $f12
+ vextf $f18, 3, $f13
+ vextf $f19, 0, $f14
+ vextf $f19, 2, $f15
+
+ vinsf $f24, $f17, 0, $f17
+ vinsf $f25, $f17, 2, $f17
+ vinsf $f26, $f16, 1, $f16
+ vinsf $f27, $f16, 3, $f16
+
+ vinsf $f12, $f19, 0, $f19
+ vinsf $f13, $f19, 2, $f19
+ vinsf $f14, $f18, 1, $f18
+ vinsf $f15, $f18, 3, $f18
+
+ VADD $f16, $f8, $f16
+ VADD $f17, $f28, $f17
+ VADD $f18, $f10, $f18
+ VADD $f19, $f11, $f19
+
+ VST_UL $f16, 0*VEC_LEN*SIZE($20)
+ VST_UH $f16, 1*VEC_LEN*SIZE($20)
+ VST_UL $f17, 1*VEC_LEN*SIZE($20)
+ VST_UH $f17, 2*VEC_LEN*SIZE($20)
+
+ VST_UL $f18, 2*VEC_LEN*SIZE($20)
+ VST_UH $f18, 3*VEC_LEN*SIZE($20)
+ VST_UL $f19, 3*VEC_LEN*SIZE($20)
+ VST_UH $f19, 4*VEC_LEN*SIZE($20)
+
+ addl $20, 16*SIZE, $20
+ ble $5, $End
+
+ jmp $Remain
+
+ .align 4
+
+
+$UnAlign_X_ACCESS:
+ and $20, (VEC_LEN*SIZE-1), $6
+ nop
+ nop
+ bgt $6, $UnAlign_XY_ACCESS
+
+ .align 4
+/*
+ Unaligned access to X, aligned access to Y
+*/
+ VLD_UL $f0, 0*VEC_LEN*SIZE($18)
+ VLD_UH $f4, 1*VEC_LEN*SIZE($18)
+
+ VLD_UL $f1, 1*VEC_LEN*SIZE($18)
+ VLD_UH $f5, 2*VEC_LEN*SIZE($18)
+
+ VLD_UL $f2, 2*VEC_LEN*SIZE($18)
+ VLD_UH $f6, 3*VEC_LEN*SIZE($18)
+
+ VLD_UL $f3, 3*VEC_LEN*SIZE($18)
+ VLD_UH $f7, 4*VEC_LEN*SIZE($18)
+
+ VLD $f8, 0*VEC_LEN*SIZE($20)
+ VLD $f28, 1*VEC_LEN*SIZE($20)
+ VLD $f10, 2*VEC_LEN*SIZE($20)
+ VLD $f11, 3*VEC_LEN*SIZE($20)
+
+ vbisw $f0, $f4, $f0
+ vbisw $f1, $f5, $f1
+ vbisw $f2, $f6, $f2
+ vbisw $f3, $f7, $f3
+
+ addl $18, 16*SIZE, $18
+ ble $4, $UnAlign_X_MainLoopEnd
+ .align 4
+$UnAlign_X_MainLoop:
+ fillcs PREFETCHSIZE * SIZE($20)
+ fillcs PREFETCHSIZE * SIZE($18)
+
+/* split the complex vectors into a real vector ($f0) and an imaginary vector ($f1) */
+ vextf $f0, 1, $f4
+ vextf $f0, 3, $f5
+ vextf $f1, 0, $f6
+ vextf $f1, 2, $f7
+
+ vextf $f2, 1, $f12
+ vextf $f2, 3, $f13
+ vextf $f3, 0, $f14
+ vextf $f3, 2, $f15
+
+ vinsf $f4, $f1, 0, $f1
+ vinsf $f5, $f1, 2, $f1
+ vinsf $f6, $f0, 1, $f0
+ vinsf $f7, $f0, 3, $f0
+
+ vinsf $f12, $f3, 0, $f3
+ vinsf $f13, $f3, 2, $f3
+ vinsf $f14, $f2, 1, $f2
+ vinsf $f15, $f2, 3, $f2
+
+/*Compute*/
+ VMUL $f29, $f0, $f20
+ VMUL $f30, $f0, $f21
+ VMUL $f29, $f2, $f22
+ VMUL $f30, $f2, $f23
+
+
+ VMAD1 $f30, $f1, $f20, $f16
+ VMAD2 $f29, $f1, $f21, $f17
+ VMAD1 $f30, $f3, $f22, $f18
+ VMAD2 $f29, $f3, $f23, $f19
+/*
+ VLD $f0, 0*VEC_LEN*SIZE($18)
+ VLD $f1, 1*VEC_LEN*SIZE($18)
+ VLD $f2, 2*VEC_LEN*SIZE($18)
+ VLD $f3, 3*VEC_LEN*SIZE($18)
+*/
+ VLD_UL $f0, 0*VEC_LEN*SIZE($18)
+ VLD_UH $f4, 1*VEC_LEN*SIZE($18)
+
+ VLD_UL $f1, 1*VEC_LEN*SIZE($18)
+ VLD_UH $f5, 2*VEC_LEN*SIZE($18)
+
+ VLD_UL $f2, 2*VEC_LEN*SIZE($18)
+ VLD_UH $f6, 3*VEC_LEN*SIZE($18)
+
+ VLD_UL $f3, 3*VEC_LEN*SIZE($18)
+ VLD_UH $f7, 4*VEC_LEN*SIZE($18)
+
+/* recombine the real and imaginary vectors into the interleaved complex layout */
+ vextf $f16, 1, $f24
+ vextf $f16, 3, $f25
+ vextf $f17, 0, $f26
+ vextf $f17, 2, $f27
+
+ vextf $f18, 1, $f12
+ vextf $f18, 3, $f13
+ vextf $f19, 0, $f14
+ vextf $f19, 2, $f15
+
+ vbisw $f0, $f4, $f0
+ vbisw $f1, $f5, $f1
+ vbisw $f2, $f6, $f2
+ vbisw $f3, $f7, $f3
+
+ vinsf $f24, $f17, 0, $f17
+ addl $20, 16*SIZE, $20
+ vinsf $f25, $f17, 2, $f17
+ addl $18, 16*SIZE, $18
+
+ vinsf $f26, $f16, 1, $f16
+ subl $4, 1, $4
+ vinsf $f27, $f16, 3, $f16
+ nop
+
+ vinsf $f12, $f19, 0, $f19
+ vinsf $f13, $f19, 2, $f19
+ vinsf $f14, $f18, 1, $f18
+ vinsf $f15, $f18, 3, $f18
+
+ VADD $f16, $f8, $f16
+ VLD $f8, 0*VEC_LEN*SIZE($20)
+ VADD $f17, $f28, $f17
+ VLD $f28, 1*VEC_LEN*SIZE($20)
+
+ VADD $f18, $f10, $f18
+ VLD $f10, 2*VEC_LEN*SIZE($20)
+ VADD $f19, $f11, $f19
+ VLD $f11, 3*VEC_LEN*SIZE($20)
+
+ VST $f16, -4*VEC_LEN*SIZE($20)
+ VST $f17, -3*VEC_LEN*SIZE($20)
+ VST $f18, -2*VEC_LEN*SIZE($20)
+ VST $f19, -1*VEC_LEN*SIZE($20)
+
+ bgt $4, $UnAlign_X_MainLoop
+ .align 4
+
+$UnAlign_X_MainLoopEnd:
+/* split the complex vectors into a real vector ($f0) and an imaginary vector ($f1) */
+ vextf $f0, 1, $f4
+ vextf $f0, 3, $f5
+ vextf $f1, 0, $f6
+ vextf $f1, 2, $f7
+
+ vextf $f2, 1, $f12
+ vextf $f2, 3, $f13
+ vextf $f3, 0, $f14
+ vextf $f3, 2, $f15
+
+ vinsf $f4, $f1, 0, $f1
+ vinsf $f5, $f1, 2, $f1
+ vinsf $f6, $f0, 1, $f0
+ vinsf $f7, $f0, 3, $f0
+
+ vinsf $f12, $f3, 0, $f3
+ vinsf $f13, $f3, 2, $f3
+ vinsf $f14, $f2, 1, $f2
+ vinsf $f15, $f2, 3, $f2
+
+ VMUL $f29, $f0, $f20
+ VMUL $f30, $f0, $f21
+ VMUL $f29, $f2, $f22
+ VMUL $f30, $f2, $f23
+
+ VMAD1 $f30, $f1, $f20, $f16
+ VMAD2 $f29, $f1, $f21, $f17
+ VMAD1 $f30, $f3, $f22, $f18
+ VMAD2 $f29, $f3, $f23, $f19
+
+/* recombine the real (f16,f18) and imaginary (f17,f19) vectors into the interleaved complex layout */
+ vextf $f16, 1, $f24
+ vextf $f16, 3, $f25
+ vextf $f17, 0, $f26
+ vextf $f17, 2, $f27
+
+ vextf $f18, 1, $f12
+ vextf $f18, 3, $f13
+ vextf $f19, 0, $f14
+ vextf $f19, 2, $f15
+
+ vinsf $f24, $f17, 0, $f17
+ vinsf $f25, $f17, 2, $f17
+ vinsf $f26, $f16, 1, $f16
+ vinsf $f27, $f16, 3, $f16
+
+ vinsf $f12, $f19, 0, $f19
+ vinsf $f13, $f19, 2, $f19
+ vinsf $f14, $f18, 1, $f18
+ vinsf $f15, $f18, 3, $f18
+
+ VADD $f16, $f8, $f16
+ VADD $f17, $f28, $f17
+ VADD $f18, $f10, $f18
+ VADD $f19, $f11, $f19
+
+ VST $f16, 0*VEC_LEN*SIZE($20)
+ VST $f17, 1*VEC_LEN*SIZE($20)
+ VST $f18, 2*VEC_LEN*SIZE($20)
+ VST $f19, 3*VEC_LEN*SIZE($20)
+
+ addl $20, 16*SIZE, $20
+ ble $5, $End
+
+ jmp $Remain
+ .align 4
+
+$UnAlign_XY_ACCESS:
+/*
+ Unaligned access to both X and Y
+*/
+ VLD_UL $f0, 0*VEC_LEN*SIZE($18)
+ VLD_UH $f4, 1*VEC_LEN*SIZE($18)
+
+ VLD_UL $f1, 1*VEC_LEN*SIZE($18)
+ VLD_UH $f5, 2*VEC_LEN*SIZE($18)
+
+ VLD_UL $f2, 2*VEC_LEN*SIZE($18)
+ VLD_UH $f6, 3*VEC_LEN*SIZE($18)
+
+ VLD_UL $f3, 3*VEC_LEN*SIZE($18)
+ VLD_UH $f7, 4*VEC_LEN*SIZE($18)
+
+ VLD_UL $f8, 0*VEC_LEN*SIZE($20)
+ VLD_UH $f12, 1*VEC_LEN*SIZE($20)
+
+ VLD_UL $f28, 1*VEC_LEN*SIZE($20)
+ VLD_UH $f13, 2*VEC_LEN*SIZE($20)
+
+ VLD_UL $f10, 2*VEC_LEN*SIZE($20)
+ VLD_UH $f14, 3*VEC_LEN*SIZE($20)
+
+ VLD_UL $f11, 3*VEC_LEN*SIZE($20)
+ VLD_UH $f15, 4*VEC_LEN*SIZE($20)
+
+ vbisw $f0, $f4, $f0
+ vbisw $f1, $f5, $f1
+ vbisw $f2, $f6, $f2
+ vbisw $f3, $f7, $f3
+
+ vbisw $f8, $f12, $f8
+ vbisw $f28, $f13, $f28
+ vbisw $f10, $f14, $f10
+ vbisw $f11, $f15, $f11
+
+ addl $18, 16*SIZE, $18
+ ble $4, $UnAlign_MainLoopEnd
+ .align 4
+
+$UnAlign_MainLoop:
+ fillcs PREFETCHSIZE * SIZE($20)
+ fillcs PREFETCHSIZE * SIZE($18)
+
+/* split the complex vectors into a real vector ($f0) and an imaginary vector ($f1) */
+ vextf $f0, 1, $f4
+ vextf $f0, 3, $f5
+ vextf $f1, 0, $f6
+ vextf $f1, 2, $f7
+
+ vextf $f2, 1, $f12
+ vextf $f2, 3, $f13
+ vextf $f3, 0, $f14
+ vextf $f3, 2, $f15
+
+ vinsf $f4, $f1, 0, $f1
+ vinsf $f5, $f1, 2, $f1
+ vinsf $f6, $f0, 1, $f0
+ vinsf $f7, $f0, 3, $f0
+
+ vinsf $f12, $f3, 0, $f3
+ vinsf $f13, $f3, 2, $f3
+ vinsf $f14, $f2, 1, $f2
+ vinsf $f15, $f2, 3, $f2
+
+/*Compute*/
+ VMUL $f29, $f0, $f20
+ VMUL $f30, $f0, $f21
+ VMUL $f29, $f2, $f22
+ VMUL $f30, $f2, $f23
+
+
+ VMAD1 $f30, $f1, $f20, $f16
+ VMAD2 $f29, $f1, $f21, $f17
+ VMAD1 $f30, $f3, $f22, $f18
+ VMAD2 $f29, $f3, $f23, $f19
+/*
+ VLD $f0, 0*VEC_LEN*SIZE($18)
+ VLD $f1, 1*VEC_LEN*SIZE($18)
+ VLD $f2, 2*VEC_LEN*SIZE($18)
+ VLD $f3, 3*VEC_LEN*SIZE($18)
+*/
+ VLD_UL $f0, 0*VEC_LEN*SIZE($18)
+ VLD_UH $f4, 1*VEC_LEN*SIZE($18)
+
+ VLD_UL $f1, 1*VEC_LEN*SIZE($18)
+ VLD_UH $f5, 2*VEC_LEN*SIZE($18)
+
+ VLD_UL $f2, 2*VEC_LEN*SIZE($18)
+ VLD_UH $f6, 3*VEC_LEN*SIZE($18)
+
+ VLD_UL $f3, 3*VEC_LEN*SIZE($18)
+ VLD_UH $f7, 4*VEC_LEN*SIZE($18)
+
+/* recombine the real and imaginary vectors into the interleaved complex layout */
+ vextf $f16, 1, $f24
+ vextf $f16, 3, $f25
+ vextf $f17, 0, $f26
+ vextf $f17, 2, $f27
+
+ vextf $f18, 1, $f12
+ vextf $f18, 3, $f13
+ vextf $f19, 0, $f14
+ vextf $f19, 2, $f15
+
+ vbisw $f0, $f4, $f0
+ vbisw $f1, $f5, $f1
+ vbisw $f2, $f6, $f2
+ vbisw $f3, $f7, $f3
+
+ vinsf $f24, $f17, 0, $f17
+ addl $20, 16*SIZE, $20
+ vinsf $f25, $f17, 2, $f17
+ addl $18, 16*SIZE, $18
+
+ vinsf $f26, $f16, 1, $f16
+ subl $4, 1, $4
+ vinsf $f27, $f16, 3, $f16
+ nop
+
+ vinsf $f12, $f19, 0, $f19
+ vinsf $f13, $f19, 2, $f19
+ vinsf $f14, $f18, 1, $f18
+ vinsf $f15, $f18, 3, $f18
+
+ VADD $f16, $f8, $f16
+ VLD_UL $f8, 0*VEC_LEN*SIZE($20)
+ VLD_UH $f12, 1*VEC_LEN*SIZE($20)
+
+ VADD $f17, $f28, $f17
+ VLD_UL $f28, 1*VEC_LEN*SIZE($20)
+ VLD_UH $f13, 2*VEC_LEN*SIZE($20)
+
+
+ VADD $f18, $f10, $f18
+ VLD_UL $f10, 2*VEC_LEN*SIZE($20)
+ VLD_UH $f14, 3*VEC_LEN*SIZE($20)
+
+ VADD $f19, $f11, $f19
+ VLD_UL $f11, 3*VEC_LEN*SIZE($20)
+ VLD_UH $f15, 4*VEC_LEN*SIZE($20)
+
+/*
+ VST $f16, -4*VEC_LEN*SIZE($20)
+ VST $f17, -3*VEC_LEN*SIZE($20)
+ VST $f18, -2*VEC_LEN*SIZE($20)
+ VST $f19, -1*VEC_LEN*SIZE($20)
+*/
+
+ vbisw $f8, $f12, $f8
+ VST_UL $f16, -4*VEC_LEN*SIZE($20)
+ VST_UH $f16, -3*VEC_LEN*SIZE($20)
+
+ vbisw $f28, $f13, $f28
+ VST_UL $f17, -3*VEC_LEN*SIZE($20)
+ VST_UH $f17, -2*VEC_LEN*SIZE($20)
+
+ vbisw $f10, $f14, $f10
+ VST_UL $f18, -2*VEC_LEN*SIZE($20)
+ VST_UH $f18, -1*VEC_LEN*SIZE($20)
+
+ vbisw $f11, $f15, $f11
+ VST_UL $f19, -1*VEC_LEN*SIZE($20)
+ VST_UH $f19, 0*VEC_LEN*SIZE($20)
+
+ bgt $4, $UnAlign_MainLoop
+ .align 4
+
+$UnAlign_MainLoopEnd:
+
+/* split the complex vectors into a real vector ($f0) and an imaginary vector ($f1) */
+ vextf $f0, 1, $f4
+ vextf $f0, 3, $f5
+ vextf $f1, 0, $f6
+ vextf $f1, 2, $f7
+
+ vextf $f2, 1, $f12
+ vextf $f2, 3, $f13
+ vextf $f3, 0, $f14
+ vextf $f3, 2, $f15
+
+ vinsf $f4, $f1, 0, $f1
+ vinsf $f5, $f1, 2, $f1
+ vinsf $f6, $f0, 1, $f0
+ vinsf $f7, $f0, 3, $f0
+
+ vinsf $f12, $f3, 0, $f3
+ vinsf $f13, $f3, 2, $f3
+ vinsf $f14, $f2, 1, $f2
+ vinsf $f15, $f2, 3, $f2
+
+ VMUL $f29, $f0, $f20
+ VMUL $f30, $f0, $f21
+ VMUL $f29, $f2, $f22
+ VMUL $f30, $f2, $f23
+
+ VMAD1 $f30, $f1, $f20, $f16
+ VMAD2 $f29, $f1, $f21, $f17
+ VMAD1 $f30, $f3, $f22, $f18
+ VMAD2 $f29, $f3, $f23, $f19
+
+/* recombine the real (f16,f18) and imaginary (f17,f19) vectors into the interleaved complex layout */
+ vextf $f16, 1, $f24
+ vextf $f16, 3, $f25
+ vextf $f17, 0, $f26
+ vextf $f17, 2, $f27
+
+ vextf $f18, 1, $f12
+ vextf $f18, 3, $f13
+ vextf $f19, 0, $f14
+ vextf $f19, 2, $f15
+
+ vinsf $f24, $f17, 0, $f17
+ vinsf $f25, $f17, 2, $f17
+ vinsf $f26, $f16, 1, $f16
+ vinsf $f27, $f16, 3, $f16
+
+ vinsf $f12, $f19, 0, $f19
+ vinsf $f13, $f19, 2, $f19
+ vinsf $f14, $f18, 1, $f18
+ vinsf $f15, $f18, 3, $f18
+
+ VADD $f16, $f8, $f16
+ VADD $f17, $f28, $f17
+ VADD $f18, $f10, $f18
+ VADD $f19, $f11, $f19
+
+ VST_UL $f16, 0*VEC_LEN*SIZE($20)
+ VST_UH $f16, 1*VEC_LEN*SIZE($20)
+ VST_UL $f17, 1*VEC_LEN*SIZE($20)
+ VST_UH $f17, 2*VEC_LEN*SIZE($20)
+
+ VST_UL $f18, 2*VEC_LEN*SIZE($20)
+ VST_UH $f18, 3*VEC_LEN*SIZE($20)
+ VST_UL $f19, 3*VEC_LEN*SIZE($20)
+ VST_UH $f19, 4*VEC_LEN*SIZE($20)
+
+ addl $20, 16*SIZE, $20
+ ble $5, $End
+
+ jmp $Remain
+ .align 4
+/* Strided path: unroll by 4 complex = 8 float/double */
+$Sub:
+ sra $16, 2, $4
+ and $16, 3, $5
+ SXSUBL $16, SIZE, $22
+ addl $22, $22, $22 # Complex
+ .align 4
+
+ addl $19, $19, $19 # Complex
+ addl $21, $21, $21 # Complex
+
+ ble $4, $SubRemain
+ LD $f0, 0*SIZE($18)
+ LD $f1, 1*SIZE($18)
+ SXADDQ $19, $18, $18
+
+ LD $f2, 0*SIZE($18)
+ LD $f3, 1*SIZE($18)
+ SXADDQ $19, $18, $18
+
+ LD $f4, 0*SIZE($18)
+ LD $f5, 1*SIZE($18)
+ SXADDQ $19, $18, $18
+
+ LD $f6, 0*SIZE($18)
+ LD $f7, 1*SIZE($18)
+ SXADDQ $19, $18, $18
+
+ LD $f8, 0*SIZE($20)
+ LD $f28, 1*SIZE($20)
+ SXADDQ $21, $20, $24
+
+ LD $f10, 0*SIZE($24)
+ LD $f11, 1*SIZE($24)
+ SXADDQ $21, $24, $24
+
+ LD $f12, 0*SIZE($24)
+ LD $f13, 1*SIZE($24)
+ SXADDQ $21, $24, $24
+
+ LD $f14, 0*SIZE($24)
+ LD $f15, 1*SIZE($24)
+ SXADDQ $21, $24, $24
+
+ subl $4, 1, $4
+ ble $4, $SubMainLoopEnd
+ .align 4
+
+$SubMainLoop:
+ MUL $f29, $f0, $f20
+ unop
+ MUL $f30, $f1, $f21
+ unop
+
+ MUL $f30, $f0, $f22
+ LD $f0, 0*SIZE($18)
+ MUL $f29, $f1, $f23
+ LD $f1, 1*SIZE($18)
+
+ MUL $f29, $f2, $f24
+ SXADDQ $19, $18, $18
+ MUL $f30, $f3, $f25
+ unop
+
+ MUL $f30, $f2, $f26
+ LD $f2, 0*SIZE($18)
+ MUL $f29, $f3, $f27
+ LD $f3, 1*SIZE($18)
+
+ ADD1 $f20, $f21, $f16
+ SXADDQ $19, $18, $18
+ MUL $f29, $f4, $f20
+ unop
+
+ ADD2 $f22, $f23, $f17
+ unop
+ MUL $f30, $f5, $f21
+ unop
+
+ ADD1 $f24, $f25, $f18
+ unop
+ MUL $f30, $f4, $f22
+ LD $f4, 0*SIZE($18)
+
+ ADD2 $f26, $f27, $f19
+ unop
+ MUL $f29, $f5, $f23
+ LD $f5, 1*SIZE($18)
+
+ ADD $f16, $f8, $f16
+ LD $f8, 0*SIZE($24)
+ MUL $f29, $f6, $f24
+ SXADDQ $19, $18, $18
+
+ ADD $f17, $f28, $f17
+ LD $f28, 1*SIZE($24)
+ MUL $f30, $f7, $f25
+ SXADDQ $21, $24, $24
+
+ ADD $f18, $f10, $f18
+ LD $f10, 0*SIZE($24)
+ MUL $f30, $f6, $f26
+ LD $f6, 0*SIZE($18)
+
+ ADD $f19, $f11, $f19
+ LD $f11, 1*SIZE($24)
+ MUL $f29, $f7, $f27
+ LD $f7, 1*SIZE($18)
+
+ ST $f16, 0*SIZE($20)
+ SXADDQ $19, $18, $18
+ ADD1 $f20, $f21, $f16
+ unop
+
+ ST $f17, 1*SIZE($20)
+ SXADDQ $21, $20, $20
+ ADD2 $f22, $f23, $f17
+ unop
+
+ ST $f18, 0*SIZE($20)
+ SXADDQ $21, $24, $24
+ ADD1 $f24, $f25, $f18
+ unop
+
+ ST $f19, 1*SIZE($20)
+ unop
+ ADD2 $f26, $f27, $f19
+ SXADDQ $21, $20, $20
+
+ ADD $f16, $f12, $f16
+ unop
+ LD $f12, 0*SIZE($24)
+ unop
+
+ ADD $f17, $f13, $f17
+ unop
+ LD $f13, 1*SIZE($24)
+ SXADDQ $21, $24, $24
+
+ ADD $f18, $f14, $f18
+ subl $4, 1, $4
+ LD $f14, 0*SIZE($24)
+ unop
+
+ ADD $f19, $f15, $f19
+ unop
+ LD $f15, 1*SIZE($24)
+ SXADDQ $21, $24, $24
+
+ ST $f16, 0*SIZE($20)
+ ST $f17, 1*SIZE($20)
+ SXADDQ $21, $20, $20
+ unop
+
+ ST $f18, 0*SIZE($20)
+ ST $f19, 1*SIZE($20)
+ SXADDQ $21, $20, $20
+ bgt $4, $SubMainLoop
+ .align 4
+
+$SubMainLoopEnd:
+ MUL $f29, $f0, $f20
+ MUL $f30, $f1, $f21
+ MUL $f30, $f0, $f22
+ MUL $f29, $f1, $f23
+
+ MUL $f29, $f2, $f24
+ MUL $f30, $f3, $f25
+ MUL $f30, $f2, $f26
+ MUL $f29, $f3, $f27
+
+ ADD1 $f20, $f21, $f16
+ MUL $f29, $f4, $f20
+ ADD2 $f22, $f23, $f17
+ MUL $f30, $f5, $f21
+
+ ADD1 $f24, $f25, $f18
+ MUL $f30, $f4, $f22
+ ADD2 $f26, $f27, $f19
+ MUL $f29, $f5, $f23
+
+ ADD $f16, $f8, $f16
+ MUL $f29, $f6, $f24
+ ADD $f17, $f28, $f17
+ MUL $f30, $f7, $f25
+
+ ADD $f18, $f10, $f18
+ MUL $f30, $f6, $f26
+ ADD $f19, $f11, $f19
+ MUL $f29, $f7, $f27
+
+ ST $f16, 0*SIZE($20)
+ ADD1 $f20, $f21, $f16
+ ST $f17, 1*SIZE($20)
+ ADD2 $f22, $f23, $f17
+
+ SXADDQ $21, $20, $20
+ nop
+ ST $f18, 0*SIZE($20)
+ ADD1 $f24, $f25, $f18
+
+ ST $f19, 1*SIZE($20)
+ ADD2 $f26, $f27, $f19
+ SXADDQ $21, $20, $20
+ ADD $f16, $f12, $f16
+
+ ADD $f17, $f13, $f17
+ ADD $f18, $f14, $f18
+ ADD $f19, $f15, $f19
+
+ ST $f16, 0*SIZE($20)
+ ST $f17, 1*SIZE($20)
+ SXADDQ $21, $20, $20
+
+ ST $f18, 0*SIZE($20)
+ ST $f19, 1*SIZE($20)
+ SXADDQ $21, $20, $20
+ ble $5, $SubEnd
+ .align 4
+
+$SubRemain:
+ subl $5, 1, $6
+ ble $5, $SubEnd
+ LD $f0, 0*SIZE($18)
+ LD $f1, 1*SIZE($18)
+
+ LD $f8, 0*SIZE($20)
+ LD $f28, 1*SIZE($20)
+ SXADDQ $19, $18, $18
+ SXADDQ $21, $20, $24
+ ble $6, $SubRemainLoopEnd
+ .align 4
+
+$SubRemainLoop:
+ MUL $f29, $f0, $f20
+ MUL $f30, $f1, $f21
+ MUL $f30, $f0, $f22
+ LD $f0, 0*SIZE($18)
+
+ MUL $f29, $f1, $f23
+ LD $f1, 1*SIZE($18)
+ ADD1 $f20, $f21, $f16
+ SXADDQ $19, $18, $18
+
+ ADD2 $f22, $f23, $f17
+ nop
+ ADD $f16, $f8, $f16
+ LD $f8, 0*SIZE($24)
+
+ ADD $f17, $f28, $f17
+ LD $f28, 1*SIZE($24)
+ SXADDQ $21, $24, $24
+ subl $6, 1, $6
+
+ ST $f16, 0*SIZE($20)
+ ST $f17, 1*SIZE($20)
+ SXADDQ $21, $20, $20
+ bgt $6, $SubRemainLoop
+ .align 4
+
+$SubRemainLoopEnd:
+ MUL $f29, $f0, $f20
+ MUL $f30, $f1, $f21
+ MUL $f30, $f0, $f22
+ MUL $f29, $f1, $f23
+
+ ADD1 $f20, $f21, $f16
+ ADD2 $f22, $f23, $f17
+ ADD $f16, $f8, $f16
+ ADD $f17, $f28, $f17
+
+ ST $f16, 0*SIZE($20)
+ nop
+ ST $f17, 1*SIZE($20)
+ nop
+ .align 4
+
+$SubEnd:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ ldi $sp, 64($sp)
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/zdot.S b/kernel/sw_64/zdot.S
new file mode 100644
index 0000000..114a7a3
--- /dev/null
+++ b/kernel/sw_64/zdot.S
@@ -0,0 +1,588 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define PREFETCHSIZE 88
+
+#define N $16
+#define X $17
+#define INCX $18
+#define Y $19
+#define INCY $20
+#define XX $21
+#define YY $23
+
+#define I $5
+
+#define s0 $f0
+#define s1 $f1
+#define s2 $f2
+#define s3 $f30
+#define s4 $f3
+
+#define a0 $f10
+#define a1 $f11
+#define a2 $f12
+#define a3 $f13
+#define a4 $f14
+#define a5 $f15
+#define a6 $f16
+#define a7 $f17
+
+#define b0 $f18
+#define b1 $f19
+#define b2 $f20
+#define b3 $f21
+#define b4 $f22
+#define b5 $f23
+#define b6 $f24
+#define b7 $f25
+
+#define t0 $f26
+#define t1 $f27
+#define t2 $f28
+#define t3 $f29
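+/* s0..s3 accumulate sum(x_r*y_r), sum(x_r*y_i), sum(x_i*y_r) and
+   sum(x_i*y_i) respectively; they are combined into the real and imaginary
+   parts of the result at $L998, with signs depending on CONJ. */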
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, 24, $26, 0
+
+ ldi $sp, -24($sp)
+ fclr s0
+ fstd $f2, 0($sp)
+ fstd $f3, 16($sp)
+ fclr s1
+
+ fclr s2
+ addl INCX, INCX, INCX
+ fclr s3
+ ble N, $L999
+
+ addl INCY, INCY, INCY
+ fclr t0
+ fclr t1
+ fclr t2
+ fclr t3
+
+ srl N, 3, I
+ ble I, $L25
+
+ LD a0, 0 * SIZE(X)
+ LD a1, 1 * SIZE(X)
+ LD b0, 0 * SIZE(Y)
+ LD b1, 1 * SIZE(Y)
+
+ SXADDQ INCX, X, X
+ SXADDQ INCY, Y, Y
+
+ LD a2, 0 * SIZE(X)
+ LD a3, 1 * SIZE(X)
+ LD b2, 0 * SIZE(Y)
+ LD b3, 1 * SIZE(Y)
+
+ SXADDQ INCX, X, X
+ SXADDQ INCY, Y, Y
+
+ LD a4, 0 * SIZE(X)
+ LD a5, 1 * SIZE(X)
+ LD b4, 0 * SIZE(Y)
+ LD b5, 1 * SIZE(Y)
+
+ SXADDQ INCX, X, X
+ SXADDQ INCY, Y, Y
+
+ LD a6, 0 * SIZE(X)
+ LD b6, 0 * SIZE(Y)
+
+ subl I, 1, I
+ ble I, $L23
+ .align 4
+
+$L22:
+ ADD s0, t0, s4
+ fmov s4,s0
+ LD a7, 1 * SIZE(X)
+ MUL a0, b0, t0
+ LD b7, 1 * SIZE(Y)
+
+ ADD s1, t1, s4
+ fmov s4,s1
+ fillcs PREFETCHSIZE * SIZE(X)
+ MUL a0, b1, t1
+ SXADDQ INCX, X, X
+
+ ADD s2, t2, s4
+ fmov s4,s2
+ fillcs PREFETCHSIZE * SIZE(Y)
+ MUL a1, b0, t2
+ SXADDQ INCY, Y, Y
+
+ ADD s3, t3, s4
+ fmov s4,s3
+ LD a0, 0 * SIZE(X)
+ MUL a1, b1, t3
+ LD a1, 1 * SIZE(X)
+
+ ADD s0, t0, s4
+ fmov s4,s0
+ LD b0, 0 * SIZE(Y)
+ MUL a2, b2, t0
+ LD b1, 1 * SIZE(Y)
+
+ ADD s1, t1, s4
+ fmov s4,s1
+ SXADDQ INCX, X, X
+ MUL a2, b3, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s4
+ fmov s4,s2
+ #unop
+ MUL a3, b2, t2
+ unop
+
+ ADD s3, t3, s4
+ fmov s4,s3
+ LD a2, 0 * SIZE(X)
+ MUL a3, b3, t3
+ LD a3, 1 * SIZE(X)
+
+ ADD s0, t0, s4
+ fmov s4,s0
+ LD b2, 0 * SIZE(Y)
+ MUL a4, b4, t0
+ LD b3, 1 * SIZE(Y)
+
+ ADD s1, t1, s4
+ fmov s4,s1
+ SXADDQ INCX, X, X
+ MUL a4, b5, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s4
+ fmov s4,s2
+ unop
+ MUL a5, b4, t2
+ unop
+
+ ADD s3, t3, s4
+ fmov s4,s3
+ LD a4, 0 * SIZE(X)
+ MUL a5, b5, t3
+ LD a5, 1 * SIZE(X)
+
+ ADD s0, t0, s4
+ fmov s4,s0
+ LD b4, 0 * SIZE(Y)
+ MUL a6, b6, t0
+ LD b5, 1 * SIZE(Y)
+
+ ADD s1, t1, s4
+ fmov s4,s1
+ SXADDQ INCX, X, X
+ MUL a6, b7, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s4
+ fmov s4,s2
+ unop
+ MUL a7, b6, t2
+ unop
+
+ ADD s3, t3, s4
+ fmov s4,s3
+ LD a6, 0 * SIZE(X)
+ MUL a7, b7, t3
+ LD a7, 1 * SIZE(X)
+
+ ADD s0, t0, s4
+ fmov s4,s0
+ LD b6, 0 * SIZE(Y)
+ MUL a0, b0, t0
+ LD b7, 1 * SIZE(Y)
+
+ ADD s1, t1, s4
+ fmov s4,s1
+ SXADDQ INCX, X, X
+ MUL a0, b1, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s4
+ fmov s4,s2
+ unop
+ MUL a1, b0, t2
+ unop
+
+ ADD s3, t3, s4
+ fmov s4,s3
+ LD a0, 0 * SIZE(X)
+ MUL a1, b1, t3
+ LD a1, 1 * SIZE(X)
+
+ ADD s0, t0, s4
+ fmov s4,s0
+ LD b0, 0 * SIZE(Y)
+ MUL a2, b2, t0
+ LD b1, 1 * SIZE(Y)
+
+ ADD s1, t1, s4
+ fmov s4,s1
+ SXADDQ INCX, X, X
+ MUL a2, b3, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s4
+ fmov s4,s2
+ unop
+ MUL a3, b2, t2
+ unop
+
+ ADD s3, t3, s4
+ fmov s4,s3
+ LD a2, 0 * SIZE(X)
+ MUL a3, b3, t3
+ LD a3, 1 * SIZE(X)
+
+ ADD s0, t0, s4
+ fmov s4,s0
+ LD b2, 0 * SIZE(Y)
+ MUL a4, b4, t0
+ LD b3, 1 * SIZE(Y)
+
+ ADD s1, t1, s4
+ fmov s4,s1
+ SXADDQ INCX, X, X
+ MUL a4, b5, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s4
+ fmov s4,s2
+ unop
+ MUL a5, b4, t2
+ subl I, 1, I
+
+ ADD s3, t3, s4
+ fmov s4,s3
+ LD a4, 0 * SIZE(X)
+ MUL a5, b5, t3
+ LD a5, 1 * SIZE(X)
+
+ ADD s0, t0, s4
+ fmov s4,s0
+ LD b4, 0 * SIZE(Y)
+ MUL a6, b6, t0
+ LD b5, 1 * SIZE(Y)
+
+ ADD s1, t1, s4
+ fmov s4,s1
+ SXADDQ INCX, X, X
+ MUL a6, b7, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s4
+ fmov s4,s2
+ LD a6, 0 * SIZE(X)
+ MUL a7, b6, t2
+ unop
+
+ ADD s3, t3, s4
+ fmov s4,s3
+ LD b6, 0 * SIZE(Y)
+ MUL a7, b7, t3
+ bgt I, $L22
+ .align 4
+
+$L23:
+ ADD s0, t0, s4
+ fmov s4,s0
+ LD a7, 1 * SIZE(X)
+ MUL a0, b0, t0
+ LD b7, 1 * SIZE(Y)
+
+ ADD s1, t1, s4
+ fmov s4,s1
+ SXADDQ INCX, X, X
+ MUL a0, b1, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s4
+ fmov s4,s2
+ unop
+ MUL a1, b0, t2
+ unop
+
+ ADD s3, t3, s4
+ fmov s4,s3
+ LD a0, 0 * SIZE(X)
+ MUL a1, b1, t3
+ LD a1, 1 * SIZE(X)
+
+ ADD s0, t0, s4
+ fmov s4,s0
+ LD b0, 0 * SIZE(Y)
+ MUL a2, b2, t0
+ LD b1, 1 * SIZE(Y)
+
+ ADD s1, t1, s4
+ fmov s4,s1
+ SXADDQ INCX, X, X
+ MUL a2, b3, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s4
+ fmov s4,s2
+ unop
+ MUL a3, b2, t2
+ unop
+
+ ADD s3, t3, s4
+ fmov s4,s3
+ LD a2, 0 * SIZE(X)
+ MUL a3, b3, t3
+ LD a3, 1 * SIZE(X)
+
+ ADD s0, t0, s4
+ fmov s4,s0
+ LD b2, 0 * SIZE(Y)
+ MUL a4, b4, t0
+ LD b3, 1 * SIZE(Y)
+
+ ADD s1, t1, s4
+ fmov s4,s1
+ SXADDQ INCX, X, X
+ MUL a4, b5, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s4
+ fmov s4,s2
+ unop
+ MUL a5, b4, t2
+ unop
+
+ ADD s3, t3, s4
+ fmov s4,s3
+ LD a4, 0 * SIZE(X)
+ MUL a5, b5, t3
+ LD a5, 1 * SIZE(X)
+
+ ADD s0, t0, s4
+ fmov s4,s0
+ LD b4, 0 * SIZE(Y)
+ MUL a6, b6, t0
+ LD b5, 1 * SIZE(Y)
+
+ ADD s1, t1, s4
+ fmov s4,s1
+ SXADDQ INCX, X, X
+ MUL a6, b7, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s4
+ fmov s4,s2
+ unop
+ MUL a7, b6, t2
+ unop
+
+ ADD s3, t3, s4
+ fmov s4,s3
+ LD a6, 0 * SIZE(X)
+ MUL a7, b7, t3
+ LD a7, 1 * SIZE(X)
+
+ ADD s0, t0, s4
+ fmov s4,s0
+ LD b6, 0 * SIZE(Y)
+ MUL a0, b0, t0
+ LD b7, 1 * SIZE(Y)
+
+ ADD s1, t1, s4
+ fmov s4,s1
+ SXADDQ INCX, X, X
+ MUL a0, b1, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s4
+ fmov s4,s2
+ MUL a1, b0, t2
+ ADD s3, t3, s4
+ fmov s4,s3
+ MUL a1, b1, t3
+
+ ADD s0, t0, s4
+ fmov s4,s0
+ MUL a2, b2, t0
+ ADD s1, t1, s4
+ fmov s4,s1
+ MUL a2, b3, t1
+
+ ADD s2, t2, s4
+ fmov s4,s2
+ MUL a3, b2, t2
+ ADD s3, t3, s4
+ fmov s4,s3
+ MUL a3, b3, t3
+
+ ADD s0, t0, s4
+ fmov s4,s0
+ MUL a4, b4, t0
+ ADD s1, t1, s4
+ fmov s4,s1
+ MUL a4, b5, t1
+
+ ADD s2, t2, s4
+ fmov s4,s2
+ MUL a5, b4, t2
+ ADD s3, t3, s4
+ fmov s4,s3
+ MUL a5, b5, t3
+
+ ADD s0, t0, s4
+ fmov s4,s0
+ MUL a6, b6, t0
+ ADD s1, t1, s4
+ fmov s4,s1
+ MUL a6, b7, t1
+
+ ADD s2, t2, s4
+ fmov s4,s2
+ MUL a7, b6, t2
+ ADD s3, t3, s4
+ fmov s4,s3
+ MUL a7, b7, t3
+ .align 4
+
+$L25:
+ and N, 7, I
+ unop
+ unop
+ ble I, $L998
+
+ LD a0, 0 * SIZE(X)
+ LD a1, 1 * SIZE(X)
+ LD b0, 0 * SIZE(Y)
+ LD b1, 1 * SIZE(Y)
+
+ SXADDQ INCX, X, X
+ subl I, 1, I
+ SXADDQ INCY, Y, Y
+ ble I, $L28
+ .align 4
+
+$L26:
+ ADD s0, t0, s4
+ fmov s4,s0
+ mov X, XX
+ MUL a0, b0, t0
+ mov Y, YY
+
+ ADD s1, t1, s4
+ fmov s4,s1
+ SXADDQ INCX, X, X
+ MUL a0, b1, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s4
+ fmov s4,s2
+ LD a0, 0 * SIZE(XX)
+ MUL a1, b0, t2
+ LD b0, 0 * SIZE(YY)
+
+ ADD s3, t3, s4
+ fmov s4,s3
+ subl I, 1, I
+ MUL a1, b1, t3
+ LD a1, 1 * SIZE(XX)
+
+ LD b1, 1 * SIZE(YY)
+ bgt I, $L26
+ .align 4
+
+$L28:
+ ADD s0, t0, s4
+ fmov s4,s0
+ MUL a0, b0, t0
+ ADD s1, t1, s4
+ fmov s4,s1
+ MUL a0, b1, t1
+
+ ADD s2, t2, s4
+ fmov s4,s2
+ MUL a1, b0, t2
+ ADD s3, t3, s4
+ fmov s4,s3
+ MUL a1, b1, t3
+ .align 4
+
+$L998:
+ ADD s0, t0, s4
+ fmov s4,s0
+ ADD s1, t1, s4
+ fmov s4,s1
+ ADD s2, t2, s4
+ fmov s4,s2
+ ADD s3, t3, s4
+ fmov s4,s3
+
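+/* At this point s0..s3 hold sum(x.re*y.re), sum(x.re*y.im), sum(x.im*y.re)
+   and sum(x.im*y.im) respectively, so the complex dot product is
+   (s0 - s3) + i*(s1 + s2), or (s0 + s3) + i*(s1 - s2) for the conjugated
+   variant selected below. */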
+#ifndef CONJ
+ SUB s0, s3, s4
+ fmov s4,s0
+ ADD s1, s2, s4
+ fmov s4,s1
+#else
+ ADD s0, s3, s4
+ fmov s4,s0
+ SUB s1, s2, s4
+ fmov s4,s1
+#endif
+ .align 4
+
+$L999:
+ fldd $f2, 0($sp)
+ fldd $f3, 16($sp)
+ ldi $sp, 24($sp)
+ ret
+
+ EPILOGUE
diff --git a/kernel/sw_64/zdot.S.bak b/kernel/sw_64/zdot.S.bak
new file mode 100644
index 0000000..d10673c
--- /dev/null
+++ b/kernel/sw_64/zdot.S.bak
@@ -0,0 +1,500 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define PREFETCHSIZE 88
+
+#define N $16
+#define X $17
+#define INCX $18
+#define Y $19
+#define INCY $20
+#define XX $21
+#define YY $23
+
+#define I $5
+
+#define s0 $f0
+#define s1 $f1
+#define s2 $f2
+#define s3 $f30
+
+#define a0 $f10
+#define a1 $f11
+#define a2 $f12
+#define a3 $f13
+#define a4 $f14
+#define a5 $f15
+#define a6 $f16
+#define a7 $f17
+
+#define b0 $f18
+#define b1 $f19
+#define b2 $f20
+#define b3 $f21
+#define b4 $f22
+#define b5 $f23
+#define b6 $f24
+#define b7 $f25
+
+#define t0 $f26
+#define t1 $f27
+#define t2 $f28
+#define t3 $f29
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, 16, $26, 0
+
+ ldi $sp, -16($sp)
+ fclr s0
+ fstd $f2, 0($sp)
+ fclr s1
+
+ fclr s2
+ addl INCX, INCX, INCX
+ fclr s3
+ ble N, $L999
+
+ addl INCY, INCY, INCY
+ fclr t0
+ fclr t1
+ fclr t2
+ fclr t3
+
+ srl N, 3, I
+ ble I, $L25
+
+ LD a0, 0 * SIZE(X)
+ LD a1, 1 * SIZE(X)
+ LD b0, 0 * SIZE(Y)
+ LD b1, 1 * SIZE(Y)
+
+ SXADDQ INCX, X, X
+ SXADDQ INCY, Y, Y
+
+ LD a2, 0 * SIZE(X)
+ LD a3, 1 * SIZE(X)
+ LD b2, 0 * SIZE(Y)
+ LD b3, 1 * SIZE(Y)
+
+ SXADDQ INCX, X, X
+ SXADDQ INCY, Y, Y
+
+ LD a4, 0 * SIZE(X)
+ LD a5, 1 * SIZE(X)
+ LD b4, 0 * SIZE(Y)
+ LD b5, 1 * SIZE(Y)
+
+ SXADDQ INCX, X, X
+ SXADDQ INCY, Y, Y
+
+ LD a6, 0 * SIZE(X)
+ LD b6, 0 * SIZE(Y)
+
+ subl I, 1, I
+ ble I, $L23
+ .align 4
+
+$L22:
+ ADD s0, t0, s0
+ LD a7, 1 * SIZE(X)
+ MUL a0, b0, t0
+ LD b7, 1 * SIZE(Y)
+
+ ADD s1, t1, s1
+ fillcs PREFETCHSIZE * SIZE(X)
+ MUL a0, b1, t1
+ SXADDQ INCX, X, X
+
+ ADD s2, t2, s2
+ fillcs PREFETCHSIZE * SIZE(Y)
+ MUL a1, b0, t2
+ SXADDQ INCY, Y, Y
+
+ ADD s3, t3, s3
+ LD a0, 0 * SIZE(X)
+ MUL a1, b1, t3
+ LD a1, 1 * SIZE(X)
+
+ ADD s0, t0, s0
+ LD b0, 0 * SIZE(Y)
+ MUL a2, b2, t0
+ LD b1, 1 * SIZE(Y)
+
+ ADD s1, t1, s1
+ SXADDQ INCX, X, X
+ MUL a2, b3, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s2
+ unop
+ MUL a3, b2, t2
+ unop
+
+ ADD s3, t3, s3
+ LD a2, 0 * SIZE(X)
+ MUL a3, b3, t3
+ LD a3, 1 * SIZE(X)
+
+ ADD s0, t0, s0
+ LD b2, 0 * SIZE(Y)
+ MUL a4, b4, t0
+ LD b3, 1 * SIZE(Y)
+
+ ADD s1, t1, s1
+ SXADDQ INCX, X, X
+ MUL a4, b5, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s2
+ unop
+ MUL a5, b4, t2
+ unop
+
+ ADD s3, t3, s3
+ LD a4, 0 * SIZE(X)
+ MUL a5, b5, t3
+ LD a5, 1 * SIZE(X)
+
+ ADD s0, t0, s0
+ LD b4, 0 * SIZE(Y)
+ MUL a6, b6, t0
+ LD b5, 1 * SIZE(Y)
+
+ ADD s1, t1, s1
+ SXADDQ INCX, X, X
+ MUL a6, b7, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s2
+ unop
+ MUL a7, b6, t2
+ unop
+
+ ADD s3, t3, s3
+ LD a6, 0 * SIZE(X)
+ MUL a7, b7, t3
+ LD a7, 1 * SIZE(X)
+
+ ADD s0, t0, s0
+ LD b6, 0 * SIZE(Y)
+ MUL a0, b0, t0
+ LD b7, 1 * SIZE(Y)
+
+ ADD s1, t1, s1
+ SXADDQ INCX, X, X
+ MUL a0, b1, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s2
+ unop
+ MUL a1, b0, t2
+ unop
+
+ ADD s3, t3, s3
+ LD a0, 0 * SIZE(X)
+ MUL a1, b1, t3
+ LD a1, 1 * SIZE(X)
+
+ ADD s0, t0, s0
+ LD b0, 0 * SIZE(Y)
+ MUL a2, b2, t0
+ LD b1, 1 * SIZE(Y)
+
+ ADD s1, t1, s1
+ SXADDQ INCX, X, X
+ MUL a2, b3, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s2
+ unop
+ MUL a3, b2, t2
+ unop
+
+ ADD s3, t3, s3
+ LD a2, 0 * SIZE(X)
+ MUL a3, b3, t3
+ LD a3, 1 * SIZE(X)
+
+ ADD s0, t0, s0
+ LD b2, 0 * SIZE(Y)
+ MUL a4, b4, t0
+ LD b3, 1 * SIZE(Y)
+
+ ADD s1, t1, s1
+ SXADDQ INCX, X, X
+ MUL a4, b5, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s2
+ unop
+ MUL a5, b4, t2
+ subl I, 1, I
+
+ ADD s3, t3, s3
+ LD a4, 0 * SIZE(X)
+ MUL a5, b5, t3
+ LD a5, 1 * SIZE(X)
+
+ ADD s0, t0, s0
+ LD b4, 0 * SIZE(Y)
+ MUL a6, b6, t0
+ LD b5, 1 * SIZE(Y)
+
+ ADD s1, t1, s1
+ SXADDQ INCX, X, X
+ MUL a6, b7, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s2
+ LD a6, 0 * SIZE(X)
+ MUL a7, b6, t2
+ unop
+
+ ADD s3, t3, s3
+ LD b6, 0 * SIZE(Y)
+ MUL a7, b7, t3
+ bgt I, $L22
+ .align 4
+
+$L23:
+ ADD s0, t0, s0
+ LD a7, 1 * SIZE(X)
+ MUL a0, b0, t0
+ LD b7, 1 * SIZE(Y)
+
+ ADD s1, t1, s1
+ SXADDQ INCX, X, X
+ MUL a0, b1, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s2
+ unop
+ MUL a1, b0, t2
+ unop
+
+ ADD s3, t3, s3
+ LD a0, 0 * SIZE(X)
+ MUL a1, b1, t3
+ LD a1, 1 * SIZE(X)
+
+ ADD s0, t0, s0
+ LD b0, 0 * SIZE(Y)
+ MUL a2, b2, t0
+ LD b1, 1 * SIZE(Y)
+
+ ADD s1, t1, s1
+ SXADDQ INCX, X, X
+ MUL a2, b3, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s2
+ unop
+ MUL a3, b2, t2
+ unop
+
+ ADD s3, t3, s3
+ LD a2, 0 * SIZE(X)
+ MUL a3, b3, t3
+ LD a3, 1 * SIZE(X)
+
+ ADD s0, t0, s0
+ LD b2, 0 * SIZE(Y)
+ MUL a4, b4, t0
+ LD b3, 1 * SIZE(Y)
+
+ ADD s1, t1, s1
+ SXADDQ INCX, X, X
+ MUL a4, b5, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s2
+ unop
+ MUL a5, b4, t2
+ unop
+
+ ADD s3, t3, s3
+ LD a4, 0 * SIZE(X)
+ MUL a5, b5, t3
+ LD a5, 1 * SIZE(X)
+
+ ADD s0, t0, s0
+ LD b4, 0 * SIZE(Y)
+ MUL a6, b6, t0
+ LD b5, 1 * SIZE(Y)
+
+ ADD s1, t1, s1
+ SXADDQ INCX, X, X
+ MUL a6, b7, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s2
+ unop
+ MUL a7, b6, t2
+ unop
+
+ ADD s3, t3, s3
+ LD a6, 0 * SIZE(X)
+ MUL a7, b7, t3
+ LD a7, 1 * SIZE(X)
+
+ ADD s0, t0, s0
+ LD b6, 0 * SIZE(Y)
+ MUL a0, b0, t0
+ LD b7, 1 * SIZE(Y)
+
+ ADD s1, t1, s1
+ SXADDQ INCX, X, X
+ MUL a0, b1, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s2
+ MUL a1, b0, t2
+ ADD s3, t3, s3
+ MUL a1, b1, t3
+
+ ADD s0, t0, s0
+ MUL a2, b2, t0
+ ADD s1, t1, s1
+ MUL a2, b3, t1
+
+ ADD s2, t2, s2
+ MUL a3, b2, t2
+ ADD s3, t3, s3
+ MUL a3, b3, t3
+
+ ADD s0, t0, s0
+ MUL a4, b4, t0
+ ADD s1, t1, s1
+ MUL a4, b5, t1
+
+ ADD s2, t2, s2
+ MUL a5, b4, t2
+ ADD s3, t3, s3
+ MUL a5, b5, t3
+
+ ADD s0, t0, s0
+ MUL a6, b6, t0
+ ADD s1, t1, s1
+ MUL a6, b7, t1
+
+ ADD s2, t2, s2
+ MUL a7, b6, t2
+ ADD s3, t3, s3
+ MUL a7, b7, t3
+ .align 4
+
+$L25:
+ and N, 7, I
+ unop
+ unop
+ ble I, $L998
+
+ LD a0, 0 * SIZE(X)
+ LD a1, 1 * SIZE(X)
+ LD b0, 0 * SIZE(Y)
+ LD b1, 1 * SIZE(Y)
+
+ SXADDQ INCX, X, X
+ subl I, 1, I
+ SXADDQ INCY, Y, Y
+ ble I, $L28
+ .align 4
+
+$L26:
+ ADD s0, t0, s0
+ mov X, XX
+ MUL a0, b0, t0
+ mov Y, YY
+
+ ADD s1, t1, s1
+ SXADDQ INCX, X, X
+ MUL a0, b1, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s2
+ LD a0, 0 * SIZE(XX)
+ MUL a1, b0, t2
+ LD b0, 0 * SIZE(YY)
+
+ ADD s3, t3, s3
+ subl I, 1, I
+ MUL a1, b1, t3
+ LD a1, 1 * SIZE(XX)
+
+ LD b1, 1 * SIZE(YY)
+ bgt I, $L26
+ .align 4
+
+$L28:
+ ADD s0, t0, s0
+ MUL a0, b0, t0
+ ADD s1, t1, s1
+ MUL a0, b1, t1
+
+ ADD s2, t2, s2
+ MUL a1, b0, t2
+ ADD s3, t3, s3
+ MUL a1, b1, t3
+ .align 4
+
+$L998:
+ ADD s0, t0, s0
+ ADD s1, t1, s1
+ ADD s2, t2, s2
+ ADD s3, t3, s3
+
+#ifndef CONJ
+ SUB s0, s3, s0
+ ADD s1, s2, s1
+#else
+ ADD s0, s3, s0
+ SUB s1, s2, s1
+#endif
+ .align 4
+
+$L999:
+ fldd $f2, 0($sp)
+ ldi $sp, 16($sp)
+ ret
+
+ EPILOGUE
diff --git a/kernel/sw_64/zdot_simd.S b/kernel/sw_64/zdot_simd.S
new file mode 100644
index 0000000..ed775e6
--- /dev/null
+++ b/kernel/sw_64/zdot_simd.S
@@ -0,0 +1,699 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define PREFETCHSIZE 80
+
+#define N $16
+#define X $17
+#define INCX $18
+#define Y $19
+#define INCY $20
+#define XX $21
+#define YY $23
+
+#define I $5
+
+#define s0 $f0
+#define s1 $f1
+#define s2 $f2
+#define s3 $f30
+
+#define a0 $f10
+#define a1 $f11
+#define a2 $f12
+#define a3 $f13
+#define a4 $f14
+#define a5 $f15
+#define a6 $f16
+#define a7 $f17
+
+#define b0 $f18
+#define b1 $f19
+#define b2 $f20
+#define b3 $f21
+#define b4 $f22
+#define b5 $f23
+#define b6 $f24
+#define b7 $f25
+
+#define t0 $f26
+#define t1 $f27
+#define t2 $f28
+#define t3 $f29
+
+#define t4 $f3
+#define t5 $f4
+#define t6 $f5
+#define t7 $f6
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, 40, $26, 0
+
+ ldi $sp, -40($sp)
+ fclr s0
+ fstd $f2, 0($sp)
+ fclr s1
+
+ fstd $f3, 8($sp)
+ fstd $f4, 16($sp)
+ fstd $f5, 24($sp)
+ fstd $f6, 32($sp)
+
+ fclr s2
+ addl INCX, INCX, INCX
+ fclr s3
+ ble N, $L999
+
+ addl INCY, INCY, INCY
+ fclr t0
+ fclr t1
+ fclr t2
+ fclr t3
+
+ cmpeq INCX, 2, $21
+ cmpeq INCY, 2, $22
+ and $21, $22, $22
+ beq $22, $Sub
+
+/*
+ Test whether the addresses of X and Y are aligned to the SIMD vector width.
+*/
+ and Y, (VEC_LEN*SIZE-1), $4
+ and X, (VEC_LEN*SIZE-1), $3
+ or $3, $4, $4
+ bne $4, $UnAlign_ACCESS
+
+/* Aligned-access path */
+/* Unroll by 8 */
+ srl N, 3, I
+ ble I, $Remain
+ .align 4
+ vcpys $f31, $f31, s0 #clear s0 vector
+	vcpys $f31, $f31, s1 #clear s1 vector
+	vcpys $f31, $f31, s2 #clear s2 vector
+	vcpys $f31, $f31, s3 #clear s3 vector
+
+ vcpys $f31, $f31, t0
+ vcpys $f31, $f31, t1
+ vcpys $f31, $f31, t2
+ vcpys $f31, $f31, t3
+
+$MainLoop:
+ VLD a0, 0*VEC_LEN*SIZE(X)
+ VLD a1, 1*VEC_LEN*SIZE(X)
+ VLD a2, 2*VEC_LEN*SIZE(X)
+ VLD a3, 3*VEC_LEN*SIZE(X)
+
+ VLD b0, 0*VEC_LEN*SIZE(Y)
+ VADD s0, t0, s0
+ VLD b1, 1*VEC_LEN*SIZE(Y)
+ VADD s1, t1, s1
+
+ VLD b2, 2*VEC_LEN*SIZE(Y)
+ VADD s2, t2, s2
+ VLD b3, 3*VEC_LEN*SIZE(Y)
+ VADD s3, t3, s3
+
+/* Split the X complex vector into a real vector (a0, a2) and an imaginary vector (a1, a3),
+   and the Y complex vector into a real vector (b0, b2) and an imaginary vector (b1, b3).
+*/
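+/* For example, assuming vextf/vinsf extract/insert a single element by lane
+   index: with a0 = {x0.re, x0.im, x1.re, x1.im} and a1 = {x2.re, x2.im,
+   x3.re, x3.im}, the shuffle below yields a0 = {x0.re, x2.re, x1.re, x3.re}
+   and a1 = {x0.im, x2.im, x1.im, x3.im}. The same (permuted but consistent)
+   lane order is produced for a2/a3 and for the Y vectors b0..b3, so the
+   element-wise products still pair matching entries of X and Y. */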
+ vextf a0, 1, a4
+ vextf a0, 3, a5
+ vextf a1, 0, a6
+ vextf a1, 2, a7
+
+ vextf a2, 1, t0
+ vextf a2, 3, t1
+ vextf a3, 0, t2
+ vextf a3, 2, t3
+
+ vextf b0, 1, b4
+ vextf b0, 3, b5
+ vextf b1, 0, b6
+ vextf b1, 2, b7
+
+ vextf b2, 1, t4
+ vextf b2, 3, t5
+ vextf b3, 0, t6
+ vextf b3, 2, t7
+
+ vinsf a4, a1, 0, a1
+ vinsf a6, a0, 1, a0
+ vinsf t0, a3, 0, a3
+ vinsf t2, a2, 1, a2
+
+ vinsf b4, b1, 0, b1
+ addl X, 16 * SIZE, X
+ vinsf b6, b0, 1, b0
+ addl Y, 16 * SIZE, Y
+
+ vinsf t4, b3, 0, b3
+ subl I, 1, I
+ vinsf t6, b2, 1, b2
+ nop
+
+ vinsf a5, a1, 2, a1
+ vinsf a7, a0, 3, a0
+ vinsf t1, a3, 2, a3
+ vinsf t3, a2, 3, a2
+
+ vinsf b5, b1, 2, b1
+ vinsf b7, b0, 3, b0
+ vinsf t5, b3, 2, b3
+ vinsf t7, b2, 3, b2
+
+	/* Multiply-accumulate the deinterleaved real/imaginary vectors */
+
+
+ fillcs PREFETCHSIZE * SIZE(X)
+ VMAD a0, b0, s0, s0
+ fillcs PREFETCHSIZE * SIZE(Y)
+ VMAD a0, b1, s1, s1
+
+ VMAD a1, b0, s2, s2
+ VMAD a1, b1, s3, s3
+	VMUL a2, b2, t0	/* multiply only; these terms are accumulated at the start of the next iteration */
+ VMUL a2, b3, t1
+
+ VMUL a3, b2, t2
+ VMUL a3, b3, t3
+ nop
+ bgt I, $MainLoop
+ .align 4
+$MainLoopEnd:
+ VADD s0, t0, s0
+ VADD s1, t1, s1
+ VADD s2, t2, s2
+ VADD s3, t3, s3
+
+#ifndef CONJ
+ VSUB s0, s3, s0
+ VADD s1, s2, s1
+#else
+ VADD s0, s3, s0
+ VSUB s1, s2, s1
+#endif
+	vcpys $f31, $f31, s2 #clear s2 vector
+	vcpys $f31, $f31, s3 #clear s3 vector
+
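+/* Horizontal reduction: sum the four lanes of s0 (real part) and s1
+   (imaginary part) into scalar results. */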
+ vextf s0, 1, t1
+ vextf s0, 2, t2
+ vextf s0, 3, t3
+ vextf s1, 1, t5
+
+ vextf s1, 2, t6
+ vextf s1, 3, t7
+ ADD s0, t1, s0
+ ADD t2, t3, t0
+
+ ADD s1, t5, s1
+ ADD t6, t7, t4
+ ADD s0, t0, s0
+ ADD s1, t4, s1
+$Remain:
+ and N, 7, I
+ ble I, $End
+ .align 4
+$RemainLoop:
+
+ LD a0, 0 * SIZE(X)
+ LD a1, 1 * SIZE(X)
+ LD b0, 0 * SIZE(Y)
+ LD b1, 1 * SIZE(Y)
+
+ SXADDQ INCX, X, X
+ subl I, 1, I
+ SXADDQ INCY, Y, Y
+ MAD a0, b0, s0, s0
+
+ MAD a0, b1, s1, s1
+ MAD a1, b0, s2, s2
+ MAD a1, b1, s3, s3
+ bgt I, $RemainLoop
+ .align 4
+
+#ifndef CONJ
+ SUB s0, s3, s0
+ ADD s1, s2, s1
+#else
+ ADD s0, s3, s0
+ SUB s1, s2, s1
+#endif
+
+$End:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+
+ fldd $f6, 32($sp)
+ ldi $sp, 40($sp)
+ ret
+
+ .align 4
+
+$UnAlign_ACCESS:
+$Sub:
+ srl N, 3, I
+ ble I, $L25
+
+ LD a0, 0 * SIZE(X)
+ LD a1, 1 * SIZE(X)
+ LD b0, 0 * SIZE(Y)
+ LD b1, 1 * SIZE(Y)
+
+ SXADDQ INCX, X, X
+ SXADDQ INCY, Y, Y
+
+ LD a2, 0 * SIZE(X)
+ LD a3, 1 * SIZE(X)
+ LD b2, 0 * SIZE(Y)
+ LD b3, 1 * SIZE(Y)
+
+ SXADDQ INCX, X, X
+ SXADDQ INCY, Y, Y
+
+ LD a4, 0 * SIZE(X)
+ LD a5, 1 * SIZE(X)
+ LD b4, 0 * SIZE(Y)
+ LD b5, 1 * SIZE(Y)
+
+ SXADDQ INCX, X, X
+ SXADDQ INCY, Y, Y
+
+ LD a6, 0 * SIZE(X)
+ LD b6, 0 * SIZE(Y)
+
+ subl I, 1, I
+ ble I, $L23
+ .align 4
+
+$L22:
+ ADD s0, t0, s0
+ LD a7, 1 * SIZE(X)
+ MUL a0, b0, t0
+ LD b7, 1 * SIZE(Y)
+
+ ADD s1, t1, s1
+ fillcs PREFETCHSIZE * SIZE(X)
+ MUL a0, b1, t1
+ SXADDQ INCX, X, X
+
+ ADD s2, t2, s2
+ fillcs PREFETCHSIZE * SIZE(Y)
+ MUL a1, b0, t2
+ SXADDQ INCY, Y, Y
+
+ ADD s3, t3, s3
+ LD a0, 0 * SIZE(X)
+ MUL a1, b1, t3
+ LD a1, 1 * SIZE(X)
+
+ ADD s0, t0, s0
+ LD b0, 0 * SIZE(Y)
+ MUL a2, b2, t0
+ LD b1, 1 * SIZE(Y)
+
+ ADD s1, t1, s1
+ SXADDQ INCX, X, X
+ MUL a2, b3, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s2
+ unop
+ MUL a3, b2, t2
+ unop
+
+ ADD s3, t3, s3
+ LD a2, 0 * SIZE(X)
+ MUL a3, b3, t3
+ LD a3, 1 * SIZE(X)
+
+ ADD s0, t0, s0
+ LD b2, 0 * SIZE(Y)
+ MUL a4, b4, t0
+ LD b3, 1 * SIZE(Y)
+
+ ADD s1, t1, s1
+ SXADDQ INCX, X, X
+ MUL a4, b5, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s2
+ unop
+ MUL a5, b4, t2
+ unop
+
+ ADD s3, t3, s3
+ LD a4, 0 * SIZE(X)
+ MUL a5, b5, t3
+ LD a5, 1 * SIZE(X)
+
+ ADD s0, t0, s0
+ LD b4, 0 * SIZE(Y)
+ MUL a6, b6, t0
+ LD b5, 1 * SIZE(Y)
+
+ ADD s1, t1, s1
+ SXADDQ INCX, X, X
+ MUL a6, b7, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s2
+ unop
+ MUL a7, b6, t2
+ unop
+
+ ADD s3, t3, s3
+ LD a6, 0 * SIZE(X)
+ MUL a7, b7, t3
+ LD a7, 1 * SIZE(X)
+
+ ADD s0, t0, s0
+ LD b6, 0 * SIZE(Y)
+ MUL a0, b0, t0
+ LD b7, 1 * SIZE(Y)
+
+ ADD s1, t1, s1
+ SXADDQ INCX, X, X
+ MUL a0, b1, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s2
+ unop
+ MUL a1, b0, t2
+ unop
+
+ ADD s3, t3, s3
+ LD a0, 0 * SIZE(X)
+ MUL a1, b1, t3
+ LD a1, 1 * SIZE(X)
+
+ ADD s0, t0, s0
+ LD b0, 0 * SIZE(Y)
+ MUL a2, b2, t0
+ LD b1, 1 * SIZE(Y)
+
+ ADD s1, t1, s1
+ SXADDQ INCX, X, X
+ MUL a2, b3, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s2
+ unop
+ MUL a3, b2, t2
+ unop
+
+ ADD s3, t3, s3
+ LD a2, 0 * SIZE(X)
+ MUL a3, b3, t3
+ LD a3, 1 * SIZE(X)
+
+ ADD s0, t0, s0
+ LD b2, 0 * SIZE(Y)
+ MUL a4, b4, t0
+ LD b3, 1 * SIZE(Y)
+
+ ADD s1, t1, s1
+ SXADDQ INCX, X, X
+ MUL a4, b5, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s2
+ unop
+ MUL a5, b4, t2
+ subl I, 1, I
+
+ ADD s3, t3, s3
+ LD a4, 0 * SIZE(X)
+ MUL a5, b5, t3
+ LD a5, 1 * SIZE(X)
+
+ ADD s0, t0, s0
+ LD b4, 0 * SIZE(Y)
+ MUL a6, b6, t0
+ LD b5, 1 * SIZE(Y)
+
+ ADD s1, t1, s1
+ SXADDQ INCX, X, X
+ MUL a6, b7, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s2
+ LD a6, 0 * SIZE(X)
+ MUL a7, b6, t2
+ unop
+
+ ADD s3, t3, s3
+ LD b6, 0 * SIZE(Y)
+ MUL a7, b7, t3
+ bgt I, $L22
+ .align 4
+
+$L23:
+ ADD s0, t0, s0
+ LD a7, 1 * SIZE(X)
+ MUL a0, b0, t0
+ LD b7, 1 * SIZE(Y)
+
+ ADD s1, t1, s1
+ SXADDQ INCX, X, X
+ MUL a0, b1, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s2
+ unop
+ MUL a1, b0, t2
+ unop
+
+ ADD s3, t3, s3
+ LD a0, 0 * SIZE(X)
+ MUL a1, b1, t3
+ LD a1, 1 * SIZE(X)
+
+ ADD s0, t0, s0
+ LD b0, 0 * SIZE(Y)
+ MUL a2, b2, t0
+ LD b1, 1 * SIZE(Y)
+
+ ADD s1, t1, s1
+ SXADDQ INCX, X, X
+ MUL a2, b3, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s2
+ unop
+ MUL a3, b2, t2
+ unop
+
+ ADD s3, t3, s3
+ LD a2, 0 * SIZE(X)
+ MUL a3, b3, t3
+ LD a3, 1 * SIZE(X)
+
+ ADD s0, t0, s0
+ LD b2, 0 * SIZE(Y)
+ MUL a4, b4, t0
+ LD b3, 1 * SIZE(Y)
+
+ ADD s1, t1, s1
+ SXADDQ INCX, X, X
+ MUL a4, b5, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s2
+ unop
+ MUL a5, b4, t2
+ unop
+
+ ADD s3, t3, s3
+ LD a4, 0 * SIZE(X)
+ MUL a5, b5, t3
+ LD a5, 1 * SIZE(X)
+
+ ADD s0, t0, s0
+ LD b4, 0 * SIZE(Y)
+ MUL a6, b6, t0
+ LD b5, 1 * SIZE(Y)
+
+ ADD s1, t1, s1
+ SXADDQ INCX, X, X
+ MUL a6, b7, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s2
+ unop
+ MUL a7, b6, t2
+ unop
+
+ ADD s3, t3, s3
+ LD a6, 0 * SIZE(X)
+ MUL a7, b7, t3
+ LD a7, 1 * SIZE(X)
+
+ ADD s0, t0, s0
+ LD b6, 0 * SIZE(Y)
+ MUL a0, b0, t0
+ LD b7, 1 * SIZE(Y)
+
+ ADD s1, t1, s1
+ SXADDQ INCX, X, X
+ MUL a0, b1, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s2
+ MUL a1, b0, t2
+ ADD s3, t3, s3
+ MUL a1, b1, t3
+
+ ADD s0, t0, s0
+ MUL a2, b2, t0
+ ADD s1, t1, s1
+ MUL a2, b3, t1
+
+ ADD s2, t2, s2
+ MUL a3, b2, t2
+ ADD s3, t3, s3
+ MUL a3, b3, t3
+
+ ADD s0, t0, s0
+ MUL a4, b4, t0
+ ADD s1, t1, s1
+ MUL a4, b5, t1
+
+ ADD s2, t2, s2
+ MUL a5, b4, t2
+ ADD s3, t3, s3
+ MUL a5, b5, t3
+
+ ADD s0, t0, s0
+ MUL a6, b6, t0
+ ADD s1, t1, s1
+ MUL a6, b7, t1
+
+ ADD s2, t2, s2
+ MUL a7, b6, t2
+ ADD s3, t3, s3
+ MUL a7, b7, t3
+ .align 4
+
+$L25:
+ and N, 7, I
+ unop
+ unop
+ ble I, $L998
+
+ LD a0, 0 * SIZE(X)
+ LD a1, 1 * SIZE(X)
+ LD b0, 0 * SIZE(Y)
+ LD b1, 1 * SIZE(Y)
+
+ SXADDQ INCX, X, X
+ subl I, 1, I
+ SXADDQ INCY, Y, Y
+ ble I, $L28
+ .align 4
+
+$L26:
+ ADD s0, t0, s0
+ mov X, XX
+ MUL a0, b0, t0
+ mov Y, YY
+
+ ADD s1, t1, s1
+ SXADDQ INCX, X, X
+ MUL a0, b1, t1
+ SXADDQ INCY, Y, Y
+
+ ADD s2, t2, s2
+ LD a0, 0 * SIZE(XX)
+ MUL a1, b0, t2
+ LD b0, 0 * SIZE(YY)
+
+ ADD s3, t3, s3
+ subl I, 1, I
+ MUL a1, b1, t3
+ LD a1, 1 * SIZE(XX)
+
+ LD b1, 1 * SIZE(YY)
+ bgt I, $L26
+ .align 4
+
+$L28:
+ ADD s0, t0, s0
+ MUL a0, b0, t0
+ ADD s1, t1, s1
+ MUL a0, b1, t1
+
+ ADD s2, t2, s2
+ MUL a1, b0, t2
+ ADD s3, t3, s3
+ MUL a1, b1, t3
+ .align 4
+
+$L998:
+ ADD s0, t0, s0
+ ADD s1, t1, s1
+ ADD s2, t2, s2
+ ADD s3, t3, s3
+
+#ifndef CONJ
+ SUB s0, s3, s0
+ ADD s1, s2, s1
+#else
+ ADD s0, s3, s0
+ SUB s1, s2, s1
+#endif
+ .align 4
+
+$L999:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+
+ fldd $f6, 32($sp)
+ ldi $sp, 40($sp)
+ ret
+
+ EPILOGUE
diff --git a/kernel/sw_64/zgemm_beta.S b/kernel/sw_64/zgemm_beta.S
new file mode 100644
index 0000000..18f845c
--- /dev/null
+++ b/kernel/sw_64/zgemm_beta.S
@@ -0,0 +1,192 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+ .set noat
+ .set noreorder
+.text
+ .align 5
+ .globl CNAME
+ .ent CNAME
+CNAME:
+ .frame $sp, 0, $26, 0
+
+#ifdef PROFILE
+ ldgp $gp, 0($27)
+ ldi $28, _mcount
+ jsr $28, ($28), _mcount
+ .prologue 1
+#else
+ .prologue 0
+#endif
+
+ ldl $18, 24($sp)
+ ble $16, $End
+ ldl $19, 32($sp)
+ ble $17, $End
+
+ addl $19, $19, $19
+ fbne $f19,$Main
+ fbne $f20,$Main
+ .align 4
+
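+/* beta == 0 (both $f19 and $f20 are zero): just store zeros over the
+   m x n block of C, one column per pass of $L13. */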
+$L13:
+ mov $18, $1
+ ldi $17, -1($17)
+ SXADDQ $19, $18, $18
+ mov $16, $2
+ .align 4
+
+$L12:
+ ST $f31, 0*SIZE($1)
+ ST $f31, 1*SIZE($1)
+ ldi $2, -1($2)
+ ldi $1, 2*SIZE($1)
+ bgt $2, $L12
+ bgt $17,$L13
+ clr $0
+ ret
+ .align 4
+
+/* Main routine: scale each column of C by the complex beta held in $f19 (real part) and $f20 (imaginary part) */
+$Main:
+ sra $16, 1, $2 # $2 = (m >> 1)
+ mov $18, $1 # c_offset = c
+ ldi $17, -1($17) # n --
+ SXADDQ $19, $18, $18 # c += ldc
+ beq $2, $L18
+
+ LD $f14, 0*SIZE($1)
+ LD $f15, 1*SIZE($1)
+ LD $f24, 2*SIZE($1)
+ LD $f25, 3*SIZE($1)
+ ldi $2, -1($2) # $2 --
+ ble $2, $L19
+ .align 4
+
+
+$L23:
+ MUL $f19, $f14, $f10
+ fillcs 9*SIZE($1)
+ MUL $f20, $f15, $f11
+ ldi $2, -1($2)
+
+ MUL $f19, $f15, $f12
+ LD $f15, 5*SIZE($1)
+ MUL $f20, $f14, $f13
+ LD $f14, 4*SIZE($1)
+
+ MUL $f19, $f24, $f16
+ unop
+ MUL $f20, $f25, $f17
+ unop
+
+ MUL $f19, $f25, $f18
+ LD $f25, 7*SIZE($1)
+ SUB $f10, $f11, $f22
+ unop
+
+ MUL $f20, $f24, $f21
+ LD $f24, 6*SIZE($1)
+ ADD $f12, $f13, $f23
+ ldi $1, 4*SIZE($1)
+
+ SUB $f16, $f17, $f26
+ ADD $f18, $f21, $f27
+ ST $f22,-4*SIZE($1)
+ ST $f23,-3*SIZE($1)
+
+ ST $f26,-2*SIZE($1)
+ ST $f27,-1*SIZE($1)
+ unop
+ bgt $2,$L23
+ .align 4
+
+$L19:
+ MUL $f19, $f14, $f10
+ MUL $f20, $f15, $f11
+ MUL $f19, $f15, $f12
+ MUL $f20, $f14, $f13
+
+ MUL $f19, $f24, $f16
+ MUL $f20, $f25, $f17
+ MUL $f19, $f25, $f18
+ MUL $f20, $f24, $f21
+
+ SUB $f10, $f11, $f22
+ ADD $f12, $f13, $f23
+ SUB $f16, $f17, $f26
+ ADD $f18, $f21, $f27
+ ldi $1, 4*SIZE($1)
+
+ ST $f22, -4*SIZE($1)
+ ST $f23, -3*SIZE($1)
+ ST $f26, -2*SIZE($1)
+ ST $f27, -1*SIZE($1)
+
+ blbs $16, $L18
+ bgt $17, $Main
+ clr $0
+ ret
+ .align 4
+
+$L18:
+ LD $f14, 0*SIZE($1)
+ LD $f15, 1*SIZE($1)
+ MUL $f19, $f15, $f13
+ MUL $f20, $f14, $f10
+
+ MUL $f19, $f14, $f12
+ MUL $f20, $f15, $f11
+ ADD $f13, $f10, $f26
+ SUB $f12, $f11, $f27
+
+ ST $f26, 1*SIZE($1)
+ ST $f27, 0*SIZE($1)
+ ldi $1, 2*SIZE($1)
+ bgt $17, $Main
+ .align 4
+
+$End:
+ clr $0
+ ret
+ .ident VERSION
+ .end CNAME
diff --git a/kernel/sw_64/zgemm_kernel_2x2.S b/kernel/sw_64/zgemm_kernel_2x2.S
new file mode 100644
index 0000000..6cf954b
--- /dev/null
+++ b/kernel/sw_64/zgemm_kernel_2x2.S
@@ -0,0 +1,1949 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#if !defined(EV4) && !defined(EV5) && !defined(SW6)
+#error "Architecture is not specified."
+#endif
+
+#ifdef SW6
+#define PREFETCHSIZE 56
+#define UNOP unop
+#endif
+
+#ifdef EV5
+#define PREFETCHSIZE 48
+#define UNOP
+#endif
+
+#ifdef EV4
+#define UNOP
+#endif
+
+ .set noat
+ .set noreorder
+ .arch sw6a
+
+.text
+ .align 5
+ .globl CNAME
+ .ent CNAME
+
+#define STACKSIZE 88
+
+#define M $16
+#define N $17
+#define K $18
+#define A $21
+#define B $22
+#define C $20
+#define LDC $23
+
+#define C1 $19
+#define C2 $24
+
+#define AO $at
+#define BO $5
+#define I $6
+#define J $7
+#define L $8
+
+#define a1 $f16
+#define a2 $f17
+#define a3 $f18
+#define a4 $f19
+
+#define b1 $f20
+#define b2 $f21
+#define b3 $f22
+#define b4 $f23
+
+#define t1 $f24
+#define t2 $f25
+#define t3 $f26
+#define t4 $f27
+
+#define a5 $f28
+#define a6 $f30
+#define b5 $f29
+
+#define alpha_i $f29
+#define alpha_r $f30
+
+#define c01 $f0
+#define c02 $f1
+#define c03 $f2
+#define c04 $f3
+
+#define c05 $f4
+#define c06 $f5
+#define c07 $f6
+#define c08 $f7
+
+#define c09 $f8
+#define c10 $f9
+#define c11 $f10
+#define c12 $f11
+
+#define c13 $f12
+#define c14 $f13
+#define c15 $f14
+#define c16 $f15
+
+#define TMP1 $0
+#define TMP2 $1
+#define KK $2
+#define BB $3
+#define OFFSET $4
+
+#define tmp $9
+
+#define ALPHA_R 64($sp)
+#define ALPHA_I 72($sp)
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#define ADD1 ADD
+#define ADD2 SUB
+#define ADD3 ADD
+#define ADD4 ADD
+#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
+#define ADD1 ADD
+#define ADD2 ADD
+#define ADD3 SUB
+#define ADD4 ADD
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
+#define ADD1 ADD
+#define ADD2 ADD
+#define ADD3 ADD
+#define ADD4 SUB
+#else
+#define ADD1 ADD
+#define ADD2 SUB
+#define ADD3 SUB
+#define ADD4 SUB
+#endif
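+/* ADD1..ADD4 select add or subtract for the four partial products of each
+   complex multiply, giving the sign pattern required by the plain,
+   conjugate-A, conjugate-B and conjugate-both variants of the kernel. */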
+
+CNAME:
+ .frame $sp, STACKSIZE, $26, 0
+
+#ifdef PROFILE
+ ldgp $gp, 0($27)
+ ldi $at, _mcount
+ jsr $at, ($at), _mcount
+#endif
+
+#ifndef PROFILE
+ .prologue 0
+#else
+ .prologue 1
+#endif
+
+ ldi $sp, -STACKSIZE($sp)
+
+ ldl B, 0 + STACKSIZE($sp)
+ ldl C, 8 + STACKSIZE($sp)
+ ldl LDC, 16 + STACKSIZE($sp)
+#ifdef TRMMKERNEL
+ ldl OFFSET, 24 + STACKSIZE($sp)
+#endif
+
+ sll LDC, ZBASE_SHIFT, LDC
+
+ fstd $f2, 0($sp)
+ fstd $f3, 8($sp)
+ fstd $f4, 16($sp)
+ fstd $f5, 24($sp)
+ fstd $f6, 32($sp)
+ fstd $f7, 40($sp)
+ fstd $f8, 48($sp)
+ fstd $f9, 56($sp)
+ fstd $f19, ALPHA_R
+ fstd $f20, ALPHA_I
+ stl tmp, 80($sp)
+ cmple M, 0, $0
+ cmple N, 0, $1
+ cmple K, 0, $2
+
+ or $0, $1, $0
+ or $0, $2, $0
+ bne $0, $L999
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ subl $31, OFFSET, KK
+#endif
+
+ sra N, 1, J
+ ble J, $L30
+ .align 4
+
+$L01:
+ mov C, C1
+ addl C, LDC, C2
+ mov A, AO
+ s4addl K, 0, BB
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mov OFFSET, KK
+#endif
+
+ SXADDQ BB, B, BB
+ addl C2, LDC, C
+ unop
+
+ sra M, 1, I
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+ fclr c01
+ fclr c05
+
+ ble I, $L20
+ .align 4
+
+$L11:
+#ifndef EV4
+ fillcs 0 * SIZE(BB)
+ fillcs 8 * SIZE(BB)
+ unop
+ ldi BB, 16 * SIZE(BB)
+#endif
+
+#if !defined(TRMMKERNEL) || \
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+
+#ifdef TRMMKERNEL
+#ifdef LEFT
+ addl KK, 2, TMP1
+#else
+ addl KK, 2, TMP1
+#endif
+#endif
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(B)
+ fclr c10
+ LD b2, 1 * SIZE(B)
+ fclr c14
+
+ LD b3, 2 * SIZE(B)
+ fclr c03
+ LD b4, 3 * SIZE(B)
+ fclr c07
+
+ ldi BO, 4 * SIZE(B)
+ fclr c11
+ ldi AO, 4 * SIZE(AO)
+ fclr c15
+
+ fillcs 4 * SIZE(C1)
+ fclr c04
+#ifndef TRMMKERNEL
+ ldi L, -2(K)
+#else
+ ldi L, -2(TMP1)
+#endif
+ fclr c08
+
+ fillcs 4 * SIZE(C2)
+ fclr c12
+ fclr c16
+ ble L, $L15
+#else
+ sll KK, ZBASE_SHIFT + 1, TMP1
+ addl AO, TMP1, AO
+ addl B, TMP1, BO
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(BO)
+ fclr c10
+ LD b2, 1 * SIZE(BO)
+ fclr c14
+
+ LD b3, 2 * SIZE(BO)
+ fclr c03
+ LD b4, 3 * SIZE(BO)
+ fclr c07
+
+ ldi BO, 4 * SIZE(BO)
+ fclr c11
+ ldi AO, 4 * SIZE(AO)
+ fclr c15
+
+ fillcs 4 * SIZE(C1)
+ fclr c04
+ ldi L, -2(TMP1)
+ fclr c08
+
+ fillcs 4 * SIZE(C2)
+ fclr c12
+ fclr c16
+ ble L, $L15
+#endif
+ .align 5
+
+$L12:
+/* 1 */
+ ADD1 c11, t1, a6
+ fmov a6, c11
+#ifndef EV4
+ fillcs PREFETCHSIZE * SIZE(AO)
+#else
+ unop
+#endif
+ MUL b1, a1, t1
+#ifndef EV4
+ fillcs PREFETCHSIZE * SIZE(BO)
+#else
+ unop
+#endif
+
+ ADD3 c12, t2, a6
+ fmov a6, c12
+ unop
+ MUL b1, a2, t2
+ unop
+
+ ADD2 c16, t3, a6
+ fmov a6, c16
+ unop
+ MUL b2, a2, t3
+ LD a5, 0 * SIZE(AO)
+
+ ADD4 c15, t4, a6
+ fmov a6, c15
+ unop
+ MUL b2, a1, t4
+ LD b5, 0 * SIZE(BO)
+
+/* 2 */
+ ADD1 c01, t1, a6
+ fmov a6, c01
+ UNOP
+ MUL b1, a3, t1
+ UNOP
+
+ ADD3 c02, t2, a6
+ fmov a6, c02
+ UNOP
+ MUL b1, a4, t2
+ UNOP
+
+ ADD2 c06, t3, a6
+ fmov a6, c06
+ unop
+ MUL b2, a4, t3
+ unop
+
+ ADD4 c05, t4, a6
+ fmov a6, c05
+ unop
+ MUL b4, a1, t4
+ unop
+
+/* 3 */
+ ADD1 c03, t1, a6
+ fmov a6, c03
+ unop
+ MUL b3, a1, t1
+ unop
+
+ ADD3 c04, t2, a6
+ fmov a6, c04
+ unop
+ MUL b3, a2, t2
+ unop
+
+ ADD2 c08, t3, a6
+ fmov a6, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, 1 * SIZE(AO)
+
+ ADD4 c13, t4, a6
+ fmov a6, c13
+ unop
+ MUL b2, a3, t4
+ LD b2, 1 * SIZE(BO)
+
+/* 4 */
+ ADD1 c09, t1, a6
+ fmov a6, c09
+ unop
+ MUL b3, a3, t1
+ LD a6, 2 * SIZE(AO)
+ FIMOVD a6, tmp
+
+ ADD3 c10, t2, a6
+ fmov a6, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD2 c14, t3, a6
+ fmov a6, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, 3 * SIZE(AO)
+
+ ADD4 c07, t4, a6
+ fmov a6, c07
+ unop
+ MUL b4, a3, t4
+ LD b4, 3 * SIZE(BO)
+
+/* 5 */
+ ADD1 c11, t1, a6
+ fmov a6, c11
+ unop
+ MUL b5, a5, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD3 c12, t2, a6
+ fmov a6, c12
+ ldi L, -2(L)
+ MUL b5, a2, t2
+ LD b1, 4 * SIZE(BO)
+
+ ADD2 c16, t3, a6
+ fmov a6, c16
+ unop
+ MUL b2, a2, t3
+ unop
+
+ ADD4 c15, t4, a6
+ fmov a6, c15
+ unop
+ MUL b2, a5, t4
+ unop
+
+/* 6 */
+ ADD1 c01, t1, a6
+ fmov a6, c01
+ unop
+ IFMOVD tmp, a6
+ MUL b5, a6, t1
+ unop
+
+ ADD3 c02, t2, a6
+ fmov a6, c02
+ unop
+ MUL b5, a4, t2
+ unop
+
+ ADD2 c06, t3, a6
+ fmov a6, c06
+ unop
+ MUL b2, a4, t3
+ unop
+
+ ADD4 c05, t4, a6
+ fmov a6, c05
+ unop
+ MUL b4, a5, t4
+ unop
+
+/* 7 */
+ ADD1 c03, t1, a6
+ fmov a6, c03
+ ldi AO, 8 * SIZE(AO)
+ MUL b3, a5, t1
+ unop
+
+ ADD3 c04, t2, a6
+ fmov a6, c04
+ ldi BO, 8 * SIZE(BO)
+ MUL b3, a2, t2
+ unop
+
+ ADD2 c08, t3, a6
+ fmov a6, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, -3 * SIZE(AO)
+
+ ADD4 c13, t4, a6
+ fmov a6, c13
+ unop
+ IFMOVD tmp, a6
+ MUL b2, a6, t4
+ LD b2, -3 * SIZE(BO)
+
+/* 8 */
+ ADD1 c09, t1, a6
+ fmov a6, c09
+ unop
+ IFMOVD tmp, a6
+ MUL b3, a6, t1
+ LD a3, -2 * SIZE(AO)
+
+ ADD3 c10, t2, a6
+ fmov a6, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, -2 * SIZE(BO)
+
+ ADD2 c14, t3, a6
+ fmov a6, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, -1 * SIZE(AO)
+
+ ADD4 c07, t4, a6
+ fmov a6, c07
+ IFMOVD tmp, a6
+ MUL b4, a6, t4
+ LD b4, -1 * SIZE(BO)
+ bgt L, $L12
+ .align 4
+
+$L15:
+ ADD1 c11, t1, a6
+ fmov a6, c11
+ fldd alpha_r, ALPHA_R
+ FIMOVD alpha_r, tmp
+ MUL b1, a1, t1
+#ifndef TRMMKERNEL
+ blbs K, $L18
+#else
+ blbs TMP1, $L18
+#endif
+ .align 4
+
+ ADD3 c12, t2, a6
+ fmov a6, c12
+ MUL b1, a2, t2
+ ADD2 c16, t3, a6
+ fmov a6, c16
+ MUL b2, a2, t3
+
+ ADD4 c15, t4, a6
+ fmov a6, c15
+ MUL b2, a1, t4
+ ADD1 c01, t1, a6
+ fmov a6, c01
+ MUL b1, a3, t1
+
+ ADD3 c02, t2, a6
+ fmov a6, c02
+ unop
+ MUL b1, a4, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD2 c06, t3, a6
+ fmov a6, c06
+ MUL b2, a4, t3
+ ADD4 c05, t4, a6
+ fmov a6, c05
+ MUL b4, a1, t4
+
+ ADD1 c03, t1, a6
+ fmov a6, c03
+ unop
+ MUL b3, a1, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD3 c04, t2, a6
+ fmov a6, c04
+ unop
+ MUL b3, a2, t2
+ unop
+
+ ADD2 c08, t3, a6
+ fmov a6, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, 1 * SIZE(AO)
+
+ ADD4 c13, t4, a6
+ fmov a6, c13
+ unop
+ MUL b2, a3, t4
+ LD b2, 1 * SIZE(BO)
+
+ ADD1 c09, t1, a6
+ fmov a6, c09
+ unop
+ MUL b3, a3, t1
+ ldi AO, 4 * SIZE(AO)
+
+ ADD3 c10, t2, a6
+ fmov a6, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD2 c14, t3, a6
+ fmov a6, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, -1 * SIZE(AO)
+
+ ADD4 c07, t4, a6
+ fmov a6, c07
+ unop
+ MUL b4, a3, t4
+ LD a3, -2 * SIZE(AO)
+
+ ADD1 c11, t1, a6
+ fmov a6, c11
+ LD b4, 3 * SIZE(BO)
+ MUL b1, a1, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L18:
+ ADD3 c12, t2, a6
+ fmov a6, c12
+ unop
+ MUL b1, a2, t2
+ fldd alpha_i, ALPHA_I
+
+ ADD2 c16, t3, a6
+ fmov a6, c16
+ unop
+ MUL b2, a2, t3
+#ifndef TRMMKERNEL
+ LD a5, 0 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD4 c15, t4, a6
+ fmov a6, c15
+ MUL b2, a1, t4
+ ADD1 c01, t1, a6
+ fmov a6, c01
+ MUL b1, a3, t1
+
+ ADD3 c02, t2, a6
+ fmov a6, c02
+ unop
+ MUL b1, a4, t2
+#ifndef TRMMKERNEL
+ LD b1, 1 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD2 c06, t3, a6
+ fmov a6, c06
+ MUL b2, a4, t3
+ ADD4 c05, t4, a6
+ fmov a6, c05
+ MUL b4, a1, t4
+
+ ADD1 c03, t1, a6
+ fmov a6, c03
+ unop
+ MUL b3, a1, t1
+#ifndef TRMMKERNEL
+ LD a1, 2 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD3 c04, t2, a6
+ fmov a6, c04
+ unop
+ MUL b3, a2, t2
+ unop
+
+ ADD2 c08, t3, a6
+ fmov a6, c08
+ unop
+ MUL b4, a2, t3
+#ifndef TRMMKERNEL
+ LD a2, 3 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD4 c13, t4, a6
+ fmov a6, c13
+ unop
+ MUL b2, a3, t4
+#ifndef TRMMKERNEL
+ LD b2, 0 * SIZE(C2)
+#else
+ unop
+#endif
+
+ ADD1 c09, t1, a6
+ fmov a6, c09
+ ldi I, -1(I)
+ MUL b3, a3, t1
+ unop
+
+ ADD3 c10, t2, a6
+ fmov a6, c10
+ unop
+ MUL b3, a4, t2
+#ifndef TRMMKERNEL
+ LD b3, 1 * SIZE(C2)
+#else
+ unop
+#endif
+
+ ADD2 c14, t3, a6
+ fmov a6, c14
+ unop
+ MUL b4, a4, t3
+#ifndef TRMMKERNEL
+ LD a4, 2 * SIZE(C2)
+#else
+ unop
+#endif
+
+ ADD4 c07, t4, a6
+ fmov a6, c07
+ unop
+ MUL b4, a3, t4
+#ifndef TRMMKERNEL
+ LD a3, 3 * SIZE(C2)
+#else
+ unop
+#endif
+
+ ADD1 c11, t1, a6
+ fmov a6, c11
+ ADD3 c12, t2, a6
+ fmov a6, c12
+ ADD2 c16, t3, a6
+ fmov a6, c16
+ ADD4 c15, t4, a6
+ fmov a6, c15
+
+ ADD c01, c06, a6
+ fmov a6, c01
+ ADD c02, c05, a6
+ fmov a6, c02
+ ADD c03, c08, a6
+ fmov a6, c03
+ ADD c04, c07, a6
+ fmov a6, c04
+
+ ADD c09, c14, a6
+ fmov a6, c09
+ IFMOVD tmp, alpha_r
+ MUL alpha_r, c01, t1
+ ADD c10, c13, a6
+ fmov a6, c10
+ IFMOVD tmp, alpha_r
+ MUL alpha_r, c02, t2
+
+ ADD c11, c16, a6
+ fmov a6, c11
+ IFMOVD tmp, alpha_r
+ MUL alpha_r, c03, t3
+ ADD c12, c15, a6
+ fmov a6, c12
+ IFMOVD tmp, alpha_r
+ MUL alpha_r, c04, t4
+
+#ifndef TRMMKERNEL
+ ADD a5, t1, a6
+ fmov a6, a5
+ MUL alpha_i, c02, t1
+ ADD b1, t2, a6
+ fmov a6, b1
+ MUL alpha_i, c01, t2
+
+ ADD a1, t3, a6
+ fmov a6, a1
+ MUL alpha_i, c04, t3
+ ADD a2, t4, a6
+ fmov a6, a2
+ MUL alpha_i, c03, t4
+#else
+ ADD $f31, t1, a5
+ MUL alpha_i, c02, t1
+ ADD $f31, t2, b1
+ MUL alpha_i, c01, t2
+
+ ADD $f31, t3, a1
+ MUL alpha_i, c04, t3
+ ADD $f31, t4, a2
+ MUL alpha_i, c03, t4
+#endif
+
+ SUB a5, t1, a6
+ fmov a6, a5
+ IFMOVD tmp, alpha_r
+ MUL alpha_r, c09, t1
+ ADD b1, t2, a6
+ fmov a6, b1
+ IFMOVD tmp, alpha_r
+ MUL alpha_r, c10, t2
+
+ SUB a1, t3, a6
+ fmov a6, a1
+ IFMOVD tmp, alpha_r
+ MUL alpha_r, c11, t3
+ ADD a2, t4, a6
+ fmov a6, a2
+ IFMOVD tmp, alpha_r
+ MUL alpha_r, c12, t4
+
+#ifndef TRMMKERNEL
+ ADD b2, t1, a6
+ fmov a6, b2
+ MUL alpha_i, c10, t1
+ ADD b3, t2, a6
+ fmov a6, b3
+ MUL alpha_i, c09, t2
+
+ ADD a4, t3, a6
+ fmov a6, a4
+ MUL alpha_i, c12, t3
+ ADD a3, t4, a6
+ fmov a6, a3
+ MUL alpha_i, c11, t4
+#else
+ ADD $f31, t1, b2
+ MUL alpha_i, c10, t1
+ ADD $f31, t2, b3
+ MUL alpha_i, c09, t2
+
+ ADD $f31, t3, a4
+ MUL alpha_i, c12, t3
+ ADD $f31, t4, a3
+ MUL alpha_i, c11, t4
+#endif
+
+ SUB b2, t1, a6
+ fmov a6, b2
+ ST a5, 0 * SIZE(C1)
+ fclr t1
+ unop
+
+ ADD b3, t2, a6
+ fmov a6, b3
+ ST b1, 1 * SIZE(C1)
+ fclr t2
+ unop
+
+ SUB a4, t3, a6
+ fmov a6, a4
+ ST a1, 2 * SIZE(C1)
+ fclr t3
+ unop
+
+ ADD a3, t4, a6
+ fmov a6, a3
+ ST a2, 3 * SIZE(C1)
+ fclr t4
+ unop
+
+ ST b2, 0 * SIZE(C2)
+ fclr c01
+ ST b3, 1 * SIZE(C2)
+ fclr c05
+
+ ST a4, 2 * SIZE(C2)
+ ldi C1, 4 * SIZE(C1)
+ ST a3, 3 * SIZE(C2)
+ ldi C2, 4 * SIZE(C2)
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+ subl K, KK, TMP1
+#ifdef LEFT
+ subl TMP1, 2, TMP1
+#else
+ subl TMP1, 2, TMP1
+#endif
+ sll TMP1, ZBASE_SHIFT + 1, TMP1
+ addl AO, TMP1, AO
+ addl BO, TMP1, BO
+#endif
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ addl KK, 2, KK
+#endif
+ bgt I, $L11
+ .align 4
+
+$L20:
+ and M, 1, I
+ ble I, $L29
+
+#if !defined(TRMMKERNEL) || \
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+
+#ifdef TRMMKERNEL
+#ifdef LEFT
+ addl KK, 1, TMP1
+#else
+ addl KK, 2, TMP1
+#endif
+#endif
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(B)
+ fclr c10
+ LD b2, 1 * SIZE(B)
+ fclr c14
+
+ LD b3, 2 * SIZE(B)
+ ldi AO, 2 * SIZE(AO)
+ LD b4, 3 * SIZE(B)
+ ldi BO, 4 * SIZE(B)
+
+#ifndef TRMMKERNEL
+ ldi L, -2(K)
+#else
+ ldi L, -2(TMP1)
+#endif
+ ble L, $L25
+#else
+ sll KK, ZBASE_SHIFT + 0, TMP1
+ addl AO, TMP1, AO
+ sll KK, ZBASE_SHIFT + 1, TMP1
+ addl B, TMP1, BO
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(BO)
+ fclr c10
+ LD b2, 1 * SIZE(BO)
+ fclr c14
+
+ LD b3, 2 * SIZE(BO)
+ ldi AO, 2 * SIZE(AO)
+ LD b4, 3 * SIZE(BO)
+ ldi BO, 4 * SIZE(BO)
+
+ ldi L, -2(TMP1)
+ ble L, $L25
+#endif
+ .align 5
+
+$L22:
+ ADD1 c09, t1, a6
+ fmov a6, c09
+ unop
+ MUL a1, b1, t1
+ unop
+
+ ADD3 c10, t2, a6
+ fmov a6, c10
+ unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD4 c13, t3, a6
+ fmov a6, c13
+ unop
+ MUL a1, b2, t3
+ ldi BO, 8 * SIZE(BO)
+
+ ADD2 c14, t4, a6
+ fmov a6, c14
+ unop
+ MUL a2, b2, t4
+ LD b2, -7 * SIZE(BO)
+
+ ADD1 c01, t1, a6
+ fmov a6, c01
+ unop
+ MUL a1, b3, t1
+ unop
+
+ ADD3 c02, t2, a6
+ fmov a6, c02
+ unop
+ MUL a2, b3, t2
+ LD b3, -6 * SIZE(BO)
+
+ ADD4 c05, t3, a6
+ fmov a6, c05
+ unop
+ MUL a1, b4, t3
+ LD a1, 2 * SIZE(AO)
+
+ ADD2 c06, t4, a6
+ fmov a6, c06
+ MUL a2, b4, t4
+ LD b5, -5 * SIZE(BO)
+
+ ADD1 c09, t1, a6
+ fmov a6, c09
+ unop
+ MUL a3, b1, t1
+ LD a2, 3 * SIZE(AO)
+
+ ADD3 c10, t2, a6
+ fmov a6, c10
+ unop
+ MUL a4, b1, t2
+ LD b1, -4 * SIZE(BO)
+
+ ADD4 c13, t3, a6
+ fmov a6, c13
+ unop
+ MUL a3, b2, t3
+ ldi AO, 4 * SIZE(AO)
+
+ ADD2 c14, t4, a6
+ fmov a6, c14
+ MUL a4, b2, t4
+ LD b2, -3 * SIZE(BO)
+
+ ADD1 c01, t1, a6
+ fmov a6, c01
+ ldi L, -2(L)
+ MUL a3, b3, t1
+ LD b4, -1 * SIZE(BO)
+
+ ADD3 c02, t2, a6
+ fmov a6, c02
+ unop
+ MUL a4, b3, t2
+ LD b3, -2 * SIZE(BO)
+
+ ADD4 c05, t3, a6
+ fmov a6, c05
+ unop
+ MUL a3, b5, t3
+ LD a3, 0 * SIZE(AO)
+
+ ADD2 c06, t4, a6
+ fmov a6, c06
+ MUL a4, b5, t4
+ LD a4, 1 * SIZE(AO)
+ bgt L, $L22
+ .align 4
+
+$L25:
+ ADD1 c09, t1, a6
+ fmov a6, c09
+ fldd alpha_r, ALPHA_R
+ FIMOVD alpha_r, tmp
+ MUL a1, b1, t1
+#ifndef TRMMKERNEL
+ blbs K, $L28
+#else
+ blbs TMP1, $L28
+#endif
+ .align 4
+
+ ADD3 c10, t2, a6
+ fmov a6, c10
+ unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD4 c13, t3, a6
+ fmov a6, c13
+ unop
+ MUL a1, b2, t3
+ unop
+
+ ADD2 c14, t4, a6
+ fmov a6, c14
+ unop
+ MUL a2, b2, t4
+ LD b2, 1 * SIZE(BO)
+
+ ADD1 c01, t1, a6
+ fmov a6, c01
+ unop
+ MUL a1, b3, t1
+ ldi AO, 2 * SIZE(AO)
+
+ ADD3 c02, t2, a6
+ fmov a6, c02
+ unop
+ MUL a2, b3, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD4 c05, t3, a6
+ fmov a6, c05
+ unop
+ MUL a1, b4, t3
+ LD a1, -2 * SIZE(AO)
+
+ ADD2 c06, t4, a6
+ fmov a6, c06
+ unop
+ MUL a2, b4, t4
+ LD a2, -1 * SIZE(AO)
+
+ ADD1 c09, t1, a6
+ fmov a6, c09
+ LD b4, 3 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L28:
+ ADD3 c10, t2, a6
+ fmov a6, c10
+ unop
+ MUL a2, b1, t2
+ fldd alpha_i, ALPHA_I
+
+ ADD4 c13, t3, a6
+ fmov a6, c13
+ unop
+ MUL a1, b2, t3
+#ifndef TRMMKERNEL
+ LD c03, 0 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD2 c14, t4, a6
+ fmov a6, c14
+ unop
+ MUL a2, b2, t4
+#ifndef TRMMKERNEL
+ LD c04, 1 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD1 c01, t1, a6
+ fmov a6, c01
+ unop
+ MUL a1, b3, t1
+#ifndef TRMMKERNEL
+ LD c11, 0 * SIZE(C2)
+#else
+ unop
+#endif
+
+ ADD3 c02, t2, a6
+ fmov a6, c02
+ unop
+ MUL a2, b3, t2
+#ifndef TRMMKERNEL
+ LD c12, 1 * SIZE(C2)
+#else
+ unop
+#endif
+
+ ADD4 c05, t3, a6
+ fmov a6, c05
+ MUL a1, b4, t3
+ ADD2 c06, t4, a6
+ fmov a6, c06
+ MUL a2, b4, t4
+
+ ADD1 c09, t1, a6
+ fmov a6, c09
+ ADD3 c10, t2, a6
+ fmov a6, c10
+ ADD4 c13, t3, a6
+ fmov a6, c13
+ ADD2 c14, t4, a6
+ fmov a6, c14
+
+ ADD c01, c06, a6
+ fmov a6, c01
+ ADD c02, c05, a6
+ fmov a6, c02
+ ADD c09, c14, a6
+ fmov a6, c09
+ ADD c10, c13, a6
+ fmov a6, c10
+
+ IFMOVD tmp, alpha_r
+ MUL alpha_r, c01, t1
+ MUL alpha_r, c02, t2
+ MUL alpha_r, c09, t3
+ MUL alpha_r, c10, t4
+
+#ifndef TRMMKERNEL
+ ADD c03, t1, a6
+ fmov a6, c03
+ MUL alpha_i, c02, t1
+ ADD c04, t2, a6
+ fmov a6, c04
+ MUL alpha_i, c01, t2
+
+ ADD c11, t3, a6
+ fmov a6, c11
+ MUL alpha_i, c10, t3
+ ADD c12, t4, a6
+ fmov a6, c12
+ MUL alpha_i, c09, t4
+#else
+ ADD $f31, t1, c03
+ MUL alpha_i, c02, t1
+ ADD $f31, t2, c04
+ MUL alpha_i, c01, t2
+
+ ADD $f31, t3, c11
+ MUL alpha_i, c10, t3
+ ADD $f31, t4, c12
+ MUL alpha_i, c09, t4
+#endif
+
+ SUB c03, t1, a6
+ fmov a6, c03
+ ADD c04, t2, a6
+ fmov a6, c04
+ SUB c11, t3, a6
+ fmov a6, c11
+ ADD c12, t4, a6
+ fmov a6, c12
+
+ ST c03, 0 * SIZE(C1)
+ ST c04, 1 * SIZE(C1)
+ ST c11, 0 * SIZE(C2)
+ ST c12, 1 * SIZE(C2)
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+ subl K, KK, TMP1
+#ifdef LEFT
+ subl TMP1, 1, TMP1
+#else
+ subl TMP1, 2, TMP1
+#endif
+ sll TMP1, ZBASE_SHIFT + 0, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ addl KK, 1, KK
+#endif
+ .align 4
+
+$L29:
+ mov BO, B
+ ldi J, -1(J)
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addl KK, 2, KK
+#else
+ unop
+#endif
+ bgt J, $L01
+ .align 4
+
+$L30:
+ and N, 1, J
+ ble J, $L999
+
+ mov C, C1
+ mov A, AO
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mov OFFSET, KK
+#endif
+
+ sra M, 1, I
+ ble I, $L50
+ .align 4
+
+$L41:
+#if !defined(TRMMKERNEL) || \
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+
+#ifdef TRMMKERNEL
+#ifdef LEFT
+ addl KK, 2, TMP1
+#else
+ addl KK, 1, TMP1
+#endif
+#endif
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c05
+ LD b3, 2 * SIZE(B)
+ fclr c02
+ LD b4, 3 * SIZE(B)
+ fclr c06
+
+ ldi BO, 2 * SIZE(B)
+ fclr c03
+ ldi AO, 4 * SIZE(AO)
+ fclr c07
+
+#ifndef TRMMKERNEL
+ ldi L, -2(K)
+#else
+ ldi L, -2(TMP1)
+#endif
+ fclr c04
+ fclr c08
+ ble L, $L45
+#else
+ sll KK, ZBASE_SHIFT + 1, TMP1
+ addl AO, TMP1, AO
+ sll KK, ZBASE_SHIFT + 0, TMP1
+ addl B, TMP1, BO
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c05
+ LD b3, 2 * SIZE(BO)
+ fclr c02
+ LD b4, 3 * SIZE(BO)
+ fclr c06
+
+ ldi BO, 2 * SIZE(BO)
+ fclr c03
+ ldi AO, 4 * SIZE(AO)
+ fclr c07
+
+ ldi L, -2(TMP1)
+ fclr c04
+ fclr c08
+ ble L, $L45
+#endif
+ .align 5
+
+$L42:
+ ADD4 c05, t1, a6
+ fmov a6, c05
+ unop
+ MUL a1, b1, t1
+ unop
+
+ ADD2 c06, t2, a6
+ fmov a6, c06
+ ldi L, -2(L)
+ MUL a2, b1, t2
+ unop
+
+ ADD4 c07, t3, a6
+ fmov a6, c07
+ unop
+ MUL a3, b1, t3
+ unop
+
+ ADD2 c08, t4, a6
+ fmov a6, c08
+ unop
+ MUL a4, b1, t4
+ LD b1, 2 * SIZE(BO)
+
+ ADD1 c01, t1, a6
+ fmov a6, c01
+ unop
+ MUL a1, b2, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD3 c02, t2, a6
+ fmov a6, c02
+ ldi BO, 4 * SIZE(BO)
+ MUL a2, b2, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD1 c03, t3, a6
+ fmov a6, c03
+ unop
+ MUL a3, b2, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD3 c04, t4, a6
+ fmov a6, c04
+ unop
+ MUL a4, b2, t4
+ LD a5, 3 * SIZE(AO)
+
+ ADD4 c05, t1, a6
+ fmov a6, c05
+ unop
+ MUL a1, b3, t1
+ LD b2, -1 * SIZE(BO)
+
+ ADD2 c06, t2, a6
+ fmov a6, c06
+ unop
+ MUL a2, b3, t2
+ unop
+
+ ADD4 c07, t3, a6
+ fmov a6, c07
+ unop
+ MUL a3, b3, t3
+ ldi AO, 8 * SIZE(AO)
+
+ ADD2 c08, t4, a6
+ fmov a6, c08
+ unop
+ MUL a5, b3, t4
+ LD b3, 0 * SIZE(BO)
+
+ ADD1 c01, t1, a6
+ fmov a6, c01
+ unop
+ MUL a1, b4, t1
+ LD a1, -4 * SIZE(AO)
+
+ ADD3 c02, t2, a6
+ fmov a6, c02
+ unop
+ MUL a2, b4, t2
+ LD a2, -3 * SIZE(AO)
+
+ ADD1 c03, t3, a6
+ fmov a6, c03
+ LD a4, -1 * SIZE(AO)
+ MUL a3, b4, t3
+ LD a3, -2 * SIZE(AO)
+
+ ADD3 c04, t4, a6
+ fmov a6, c04
+ MUL a5, b4, t4
+ LD b4, 1 * SIZE(BO)
+ bgt L, $L42
+ .align 4
+
+$L45:
+ ADD4 c05, t1, a6
+ fmov a6, c05
+ fldd alpha_r, ALPHA_R
+ FIMOVD alpha_r, tmp
+ MUL b1, a1, t1
+#ifndef TRMMKERNEL
+ blbs K, $L48
+#else
+ blbs TMP1, $L48
+#endif
+ .align 4
+
+ ADD2 c06, t2, a6
+ fmov a6, c06
+ MUL a2, b1, t2
+ ADD4 c07, t3, a6
+ fmov a6, c07
+ MUL a3, b1, t3
+
+ ADD2 c08, t4, a6
+ fmov a6, c08
+ unop
+ MUL a4, b1, t4
+ LD b1, 0 * SIZE(BO)
+
+ ADD1 c01, t1, a6
+ fmov a6, c01
+ unop
+ MUL a1, b2, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD3 c02, t2, a6
+ fmov a6, c02
+ unop
+ MUL a2, b2, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD1 c03, t3, a6
+ fmov a6, c03
+ unop
+ MUL a3, b2, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD3 c04, t4, a6
+ fmov a6, c04
+ MUL a4, b2, t4
+ LD a4, 3 * SIZE(AO)
+ ldi AO, 4 * SIZE(AO)
+
+ ADD4 c05, t1, a6
+ fmov a6, c05
+ LD b2, 1 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi BO, 2 * SIZE(BO)
+ .align 4
+
+$L48:
+ ADD2 c06, t2, a6
+ fmov a6, c06
+ unop
+ MUL a2, b1, t2
+ fldd alpha_i, ALPHA_I
+
+ ADD4 c07, t3, a6
+ fmov a6, c07
+ ldi I, -1(I)
+ MUL a3, b1, t3
+#ifndef TRMMKERNEL
+ LD c09, 0 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD2 c08, t4, a6
+ fmov a6, c08
+ unop
+ MUL a4, b1, t4
+#ifndef TRMMKERNEL
+ LD c10, 1 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD1 c01, t1, a6
+ fmov a6, c01
+ unop
+ MUL a1, b2, t1
+#ifndef TRMMKERNEL
+ LD c11, 2 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD3 c02, t2, a6
+ fmov a6, c02
+ unop
+ MUL a2, b2, t2
+#ifndef TRMMKERNEL
+ LD c12, 3 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD1 c03, t3, a6
+ fmov a6, c03
+ MUL a3, b2, t3
+ ADD3 c04, t4, a6
+ fmov a6, c04
+ MUL a4, b2, t4
+
+ ADD4 c05, t1, a6
+ fmov a6, c05
+ ADD2 c06, t2, a6
+ fmov a6, c06
+ ADD4 c07, t3, a6
+ fmov a6, c07
+ ADD2 c08, t4, a6
+ fmov a6, c08
+
+ ADD c01, c06, a6
+ fmov a6, c01
+ ADD c02, c05, a6
+ fmov a6, c02
+ ADD c03, c08, a6
+ fmov a6, c03
+ ADD c04, c07, a6
+ fmov a6, c04
+
+ IFMOVD tmp, alpha_r
+ MUL alpha_r, c01, t1
+ MUL alpha_r, c02, t2
+ MUL alpha_r, c03, t3
+ MUL alpha_r, c04, t4
+
+#ifndef TRMMKERNEL
+ ADD c09, t1, a6
+ fmov a6, c09
+ MUL alpha_i, c02, t1
+ ADD c10, t2, a6
+ fmov a6, c10
+ MUL alpha_i, c01, t2
+
+ ADD c11, t3, a6
+ fmov a6, c11
+ MUL alpha_i, c04, t3
+ ADD c12, t4, a6
+ fmov a6, c12
+ MUL alpha_i, c03, t4
+#else
+ ADD $f31, t1, c09
+ MUL alpha_i, c02, t1
+ ADD $f31, t2, c10
+ MUL alpha_i, c01, t2
+
+ ADD $f31, t3, c11
+ MUL alpha_i, c04, t3
+ ADD $f31, t4, c12
+ MUL alpha_i, c03, t4
+#endif
+
+ SUB c09, t1, a6
+ fmov a6, c09
+ ADD c10, t2, a6
+ fmov a6, c10
+ SUB c11, t3, a6
+ fmov a6, c11
+ ADD c12, t4, a6
+ fmov a6, c12
+
+ ST c09, 0 * SIZE(C1)
+ ST c10, 1 * SIZE(C1)
+ ST c11, 2 * SIZE(C1)
+ ST c12, 3 * SIZE(C1)
+
+ ldi C1, 4 * SIZE(C1)
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+ subl K, KK, TMP1
+#ifdef LEFT
+ subl TMP1, 2, TMP1
+#else
+ subl TMP1, 1, TMP1
+#endif
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, ZBASE_SHIFT + 0, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ addl KK, 2, KK
+#endif
+
+ bgt I, $L41
+ .align 4
+
+$L50:
+ and M, 1, I
+ ble I, $L999
+
+#if !defined(TRMMKERNEL) || \
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+
+#ifdef TRMMKERNEL
+#ifdef LEFT
+ addl KK, 1, TMP1
+#else
+ addl KK, 1, TMP1
+#endif
+#endif
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c05
+
+ LD b3, 2 * SIZE(B)
+ fclr c02
+ LD b4, 3 * SIZE(B)
+ fclr c06
+
+ ldi AO, 2 * SIZE(AO)
+ ldi BO, 2 * SIZE(B)
+
+#ifndef TRMMKERNEL
+ ldi L, -2(K)
+#else
+ ldi L, -2(TMP1)
+#endif
+ ble L, $L55
+#else
+ sll KK, ZBASE_SHIFT + 0, TMP1
+ addl AO, TMP1, AO
+ addl B, TMP1, BO
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c05
+
+ LD b3, 2 * SIZE(BO)
+ fclr c02
+ LD b4, 3 * SIZE(BO)
+ fclr c06
+
+ ldi AO, 2 * SIZE(AO)
+ ldi BO, 2 * SIZE(BO)
+
+ ldi L, -2(TMP1)
+ ble L, $L55
+#endif
+ .align 5
+
+$L52:
+ ADD1 c01, t1, a6
+ fmov a6, c01
+ unop
+ MUL a1, b1, t1
+ unop
+
+ ADD3 c02, t2, a6
+ fmov a6, c02
+ ldi AO, 4 * SIZE(AO)
+ MUL a2, b1, t2
+ LD b1, 2 * SIZE(BO)
+
+ ADD4 c05, t3, a6
+ fmov a6, c05
+ ldi L, -2(L)
+ MUL a1, b2, t3
+ LD a1, -2 * SIZE(AO)
+
+ ADD2 c06, t4, a6
+ fmov a6, c06
+ unop
+ MUL a2, b2, t4
+ LD a2, -1 * SIZE(AO)
+
+ ADD1 c01, t1, a6
+ fmov a6, c01
+ LD b2, 3 * SIZE(BO)
+ MUL a3, b3, t1
+ ldi BO, 4 * SIZE(BO)
+
+ ADD3 c02, t2, a6
+ fmov a6, c02
+ unop
+ MUL a4, b3, t2
+ LD b3, 0 * SIZE(BO)
+
+ ADD4 c05, t3, a6
+ fmov a6, c05
+ unop
+ MUL a3, b4, t3
+ LD a3, 0 * SIZE(AO)
+
+ ADD2 c06, t4, a6
+ fmov a6, c06
+ MUL a4, b4, t4
+ LD b4, 1 * SIZE(BO)
+ unop
+
+ LD a4, 1 * SIZE(AO)
+ unop
+ unop
+ bgt L, $L52
+ .align 4
+
+$L55:
+ ADD1 c01, t1, a6
+ fmov a6, c01
+ fldd alpha_r, ALPHA_R
+ FIMOVD alpha_r, tmp
+ MUL a1, b1, t1
+#ifndef TRMMKERNEL
+ blbs K, $L58
+#else
+ blbs TMP1, $L58
+#endif
+ .align 4
+
+ ADD3 c02, t2, a6
+ fmov a6, c02
+ unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD4 c05, t3, a6
+ fmov a6, c05
+ ldi BO, 2 * SIZE(BO)
+ MUL a1, b2, t3
+ LD a1, 0 * SIZE(AO)
+
+ ADD2 c06, t4, a6
+ fmov a6, c06
+ unop
+ MUL a2, b2, t4
+ LD a2, 1 * SIZE(AO)
+
+ ADD1 c01, t1, a6
+ fmov a6, c01
+ LD b2, -1 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi AO, 2 * SIZE(AO)
+ .align 4
+
+$L58:
+ ADD3 c02, t2, a6
+ fmov a6, c02
+ unop
+ MUL a2, b1, t2
+ fldd alpha_i, ALPHA_I
+
+ ADD4 c05, t3, a6
+ fmov a6, c05
+ unop
+ MUL a1, b2, t3
+#ifndef TRMMKERNEL
+ LD c03, 0 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD2 c06, t4, a6
+ fmov a6, c06
+ unop
+ MUL a2, b2, t4
+#ifndef TRMMKERNEL
+ LD c04, 1 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD1 c01, t1, a6
+ fmov a6, c01
+ ADD3 c02, t2, a6
+ fmov a6, c02
+ ADD4 c05, t3, a6
+ fmov a6, c05
+ ADD2 c06, t4, a6
+ fmov a6, c06
+
+ ADD c01, c06, a6
+ fmov a6, c01
+ ADD c02, c05, a6
+ fmov a6, c02
+
+ IFMOVD tmp, alpha_r
+ MUL alpha_r, c01, t1
+ MUL alpha_r, c02, t2
+ MUL alpha_i, c02, t3
+ MUL alpha_i, c01, t4
+
+#ifndef TRMMKERNEL
+ ADD c03, t1, a6
+ fmov a6, c03
+ ADD c04, t2, a6
+ fmov a6, c04
+#else
+ ADD $f31, t1, c03
+ ADD $f31, t2, c04
+#endif
+
+ SUB c03, t3, a6
+ fmov a6, c03
+ ADD c04, t4, a6
+ fmov a6, c04
+
+ ST c03, 0 * SIZE(C1)
+ ST c04, 1 * SIZE(C1)
+ .align 4
+
+$L999:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+ ldl $9, 80($sp)
+ clr $0
+ ldi $sp, STACKSIZE($sp)
+ ret
+ .ident VERSION
+ .end CNAME
diff --git a/kernel/sw_64/zgemm_kernel_2x2.S.bak b/kernel/sw_64/zgemm_kernel_2x2.S.bak
new file mode 100644
index 0000000..2133673
--- /dev/null
+++ b/kernel/sw_64/zgemm_kernel_2x2.S.bak
@@ -0,0 +1,1704 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#if !defined(SW2B)
+#error "Architecture is not specified."
+#endif
+
+#ifdef SW2B
+#define PREFETCHSIZE 56
+#define UNOP unop
+#endif
+
+
+ .set noat
+ .set noreorder
+ .arch ev6
+
+.text
+ .align 5
+ .globl CNAME
+ .ent CNAME
+
+#define STACKSIZE 80
+
+#define M $16
+#define N $17
+#define K $18
+#define A $21
+#define B $22
+#define C $20
+#define LDC $23
+
+#define C1 $19
+#define C2 $24
+
+#define AO $at
+#define BO $5
+#define I $6
+#define J $7
+#define L $8
+
+#define a1 $f16
+#define a2 $f17
+#define a3 $f18
+#define a4 $f19
+
+#define b1 $f20
+#define b2 $f21
+#define b3 $f22
+#define b4 $f23
+
+#define t1 $f24
+#define t2 $f25
+#define t3 $f26
+#define t4 $f27
+
+#define a5 $f28
+#define a6 $f30
+#define b5 $f29
+
+#define alpha_i $f29
+#define alpha_r $f30
+
+#define c01 $f0
+#define c02 $f1
+#define c03 $f2
+#define c04 $f3
+
+#define c05 $f4
+#define c06 $f5
+#define c07 $f6
+#define c08 $f7
+
+#define c09 $f8
+#define c10 $f9
+#define c11 $f10
+#define c12 $f11
+
+#define c13 $f12
+#define c14 $f13
+#define c15 $f14
+#define c16 $f15
+
+#define TMP1 $0
+#define TMP2 $1
+#define KK $2
+#define BB $3
+#define OFFSET $4
+
+#define ALPHA_R 64($sp)
+#define ALPHA_I 72($sp)
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#define ADD1 ADD
+#define ADD2 SUB
+#define ADD3 ADD
+#define ADD4 ADD
+#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
+#define ADD1 ADD
+#define ADD2 ADD
+#define ADD3 SUB
+#define ADD4 ADD
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
+#define ADD1 ADD
+#define ADD2 ADD
+#define ADD3 ADD
+#define ADD4 SUB
+#else
+#define ADD1 ADD
+#define ADD2 SUB
+#define ADD3 SUB
+#define ADD4 SUB
+#endif
+
+CNAME:
+ .frame $sp, STACKSIZE, $26, 0
+
+#ifdef PROFILE
+ ldgp $gp, 0($27)
+ ldi $at, _mcount
+ jsr $at, ($at), _mcount
+#endif
+
+#ifndef PROFILE
+ .prologue 0
+#else
+ .prologue 1
+#endif
+
+ ldi $sp, -STACKSIZE($sp)
+
+ ldl B, 0 + STACKSIZE($sp)
+ ldl C, 8 + STACKSIZE($sp)
+ ldl LDC, 16 + STACKSIZE($sp)
+#ifdef TRMMKERNEL
+ ldl OFFSET, 24 + STACKSIZE($sp)
+#endif
+
+ sll LDC, ZBASE_SHIFT, LDC
+
+ fstd $f2, 0($sp)
+ fstd $f3, 8($sp)
+ fstd $f4, 16($sp)
+ fstd $f5, 24($sp)
+ fstd $f6, 32($sp)
+ fstd $f7, 40($sp)
+ fstd $f8, 48($sp)
+ fstd $f9, 56($sp)
+ fstd $f19, ALPHA_R
+ fstd $f20, ALPHA_I
+
+ cmple M, 0, $0
+ cmple N, 0, $1
+ cmple K, 0, $2
+
+ or $0, $1, $0
+ or $0, $2, $0
+ bne $0, $L999
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ subl $31, OFFSET, KK
+#endif
+
+ sra N, 1, J
+ ble J, $L30
+ .align 4
+
+$L01:
+ mov C, C1
+ addl C, LDC, C2
+ mov A, AO
+ s4addl K, 0, BB
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mov OFFSET, KK
+#endif
+
+ SXADDQ BB, B, BB
+ addl C2, LDC, C
+ unop
+
+ sra M, 1, I
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+ fclr c01
+ fclr c05
+
+ ble I, $L20
+ .align 4
+
+$L11:
+#ifndef EV4
+ fillcs 0 * SIZE(BB)
+ fillcs 8 * SIZE(BB)
+ unop
+ ldi BB, 16 * SIZE(BB)
+#endif
+
+#if !defined(TRMMKERNEL) || \
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+
+#ifdef TRMMKERNEL
+#ifdef LEFT
+ addl KK, 2, TMP1
+#else
+ addl KK, 2, TMP1
+#endif
+#endif
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(B)
+ fclr c10
+ LD b2, 1 * SIZE(B)
+ fclr c14
+
+ LD b3, 2 * SIZE(B)
+ fclr c03
+ LD b4, 3 * SIZE(B)
+ fclr c07
+
+ ldi BO, 4 * SIZE(B)
+ fclr c11
+ ldi AO, 4 * SIZE(AO)
+ fclr c15
+
+ fillcs 4 * SIZE(C1)
+ fclr c04
+#ifndef TRMMKERNEL
+ ldi L, -2(K)
+#else
+ ldi L, -2(TMP1)
+#endif
+ fclr c08
+
+ fillcs 4 * SIZE(C2)
+ fclr c12
+ fclr c16
+ ble L, $L15
+#else
+ sll KK, ZBASE_SHIFT + 1, TMP1
+ addl AO, TMP1, AO
+ addl B, TMP1, BO
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(BO)
+ fclr c10
+ LD b2, 1 * SIZE(BO)
+ fclr c14
+
+ LD b3, 2 * SIZE(BO)
+ fclr c03
+ LD b4, 3 * SIZE(BO)
+ fclr c07
+
+ ldi BO, 4 * SIZE(BO)
+ fclr c11
+ ldi AO, 4 * SIZE(AO)
+ fclr c15
+
+ fillcs 4 * SIZE(C1)
+ fclr c04
+ ldi L, -2(TMP1)
+ fclr c08
+
+ fillcs 4 * SIZE(C2)
+ fclr c12
+ fclr c16
+ ble L, $L15
+#endif
+ .align 5
+
+$L12:
+/* 1 */
+ ADD1 c11, t1, c11
+#ifndef EV4
+ fillcs PREFETCHSIZE * SIZE(AO)
+#else
+ unop
+#endif
+ MUL b1, a1, t1
+#ifndef EV4
+ fillcs PREFETCHSIZE * SIZE(BO)
+#else
+ unop
+#endif
+
+ ADD3 c12, t2, c12
+ unop
+ MUL b1, a2, t2
+ unop
+
+ ADD2 c16, t3, c16
+ unop
+ MUL b2, a2, t3
+ LD a5, 0 * SIZE(AO)
+
+ ADD4 c15, t4, c15
+ unop
+ MUL b2, a1, t4
+ LD b5, 0 * SIZE(BO)
+
+/* 2 */
+ ADD1 c01, t1, c01
+ UNOP
+ MUL b1, a3, t1
+ UNOP
+
+ ADD3 c02, t2, c02
+ UNOP
+ MUL b1, a4, t2
+ UNOP
+
+ ADD2 c06, t3, c06
+ unop
+ MUL b2, a4, t3
+ unop
+
+ ADD4 c05, t4, c05
+ unop
+ MUL b4, a1, t4
+ unop
+
+/* 3 */
+ ADD1 c03, t1, c03
+ unop
+ MUL b3, a1, t1
+ unop
+
+ ADD3 c04, t2, c04
+ unop
+ MUL b3, a2, t2
+ unop
+
+ ADD2 c08, t3, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, 1 * SIZE(AO)
+
+ ADD4 c13, t4, c13
+ unop
+ MUL b2, a3, t4
+ LD b2, 1 * SIZE(BO)
+
+/* 4 */
+ ADD1 c09, t1, c09
+ unop
+ MUL b3, a3, t1
+ LD a6, 2 * SIZE(AO)
+
+ ADD3 c10, t2, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD2 c14, t3, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, 3 * SIZE(AO)
+
+ ADD4 c07, t4, c07
+ unop
+ MUL b4, a3, t4
+ LD b4, 3 * SIZE(BO)
+
+/* 5 */
+ ADD1 c11, t1, c11
+ unop
+ MUL b5, a5, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD3 c12, t2, c12
+ ldi L, -2(L)
+ MUL b5, a2, t2
+ LD b1, 4 * SIZE(BO)
+
+ ADD2 c16, t3, c16
+ unop
+ MUL b2, a2, t3
+ unop
+
+ ADD4 c15, t4, c15
+ unop
+ MUL b2, a5, t4
+ unop
+
+/* 6 */
+ ADD1 c01, t1, c01
+ unop
+ MUL b5, a6, t1
+ unop
+
+ ADD3 c02, t2, c02
+ unop
+ MUL b5, a4, t2
+ unop
+
+ ADD2 c06, t3, c06
+ unop
+ MUL b2, a4, t3
+ unop
+
+ ADD4 c05, t4, c05
+ unop
+ MUL b4, a5, t4
+ unop
+
+/* 7 */
+ ADD1 c03, t1, c03
+ ldi AO, 8 * SIZE(AO)
+ MUL b3, a5, t1
+ unop
+
+ ADD3 c04, t2, c04
+ ldi BO, 8 * SIZE(BO)
+ MUL b3, a2, t2
+ unop
+
+ ADD2 c08, t3, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, -3 * SIZE(AO)
+
+ ADD4 c13, t4, c13
+ unop
+ MUL b2, a6, t4
+ LD b2, -3 * SIZE(BO)
+
+/* 8 */
+ ADD1 c09, t1, c09
+ unop
+ MUL b3, a6, t1
+ LD a3, -2 * SIZE(AO)
+
+ ADD3 c10, t2, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, -2 * SIZE(BO)
+
+ ADD2 c14, t3, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, -1 * SIZE(AO)
+
+ ADD4 c07, t4, c07
+ MUL b4, a6, t4
+ LD b4, -1 * SIZE(BO)
+ bgt L, $L12
+ .align 4
+
+$L15:
+ ADD1 c11, t1, c11
+ fldd alpha_r, ALPHA_R
+ MUL b1, a1, t1
+#ifndef TRMMKERNEL
+ blbs K, $L18
+#else
+ blbs TMP1, $L18
+#endif
+ .align 4
+
+ ADD3 c12, t2, c12
+ MUL b1, a2, t2
+ ADD2 c16, t3, c16
+ MUL b2, a2, t3
+
+ ADD4 c15, t4, c15
+ MUL b2, a1, t4
+ ADD1 c01, t1, c01
+ MUL b1, a3, t1
+
+ ADD3 c02, t2, c02
+ unop
+ MUL b1, a4, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD2 c06, t3, c06
+ MUL b2, a4, t3
+ ADD4 c05, t4, c05
+ MUL b4, a1, t4
+
+ ADD1 c03, t1, c03
+ unop
+ MUL b3, a1, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD3 c04, t2, c04
+ unop
+ MUL b3, a2, t2
+ unop
+
+ ADD2 c08, t3, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, 1 * SIZE(AO)
+
+ ADD4 c13, t4, c13
+ unop
+ MUL b2, a3, t4
+ LD b2, 1 * SIZE(BO)
+
+ ADD1 c09, t1, c09
+ unop
+ MUL b3, a3, t1
+ ldi AO, 4 * SIZE(AO)
+
+ ADD3 c10, t2, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD2 c14, t3, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, -1 * SIZE(AO)
+
+ ADD4 c07, t4, c07
+ unop
+ MUL b4, a3, t4
+ LD a3, -2 * SIZE(AO)
+
+ ADD1 c11, t1, c11
+ LD b4, 3 * SIZE(BO)
+ MUL b1, a1, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L18:
+ ADD3 c12, t2, c12
+ unop
+ MUL b1, a2, t2
+ fldd alpha_i, ALPHA_I
+
+ ADD2 c16, t3, c16
+ unop
+ MUL b2, a2, t3
+#ifndef TRMMKERNEL
+ LD a5, 0 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD4 c15, t4, c15
+ MUL b2, a1, t4
+ ADD1 c01, t1, c01
+ MUL b1, a3, t1
+
+ ADD3 c02, t2, c02
+ unop
+ MUL b1, a4, t2
+#ifndef TRMMKERNEL
+ LD b1, 1 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD2 c06, t3, c06
+ MUL b2, a4, t3
+ ADD4 c05, t4, c05
+ MUL b4, a1, t4
+
+ ADD1 c03, t1, c03
+ unop
+ MUL b3, a1, t1
+#ifndef TRMMKERNEL
+ LD a1, 2 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD3 c04, t2, c04
+ unop
+ MUL b3, a2, t2
+ unop
+
+ ADD2 c08, t3, c08
+ unop
+ MUL b4, a2, t3
+#ifndef TRMMKERNEL
+ LD a2, 3 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD4 c13, t4, c13
+ unop
+ MUL b2, a3, t4
+#ifndef TRMMKERNEL
+ LD b2, 0 * SIZE(C2)
+#else
+ unop
+#endif
+
+ ADD1 c09, t1, c09
+ ldi I, -1(I)
+ MUL b3, a3, t1
+ unop
+
+ ADD3 c10, t2, c10
+ unop
+ MUL b3, a4, t2
+#ifndef TRMMKERNEL
+ LD b3, 1 * SIZE(C2)
+#else
+ unop
+#endif
+
+ ADD2 c14, t3, c14
+ unop
+ MUL b4, a4, t3
+#ifndef TRMMKERNEL
+ LD a4, 2 * SIZE(C2)
+#else
+ unop
+#endif
+
+ ADD4 c07, t4, c07
+ unop
+ MUL b4, a3, t4
+#ifndef TRMMKERNEL
+ LD a3, 3 * SIZE(C2)
+#else
+ unop
+#endif
+
+ ADD1 c11, t1, c11
+ ADD3 c12, t2, c12
+ ADD2 c16, t3, c16
+ ADD4 c15, t4, c15
+
+ ADD c01, c06, c01
+ ADD c02, c05, c02
+ ADD c03, c08, c03
+ ADD c04, c07, c04
+
+ ADD c09, c14, c09
+ MUL alpha_r, c01, t1
+ ADD c10, c13, c10
+ MUL alpha_r, c02, t2
+
+ ADD c11, c16, c11
+ MUL alpha_r, c03, t3
+ ADD c12, c15, c12
+ MUL alpha_r, c04, t4
+
+#ifndef TRMMKERNEL
+ ADD a5, t1, a5
+ MUL alpha_i, c02, t1
+ ADD b1, t2, b1
+ MUL alpha_i, c01, t2
+
+ ADD a1, t3, a1
+ MUL alpha_i, c04, t3
+ ADD a2, t4, a2
+ MUL alpha_i, c03, t4
+#else
+ ADD $f31, t1, a5
+ MUL alpha_i, c02, t1
+ ADD $f31, t2, b1
+ MUL alpha_i, c01, t2
+
+ ADD $f31, t3, a1
+ MUL alpha_i, c04, t3
+ ADD $f31, t4, a2
+ MUL alpha_i, c03, t4
+#endif
+
+ SUB a5, t1, a5
+ MUL alpha_r, c09, t1
+ ADD b1, t2, b1
+ MUL alpha_r, c10, t2
+
+ SUB a1, t3, a1
+ MUL alpha_r, c11, t3
+ ADD a2, t4, a2
+ MUL alpha_r, c12, t4
+
+#ifndef TRMMKERNEL
+ ADD b2, t1, b2
+ MUL alpha_i, c10, t1
+ ADD b3, t2, b3
+ MUL alpha_i, c09, t2
+
+ ADD a4, t3, a4
+ MUL alpha_i, c12, t3
+ ADD a3, t4, a3
+ MUL alpha_i, c11, t4
+#else
+ ADD $f31, t1, b2
+ MUL alpha_i, c10, t1
+ ADD $f31, t2, b3
+ MUL alpha_i, c09, t2
+
+ ADD $f31, t3, a4
+ MUL alpha_i, c12, t3
+ ADD $f31, t4, a3
+ MUL alpha_i, c11, t4
+#endif
+
+ SUB b2, t1, b2
+ ST a5, 0 * SIZE(C1)
+ fclr t1
+ unop
+
+ ADD b3, t2, b3
+ ST b1, 1 * SIZE(C1)
+ fclr t2
+ unop
+
+ SUB a4, t3, a4
+ ST a1, 2 * SIZE(C1)
+ fclr t3
+ unop
+
+ ADD a3, t4, a3
+ ST a2, 3 * SIZE(C1)
+ fclr t4
+ unop
+
+ ST b2, 0 * SIZE(C2)
+ fclr c01
+ ST b3, 1 * SIZE(C2)
+ fclr c05
+
+ ST a4, 2 * SIZE(C2)
+ ldi C1, 4 * SIZE(C1)
+ ST a3, 3 * SIZE(C2)
+ ldi C2, 4 * SIZE(C2)
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+ subl K, KK, TMP1
+#ifdef LEFT
+ subl TMP1, 2, TMP1
+#else
+ subl TMP1, 2, TMP1
+#endif
+ sll TMP1, ZBASE_SHIFT + 1, TMP1
+ addl AO, TMP1, AO
+ addl BO, TMP1, BO
+#endif
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ addl KK, 2, KK
+#endif
+ bgt I, $L11
+ .align 4
+
+$L20:
+ and M, 1, I
+ ble I, $L29
+
+#if !defined(TRMMKERNEL) || \
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+
+#ifdef TRMMKERNEL
+#ifdef LEFT
+ addl KK, 1, TMP1
+#else
+ addl KK, 2, TMP1
+#endif
+#endif
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(B)
+ fclr c10
+ LD b2, 1 * SIZE(B)
+ fclr c14
+
+ LD b3, 2 * SIZE(B)
+ ldi AO, 2 * SIZE(AO)
+ LD b4, 3 * SIZE(B)
+ ldi BO, 4 * SIZE(B)
+
+#ifndef TRMMKERNEL
+ ldi L, -2(K)
+#else
+ ldi L, -2(TMP1)
+#endif
+ ble L, $L25
+#else
+ sll KK, ZBASE_SHIFT + 0, TMP1
+ addl AO, TMP1, AO
+ sll KK, ZBASE_SHIFT + 1, TMP1
+ addl B, TMP1, BO
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(BO)
+ fclr c10
+ LD b2, 1 * SIZE(BO)
+ fclr c14
+
+ LD b3, 2 * SIZE(BO)
+ ldi AO, 2 * SIZE(AO)
+ LD b4, 3 * SIZE(BO)
+ ldi BO, 4 * SIZE(BO)
+
+ ldi L, -2(TMP1)
+ ble L, $L25
+#endif
+ .align 5
+
+$L22:
+ ADD1 c09, t1, c09
+ unop
+ MUL a1, b1, t1
+ unop
+
+ ADD3 c10, t2, c10
+ unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD4 c13, t3, c13
+ unop
+ MUL a1, b2, t3
+ ldi BO, 8 * SIZE(BO)
+
+ ADD2 c14, t4, c14
+ unop
+ MUL a2, b2, t4
+ LD b2, -7 * SIZE(BO)
+
+ ADD1 c01, t1, c01
+ unop
+ MUL a1, b3, t1
+ unop
+
+ ADD3 c02, t2, c02
+ unop
+ MUL a2, b3, t2
+ LD b3, -6 * SIZE(BO)
+
+ ADD4 c05, t3, c05
+ unop
+ MUL a1, b4, t3
+ LD a1, 2 * SIZE(AO)
+
+ ADD2 c06, t4, c06
+ MUL a2, b4, t4
+ LD b5, -5 * SIZE(BO)
+
+ ADD1 c09, t1, c09
+ unop
+ MUL a3, b1, t1
+ LD a2, 3 * SIZE(AO)
+
+ ADD3 c10, t2, c10
+ unop
+ MUL a4, b1, t2
+ LD b1, -4 * SIZE(BO)
+
+ ADD4 c13, t3, c13
+ unop
+ MUL a3, b2, t3
+ ldi AO, 4 * SIZE(AO)
+
+ ADD2 c14, t4, c14
+ MUL a4, b2, t4
+ LD b2, -3 * SIZE(BO)
+
+ ADD1 c01, t1, c01
+ ldi L, -2(L)
+ MUL a3, b3, t1
+ LD b4, -1 * SIZE(BO)
+
+ ADD3 c02, t2, c02
+ unop
+ MUL a4, b3, t2
+ LD b3, -2 * SIZE(BO)
+
+ ADD4 c05, t3, c05
+ unop
+ MUL a3, b5, t3
+ LD a3, 0 * SIZE(AO)
+
+ ADD2 c06, t4, c06
+ MUL a4, b5, t4
+ LD a4, 1 * SIZE(AO)
+ bgt L, $L22
+ .align 4
+
+$L25:
+ ADD1 c09, t1, c09
+ fldd alpha_r, ALPHA_R
+ MUL a1, b1, t1
+#ifndef TRMMKERNEL
+ blbs K, $L28
+#else
+ blbs TMP1, $L28
+#endif
+ .align 4
+
+ ADD3 c10, t2, c10
+ unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD4 c13, t3, c13
+ unop
+ MUL a1, b2, t3
+ unop
+
+ ADD2 c14, t4, c14
+ unop
+ MUL a2, b2, t4
+ LD b2, 1 * SIZE(BO)
+
+ ADD1 c01, t1, c01
+ unop
+ MUL a1, b3, t1
+ ldi AO, 2 * SIZE(AO)
+
+ ADD3 c02, t2, c02
+ unop
+ MUL a2, b3, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD4 c05, t3, c05
+ unop
+ MUL a1, b4, t3
+ LD a1, -2 * SIZE(AO)
+
+ ADD2 c06, t4, c06
+ unop
+ MUL a2, b4, t4
+ LD a2, -1 * SIZE(AO)
+
+ ADD1 c09, t1, c09
+ LD b4, 3 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L28:
+ ADD3 c10, t2, c10
+ unop
+ MUL a2, b1, t2
+ fldd alpha_i, ALPHA_I
+
+ ADD4 c13, t3, c13
+ unop
+ MUL a1, b2, t3
+#ifndef TRMMKERNEL
+ LD c03, 0 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD2 c14, t4, c14
+ unop
+ MUL a2, b2, t4
+#ifndef TRMMKERNEL
+ LD c04, 1 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD1 c01, t1, c01
+ unop
+ MUL a1, b3, t1
+#ifndef TRMMKERNEL
+ LD c11, 0 * SIZE(C2)
+#else
+ unop
+#endif
+
+ ADD3 c02, t2, c02
+ unop
+ MUL a2, b3, t2
+#ifndef TRMMKERNEL
+ LD c12, 1 * SIZE(C2)
+#else
+ unop
+#endif
+
+ ADD4 c05, t3, c05
+ MUL a1, b4, t3
+ ADD2 c06, t4, c06
+ MUL a2, b4, t4
+
+ ADD1 c09, t1, c09
+ ADD3 c10, t2, c10
+ ADD4 c13, t3, c13
+ ADD2 c14, t4, c14
+
+ ADD c01, c06, c01
+ ADD c02, c05, c02
+ ADD c09, c14, c09
+ ADD c10, c13, c10
+
+ MUL alpha_r, c01, t1
+ MUL alpha_r, c02, t2
+ MUL alpha_r, c09, t3
+ MUL alpha_r, c10, t4
+
+#ifndef TRMMKERNEL
+ ADD c03, t1, c03
+ MUL alpha_i, c02, t1
+ ADD c04, t2, c04
+ MUL alpha_i, c01, t2
+
+ ADD c11, t3, c11
+ MUL alpha_i, c10, t3
+ ADD c12, t4, c12
+ MUL alpha_i, c09, t4
+#else
+ ADD $f31, t1, c03
+ MUL alpha_i, c02, t1
+ ADD $f31, t2, c04
+ MUL alpha_i, c01, t2
+
+ ADD $f31, t3, c11
+ MUL alpha_i, c10, t3
+ ADD $f31, t4, c12
+ MUL alpha_i, c09, t4
+#endif
+
+ SUB c03, t1, c03
+ ADD c04, t2, c04
+ SUB c11, t3, c11
+ ADD c12, t4, c12
+
+ ST c03, 0 * SIZE(C1)
+ ST c04, 1 * SIZE(C1)
+ ST c11, 0 * SIZE(C2)
+ ST c12, 1 * SIZE(C2)
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+ subl K, KK, TMP1
+#ifdef LEFT
+ subl TMP1, 1, TMP1
+#else
+ subl TMP1, 2, TMP1
+#endif
+ sll TMP1, ZBASE_SHIFT + 0, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ addl KK, 1, KK
+#endif
+ .align 4
+
+$L29:
+ mov BO, B
+ ldi J, -1(J)
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addl KK, 2, KK
+#else
+ unop
+#endif
+ bgt J, $L01
+ .align 4
+
+$L30:
+ and N, 1, J
+ ble J, $L999
+
+ mov C, C1
+ mov A, AO
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mov OFFSET, KK
+#endif
+
+ sra M, 1, I
+ ble I, $L50
+ .align 4
+
+$L41:
+#if !defined(TRMMKERNEL) || \
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+
+#ifdef TRMMKERNEL
+#ifdef LEFT
+ addl KK, 2, TMP1
+#else
+ addl KK, 1, TMP1
+#endif
+#endif
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c05
+ LD b3, 2 * SIZE(B)
+ fclr c02
+ LD b4, 3 * SIZE(B)
+ fclr c06
+
+ ldi BO, 2 * SIZE(B)
+ fclr c03
+ ldi AO, 4 * SIZE(AO)
+ fclr c07
+
+#ifndef TRMMKERNEL
+ ldi L, -2(K)
+#else
+ ldi L, -2(TMP1)
+#endif
+ fclr c04
+ fclr c08
+ ble L, $L45
+#else
+ sll KK, ZBASE_SHIFT + 1, TMP1
+ addl AO, TMP1, AO
+ sll KK, ZBASE_SHIFT + 0, TMP1
+ addl B, TMP1, BO
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c05
+ LD b3, 2 * SIZE(BO)
+ fclr c02
+ LD b4, 3 * SIZE(BO)
+ fclr c06
+
+ ldi BO, 2 * SIZE(BO)
+ fclr c03
+ ldi AO, 4 * SIZE(AO)
+ fclr c07
+
+ ldi L, -2(TMP1)
+ fclr c04
+ fclr c08
+ ble L, $L45
+#endif
+ .align 5
+
+$L42:
+ ADD4 c05, t1, c05
+ unop
+ MUL a1, b1, t1
+ unop
+
+ ADD2 c06, t2, c06
+ ldi L, -2(L)
+ MUL a2, b1, t2
+ unop
+
+ ADD4 c07, t3, c07
+ unop
+ MUL a3, b1, t3
+ unop
+
+ ADD2 c08, t4, c08
+ unop
+ MUL a4, b1, t4
+ LD b1, 2 * SIZE(BO)
+
+ ADD1 c01, t1, c01
+ unop
+ MUL a1, b2, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD3 c02, t2, c02
+ ldi BO, 4 * SIZE(BO)
+ MUL a2, b2, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD1 c03, t3, c03
+ unop
+ MUL a3, b2, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD3 c04, t4, c04
+ unop
+ MUL a4, b2, t4
+ LD a5, 3 * SIZE(AO)
+
+ ADD4 c05, t1, c05
+ unop
+ MUL a1, b3, t1
+ LD b2, -1 * SIZE(BO)
+
+ ADD2 c06, t2, c06
+ unop
+ MUL a2, b3, t2
+ unop
+
+ ADD4 c07, t3, c07
+ unop
+ MUL a3, b3, t3
+ ldi AO, 8 * SIZE(AO)
+
+ ADD2 c08, t4, c08
+ unop
+ MUL a5, b3, t4
+ LD b3, 0 * SIZE(BO)
+
+ ADD1 c01, t1, c01
+ unop
+ MUL a1, b4, t1
+ LD a1, -4 * SIZE(AO)
+
+ ADD3 c02, t2, c02
+ unop
+ MUL a2, b4, t2
+ LD a2, -3 * SIZE(AO)
+
+ ADD1 c03, t3, c03
+ LD a4, -1 * SIZE(AO)
+ MUL a3, b4, t3
+ LD a3, -2 * SIZE(AO)
+
+ ADD3 c04, t4, c04
+ MUL a5, b4, t4
+ LD b4, 1 * SIZE(BO)
+ bgt L, $L42
+ .align 4
+
+$L45:
+ ADD4 c05, t1, c05
+ fldd alpha_r, ALPHA_R
+ MUL b1, a1, t1
+#ifndef TRMMKERNEL
+ blbs K, $L48
+#else
+ blbs TMP1, $L48
+#endif
+ .align 4
+
+ ADD2 c06, t2, c06
+ MUL a2, b1, t2
+ ADD4 c07, t3, c07
+ MUL a3, b1, t3
+
+ ADD2 c08, t4, c08
+ unop
+ MUL a4, b1, t4
+ LD b1, 0 * SIZE(BO)
+
+ ADD1 c01, t1, c01
+ unop
+ MUL a1, b2, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD3 c02, t2, c02
+ unop
+ MUL a2, b2, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD1 c03, t3, c03
+ unop
+ MUL a3, b2, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD3 c04, t4, c04
+ MUL a4, b2, t4
+ LD a4, 3 * SIZE(AO)
+ ldi AO, 4 * SIZE(AO)
+
+ ADD4 c05, t1, c05
+ LD b2, 1 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi BO, 2 * SIZE(BO)
+ .align 4
+
+$L48:
+ ADD2 c06, t2, c06
+ unop
+ MUL a2, b1, t2
+ fldd alpha_i, ALPHA_I
+
+ ADD4 c07, t3, c07
+ ldi I, -1(I)
+ MUL a3, b1, t3
+#ifndef TRMMKERNEL
+ LD c09, 0 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD2 c08, t4, c08
+ unop
+ MUL a4, b1, t4
+#ifndef TRMMKERNEL
+ LD c10, 1 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD1 c01, t1, c01
+ unop
+ MUL a1, b2, t1
+#ifndef TRMMKERNEL
+ LD c11, 2 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD3 c02, t2, c02
+ unop
+ MUL a2, b2, t2
+#ifndef TRMMKERNEL
+ LD c12, 3 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD1 c03, t3, c03
+ MUL a3, b2, t3
+ ADD3 c04, t4, c04
+ MUL a4, b2, t4
+
+ ADD4 c05, t1, c05
+ ADD2 c06, t2, c06
+ ADD4 c07, t3, c07
+ ADD2 c08, t4, c08
+
+ ADD c01, c06, c01
+ ADD c02, c05, c02
+ ADD c03, c08, c03
+ ADD c04, c07, c04
+
+ MUL alpha_r, c01, t1
+ MUL alpha_r, c02, t2
+ MUL alpha_r, c03, t3
+ MUL alpha_r, c04, t4
+
+#ifndef TRMMKERNEL
+ ADD c09, t1, c09
+ MUL alpha_i, c02, t1
+ ADD c10, t2, c10
+ MUL alpha_i, c01, t2
+
+ ADD c11, t3, c11
+ MUL alpha_i, c04, t3
+ ADD c12, t4, c12
+ MUL alpha_i, c03, t4
+#else
+ ADD $f31, t1, c09
+ MUL alpha_i, c02, t1
+ ADD $f31, t2, c10
+ MUL alpha_i, c01, t2
+
+ ADD $f31, t3, c11
+ MUL alpha_i, c04, t3
+ ADD $f31, t4, c12
+ MUL alpha_i, c03, t4
+#endif
+
+ SUB c09, t1, c09
+ ADD c10, t2, c10
+ SUB c11, t3, c11
+ ADD c12, t4, c12
+
+ ST c09, 0 * SIZE(C1)
+ ST c10, 1 * SIZE(C1)
+ ST c11, 2 * SIZE(C1)
+ ST c12, 3 * SIZE(C1)
+
+ ldi C1, 4 * SIZE(C1)
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+ subl K, KK, TMP1
+#ifdef LEFT
+ subl TMP1, 2, TMP1
+#else
+ subl TMP1, 1, TMP1
+#endif
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, ZBASE_SHIFT + 0, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ addl KK, 2, KK
+#endif
+
+ bgt I, $L41
+ .align 4
+
+$L50:
+ and M, 1, I
+ ble I, $L999
+
+#if !defined(TRMMKERNEL) || \
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+
+#ifdef TRMMKERNEL
+#ifdef LEFT
+ addl KK, 1, TMP1
+#else
+ addl KK, 1, TMP1
+#endif
+#endif
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c05
+
+ LD b3, 2 * SIZE(B)
+ fclr c02
+ LD b4, 3 * SIZE(B)
+ fclr c06
+
+ ldi AO, 2 * SIZE(AO)
+ ldi BO, 2 * SIZE(B)
+
+#ifndef TRMMKERNEL
+ ldi L, -2(K)
+#else
+ ldi L, -2(TMP1)
+#endif
+ ble L, $L55
+#else
+ sll KK, ZBASE_SHIFT + 0, TMP1
+ addl AO, TMP1, AO
+ addl B, TMP1, BO
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c05
+
+ LD b3, 2 * SIZE(BO)
+ fclr c02
+ LD b4, 3 * SIZE(BO)
+ fclr c06
+
+ ldi AO, 2 * SIZE(AO)
+ ldi BO, 2 * SIZE(BO)
+
+ ldi L, -2(TMP1)
+ ble L, $L55
+#endif
+ .align 5
+
+$L52:
+ ADD1 c01, t1, c01
+ unop
+ MUL a1, b1, t1
+ unop
+
+ ADD3 c02, t2, c02
+ ldi AO, 4 * SIZE(AO)
+ MUL a2, b1, t2
+ LD b1, 2 * SIZE(BO)
+
+ ADD4 c05, t3, c05
+ ldi L, -2(L)
+ MUL a1, b2, t3
+ LD a1, -2 * SIZE(AO)
+
+ ADD2 c06, t4, c06
+ unop
+ MUL a2, b2, t4
+ LD a2, -1 * SIZE(AO)
+
+ ADD1 c01, t1, c01
+ LD b2, 3 * SIZE(BO)
+ MUL a3, b3, t1
+ ldi BO, 4 * SIZE(BO)
+
+ ADD3 c02, t2, c02
+ unop
+ MUL a4, b3, t2
+ LD b3, 0 * SIZE(BO)
+
+ ADD4 c05, t3, c05
+ unop
+ MUL a3, b4, t3
+ LD a3, 0 * SIZE(AO)
+
+ ADD2 c06, t4, c06
+ MUL a4, b4, t4
+ LD b4, 1 * SIZE(BO)
+ unop
+
+ LD a4, 1 * SIZE(AO)
+ unop
+ unop
+ bgt L, $L52
+ .align 4
+
+$L55:
+ ADD1 c01, t1, c01
+ fldd alpha_r, ALPHA_R
+ MUL a1, b1, t1
+#ifndef TRMMKERNEL
+ blbs K, $L58
+#else
+ blbs TMP1, $L58
+#endif
+ .align 4
+
+ ADD3 c02, t2, c02
+ unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD4 c05, t3, c05
+ ldi BO, 2 * SIZE(BO)
+ MUL a1, b2, t3
+ LD a1, 0 * SIZE(AO)
+
+ ADD2 c06, t4, c06
+ unop
+ MUL a2, b2, t4
+ LD a2, 1 * SIZE(AO)
+
+ ADD1 c01, t1, c01
+ LD b2, -1 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi AO, 2 * SIZE(AO)
+ .align 4
+
+$L58:
+ ADD3 c02, t2, c02
+ unop
+ MUL a2, b1, t2
+ fldd alpha_i, ALPHA_I
+
+ ADD4 c05, t3, c05
+ unop
+ MUL a1, b2, t3
+#ifndef TRMMKERNEL
+ LD c03, 0 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD2 c06, t4, c06
+ unop
+ MUL a2, b2, t4
+#ifndef TRMMKERNEL
+ LD c04, 1 * SIZE(C1)
+#else
+ unop
+#endif
+
+ ADD1 c01, t1, c01
+ ADD3 c02, t2, c02
+ ADD4 c05, t3, c05
+ ADD2 c06, t4, c06
+
+ ADD c01, c06, c01
+ ADD c02, c05, c02
+
+ MUL alpha_r, c01, t1
+ MUL alpha_r, c02, t2
+ MUL alpha_i, c02, t3
+ MUL alpha_i, c01, t4
+
+#ifndef TRMMKERNEL
+ ADD c03, t1, c03
+ ADD c04, t2, c04
+#else
+ ADD $f31, t1, c03
+ ADD $f31, t2, c04
+#endif
+
+ SUB c03, t3, c03
+ ADD c04, t4, c04
+
+ ST c03, 0 * SIZE(C1)
+ ST c04, 1 * SIZE(C1)
+ .align 4
+
+$L999:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+ clr $0
+ ldi $sp, STACKSIZE($sp)
+ ret
+ .ident VERSION
+ .end CNAME
diff --git a/kernel/sw_64/zgemm_kernel_simd_8x2.S b/kernel/sw_64/zgemm_kernel_simd_8x2.S
new file mode 100644
index 0000000..f6a36fb
--- /dev/null
+++ b/kernel/sw_64/zgemm_kernel_simd_8x2.S
@@ -0,0 +1,3189 @@
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#if !defined(SW2B)
+#error "Architecture is not specified."
+#endif
+
+#define STACKSIZE 128
+
+#define M $16
+#define N $17
+#define K $18
+#define A $21
+#define B $22
+#define C $20
+#define LDC $23
+
+#define C1 $19
+#define C2 $24
+
+#define PREA $10
+#define PREB $11
+
+#define AO $9
+#define BO $5
+#define I $6
+#define J $7
+#define L $8
+
+#define a1 $f16
+#define a2 $f17
+#define a3 $f18
+#define a4 $f19
+
+#define b1 $f20
+#define b2 $f21
+#define b3 $f22
+#define b4 $f23
+
+#define a5 $f16
+#define a6 $f24
+#define a7 $f25
+#define a8 $f26
+
+#define b5 $f27
+#define b6 $f28
+#define b7 $f29
+#define b8 $f30
+
+#define alpha_i $f29
+#define alpha_r $f30
+
+#define c01 $f0
+#define c02 $f1
+#define c03 $f2
+#define c04 $f3
+
+#define c05 $f4
+#define c06 $f5
+#define c07 $f6
+#define c08 $f7
+
+#define c09 $f8
+#define c10 $f9
+#define c11 $f10
+#define c12 $f11
+
+#define c13 $f12
+#define c14 $f13
+#define c15 $f14
+#define c16 $f15
+
+#define TMP1 $0
+#define TEMP $1
+#define KK $2
+#define BB $3
+#define OFFSET $4
+
+#define ALPHA_R 64($sp)
+#define ALPHA_I 72($sp)
+
+/*
+ *===================
+ * (a+bi)*(c+di)
+ * ADD1 ac '+' bd
+ * ADD2 ad '+' bc
+ * FMAD5 a*alpha_r + real part
+ * FMAD6 a*alpha_i + imaginary part
+ * FMAD7 b*alpha_r + imaginary part
+ * FMAD8 b*alpha_i + real part
+ *
+ *===================
+ */
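+
+/*
+ *===================
+ * Reference expansion for the NN/NT/TN/TT case below (no conjugation),
+ * reading MAD x,y,z,w as w = z + x*y and NMAD x,y,z,w as w = z - x*y
+ * (assumed operand order: x,y sources, z addend, w destination).  The
+ * kernel accumulates the four cross products sum(ac), sum(bd), sum(ad),
+ * sum(bc) with VMAD and only combines them at write-back:
+ *
+ *   acc_re = sum(ac) - sum(bd)                 ADD1 = SUB
+ *   acc_im = sum(bc) + sum(ad)                 ADD2 = ADD
+ *   C_re  += alpha_r*acc_re - alpha_i*acc_im   FMAD5, then FMAD8 = NMAD
+ *   C_im  += alpha_r*acc_im + alpha_i*acc_re   FMAD7, then FMAD6 = MAD
+ *===================
+ */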
+
+/*
+ *===================
+ * (a+bi) * (c+di)
+ * (a+bi) * (alpha_r+alpha_i)
+ *===================
+ */
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#define ADD1 SUB
+#define ADD2 ADD
+#define FMAD5 MAD
+#define FMAD6 MAD
+#define FMAD7 MAD
+#define FMAD8 NMAD
+#endif
+
+/*
+ *===================
+ * (a+bi) * (c-di)
+ * (a+bi) * (alpha_r+alpha_i)
+ *===================
+ */
+
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+#define ADD1 ADD
+#define ADD2 SUB
+#define FMAD5 MAD
+#define FMAD6 MAD
+#define FMAD7 MAD
+#define FMAD8 NMAD
+#endif
+
+/*
+ *===================
+ * (a-bi) * (c+di)
+ * (a-bi) * (alpha_r+alpha_i)
+ *===================
+ */
+
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+#define ADD1 ADD
+#define ADD2 SUB
+#define FMAD5 MAD
+#define FMAD6 MAD
+#define FMAD7 NMAD
+#define FMAD8 MAD
+#endif
+
+/*
+ *===================
+ * (a-bi) * (c-di)
+ * (a-bi) * (alpha_r+alpha_i)
+ *===================
+ */
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+#define ADD1 SUB
+#define ADD2 ADD
+#define FMAD5 MAD
+#define FMAD6 MAD
+#define FMAD7 NMAD
+#define FMAD8 MAD
+#endif
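+
+/*
+ *===================
+ * Summary of the four cases above (a+bi from A, c+di from B, acc formed by
+ * ADD1/ADD2 from the VMAD partial sums, same MAD/NMAD reading as above):
+ *   NN/NT/TN/TT:  acc = (a+bi)*(c+di),  C += alpha*acc
+ *   NR/NC/TR/TC:  acc = (a+bi)*(c-di),  C += alpha*acc
+ *   RN/RT/CN/CT:  acc = (a+bi)*(c-di),  C += alpha*conj(acc) = alpha*(a-bi)*(c+di)
+ *   RR/RC/CR/CC:  acc = (a+bi)*(c+di),  C += alpha*conj(acc) = alpha*(a-bi)*(c-di)
+ *===================
+ */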
+
+
+
+ PROLOGUE
+ PROFCODE
+
+ .frame $30, STACKSIZE, $26, 0
+ ldi $sp, -STACKSIZE($sp)
+
+
+ ldl B, 0 + STACKSIZE($sp)
+ ldl C, 8 + STACKSIZE($sp)
+ ldl LDC, 16 + STACKSIZE($sp)
+#ifdef TRMMKERNEL
+ ldl OFFSET, 24 + STACKSIZE($sp)
+#endif
+
+	sll	LDC, ZBASE_SHIFT, LDC	# LDC in bytes (complex elements)
+
+ fstd $f2, 0($sp)
+ fstd $f3, 8($sp)
+ fstd $f4, 16($sp)
+ fstd $f5, 24($sp)
+ fstd $f6, 32($sp)
+ fstd $f7, 40($sp)
+ fstd $f8, 48($sp)
+ fstd $f9, 56($sp)
+ ST $f19, ALPHA_R
+ ST $f20, ALPHA_I
+
+ stl $9, 80($sp) # Integer Saved Register
+ stl $10,88($sp)
+ stl $11,96($sp)
+ stl $12,104($sp)
+ stl $13,112($sp)
+ stl $14,120($sp)
+
+
+ cmple M, 0, $0
+ cmple N, 0, $1
+ cmple K, 0, $2
+
+ or $0, $1, $0
+ or $0, $2, $0
+ bne $0, $L999
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ subl $31, OFFSET, KK
+#endif
+
+ sra N, 1, J # J=N/2
+ ble J, $L50
+ .align 4
+
+$L01:
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mov OFFSET, KK
+#endif
+
+ sra M, 3, I # I=M/8
+ sll K, ZBASE_SHIFT, PREB
+
+ sll K, 2+ZBASE_SHIFT, PREA
+ mov C, C1
+
+ addl C, LDC, C2
+ mov A, AO # Reset A
+
+ addl PREB, B, PREB
+ addl C2, LDC, C # Change C to next panel
+
+ addl PREA, A, PREA
+ beq I, $L20 # GEMM_MR=8
+
+$L11:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov B, BO # LL && RU reset B
+ nop
+#else
+ sll KK, 3 + ZBASE_SHIFT, L # KK*8mr
+ sll KK, 1 + ZBASE_SHIFT, TEMP # KK*2nr
+
+	addl	AO, L, AO	# point AO at the data part
+	addl	B, TEMP, BO	# point BO at the data part
+#endif
+
+ vcpys $f31,$f31,c01 # Clear result regs
+ fillcs 0(C1)
+ fillcs 4*SIZE(C1)
+
+ vcpys $f31,$f31,c02
+ fillcs 8*SIZE(C1)
+ fillcs 12*SIZE(C1)
+
+ vcpys $f31,$f31,c03
+ fillcs 0(C2)
+ fillcs 4*SIZE(C2)
+
+ vcpys $f31,$f31,c04
+ fillcs 8*SIZE(C2)
+ fillcs 12*SIZE(C2)
+
+ vcpys $f31,$f31,c05
+ vcpys $f31,$f31,c06
+ vcpys $f31,$f31,c07
+ vcpys $f31,$f31,c08
+
+ vcpys $f31,$f31,c09
+ LDDE b1, 0 * SIZE(BO) # B1R
+ LDDE b2, 1 * SIZE(BO) # B1I
+
+ vcpys $f31,$f31,c10
+ VLD a1, 0 * SIZE(AO) # A1, A2
+ VLD a2, 4 * SIZE(AO) # A3, A4
+
+ vcpys $f31,$f31,c11
+ LDDE b3, 2 * SIZE(BO) # B2R
+ LDDE b4, 3 * SIZE(BO) # B2I
+
+ vcpys $f31,$f31,c12
+ VLD a3, 8 * SIZE(AO) # A5, A6
+ VLD a4,12 * SIZE(AO) # A7, A8
+
+ vcpys $f31,$f31,c13
+ vcpys $f31,$f31,c14
+ vcpys $f31,$f31,c15
+ vcpys $f31,$f31,c16
+
+
+
+
+#if (defined(LEFT) && !defined(TRANSA)) \
+ ||(!defined(LEFT) && defined(TRANSA))
+	subl	K, KK, TEMP	# TEMP is the length of the data part
+#elif defined(LEFT)
+ addl KK, 8, TEMP # mr=8, careful about complex
+#else
+ addl KK, 2, TEMP # nr=2
+#endif
+ sra TEMP, 1, L # L=TEMP/2
+ ble L, $L15
+
+#else
+ vcpys $f31,$f31,c01 # Clear result regs
+ mov B, BO # Set B, (block A x panel Bj)
+ sra K, 1, L # Unroll K as 2
+
+ vcpys $f31,$f31,c02
+ fillcs 0(C1)
+ fillcs 4*SIZE(C1)
+
+ vcpys $f31,$f31,c03
+ fillcs 8*SIZE(C1)
+ fillcs 12*SIZE(C1)
+
+ vcpys $f31,$f31,c04
+ fillcs 0(C2)
+ fillcs 4*SIZE(C2)
+
+ vcpys $f31,$f31,c05
+ fillcs 8*SIZE(C2)
+ fillcs 12*SIZE(C2)
+
+ vcpys $f31,$f31,c06
+ vcpys $f31,$f31,c07
+ vcpys $f31,$f31,c08
+ vcpys $f31,$f31,c09
+
+ vcpys $f31,$f31,c10
+ LDDE b1, 0 * SIZE(BO) # B1R
+ LDDE b2, 1 * SIZE(BO) # B1I
+
+ vcpys $f31,$f31,c11
+ VLD a1, 0 * SIZE(AO) # A1, A2
+ VLD a2, 4 * SIZE(AO) # A3, A4
+
+ vcpys $f31,$f31,c12
+ LDDE b3, 2 * SIZE(BO) # B2R
+ LDDE b4, 3 * SIZE(BO) # B2I
+
+ vcpys $f31,$f31,c13
+ VLD a3, 8 * SIZE(AO) # A5, A6
+ VLD a4,12 * SIZE(AO) # A7, A8
+
+ vcpys $f31,$f31,c14
+ vcpys $f31,$f31,c15
+
+ vcpys $f31,$f31,c16
+ ble L, $L15
+#endif
+
+ .align 4
+$L12:
+ addl AO, 16*SIZE, AO # AO+=8mr*1kr*2px*SIZE
+ VMAD a1,b1,c01,c01 # C11(ac,bc), C21(ac,bc)
+ LDDE b5, 4 * SIZE(BO) # next B1R
+
+ VMAD a1,b2,c02,c02 # C11(ad,bd), C21(ad,bd)
+ LDDE b6, 5 * SIZE(BO) # next B1I
+
+ VMAD a2,b1,c05,c05 # C31, C41
+ VLD a8,12 * SIZE(AO) # next A7, A8
+
+ VMAD a2,b2,c06,c06 # C31, C41
+ VLD a7, 8 * SIZE(AO) # next A5, A6
+
+ VMAD a1,b3,c03,c03 # C12(ac,bc), C22(ac,bc)
+ VMAD a1,b4,c04,c04 # C12(ad,bd), C22(ad,bd)
+ VMAD a3,b1,c09,c09 # C51, C61
+ VMAD a3,b2,c10,c10 # C51, C61
+
+
+ VMAD a2,b3,c07,c07 # C32, C42
+ LDDE b7, 6 * SIZE(BO) # next B2R
+
+ VMAD a2,b4,c08,c08 # C32, C42
+ LDDE b8, 7 * SIZE(BO) # next B2I
+
+ VMAD a4,b1,c13,c13 # C71, C81
+ VLD a5, 0 * SIZE(AO) # next A1, A2, a5==a0
+
+ VMAD a4,b2,c14,c14 # C71, C81
+ VLD a6, 4 * SIZE(AO) # next A3, A4
+ addl BO, 8*SIZE, BO # BO+=2nr*2kr*2cpx*SIZE
+
+
+ VMAD a3,b3,c11,c11 # C52, C62
+ fillcs 0(PREB)
+
+ VMAD a3,b4,c12,c12 # C52, C62
+ fillcs 0(PREA)
+
+ VMAD a4,b3,c15,c15 # C72, C82
+ fillcs 8*SIZE(PREA)
+
+ VMAD a4,b4,c16,c16 # C72, C82
+ subl L, 1, L #
+ addl AO, 16*SIZE, AO # AO+=8mr*1kr*2px*SIZE
+
+ VMAD a8,b5,c13,c13
+ LDDE b1, 0 * SIZE(BO)
+
+ VMAD a8,b6,c14,c14
+ LDDE b2, 1 * SIZE(BO)
+
+ VMAD a7,b5,c09,c09
+ addl PREA, 16*SIZE, PREA
+ VLD a4,12 * SIZE(AO)
+
+ VMAD a7,b6,c10,c10
+ VLD a3, 8 * SIZE(AO)
+
+ VMAD a5,b5,c01,c01
+ VMAD a5,b6,c02,c02
+ VMAD a5,b7,c03,c03
+ VMAD a5,b8,c04,c04
+
+ VMAD a8,b7,c15,c15
+ LDDE b3, 2 * SIZE(BO)
+
+ VMAD a8,b8,c16,c16
+ LDDE b4, 3 * SIZE(BO)
+
+ VMAD a6,b5,c05,c05
+ VLD a1, 0 * SIZE(AO)
+
+ VMAD a6,b6,c06,c06
+ VLD a2, 4 * SIZE(AO)
+
+
+ VMAD a7,b7,c11,c11
+ fillcs 4*SIZE(PREB)
+
+ VMAD a7,b8,c12,c12
+ fillcs 0(PREA)
+
+ VMAD a6,b7,c07,c07
+ addl PREB, 8*SIZE, PREB
+ fillcs 8*SIZE(PREA)
+
+ VMAD a6,b8,c08,c08
+ addl PREA, 16*SIZE, PREA
+ bne L, $L12 # continue K
+
+$L15:
+ LD alpha_r, ALPHA_R # $f30==b8
+#ifndef TRMMKERNEL
+ blbc K, $L18 # if(K&1)
+#else
+ blbc TEMP, $L18
+#endif
+
+$L16:
+ VMAD a1,b1,c01,c01 # C11R C21R
+ addl AO, 16*SIZE, AO # AO+=8mr*1kr*2px*SIZE
+
+ VMAD a1,b2,c02,c02 # C11I C21I
+ addl BO, 4*SIZE, BO
+
+ VMAD a1,b3,c03,c03 # C12R c22R
+ VMAD a1,b4,c04,c04 # C12I C22I
+
+ VMAD a2,b1,c05,c05 # C31R C41R
+ VMAD a2,b2,c06,c06 # C31I C41I
+ VMAD a2,b3,c07,c07 # C32R C42R
+ VMAD a2,b4,c08,c08 # C32I C42I
+
+ VMAD a3,b1,c09,c09 # C51R C61R
+ VMAD a3,b2,c10,c10 # C51I C61I
+ VMAD a3,b3,c11,c11 # C52R C62R
+ VMAD a3,b4,c12,c12 # C52I C62I
+
+ VMAD a4,b1,c13,c13 # C71R C81R
+ VMAD a4,b2,c14,c14 # C71I C81I
+ VMAD a4,b3,c15,c15 # C72R C82R
+ VMAD a4,b4,c16,c16 # C72I C82I
+
+$L18: # Write back
+ LD alpha_i, ALPHA_I # $f29==b7
+#ifndef TRMMKERNEL
+ vextf c01, 0, a1 # a1=C11R_ac
+ vextf c01, 1, a2 # a2=C11I_bc
+ vextf c01, 2, a3 # a3=C21R_ac
+ vextf c01, 3, a4 # a4=C21I_bc
+
+ vextf c02, 0, b1 # b1=C11I_ad
+ vextf c02, 1, b2 # b2=C11R_bd
+ vextf c02, 2, b3 # b3=C21I_ad
+ vextf c02, 3, b4 # b4=C21R_bd
+
+ ADD1 a1, b2, b5 # ac '+' bd
+ ADD1 a3, b4, a6
+ ADD2 a2, b1, a7 # bc '+' ad
+ ADD2 a4, b3, a8
+
+ LD a1, 0 * SIZE(C1)
+ LD a2, 1 * SIZE(C1)
+ LD a3, 2 * SIZE(C1)
+ LD a4, 3 * SIZE(C1)
+
+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1
+ FMAD5 a6, alpha_r, a3, a3
+ FMAD7 a7, alpha_r, a2, a2
+ FMAD7 a8, alpha_r, a4, a4
+
+ FMAD8 a7, alpha_i, a1, b4
+ FMAD8 a8, alpha_i, a3, b6
+ FMAD6 b5, alpha_i, a2, c01
+ FMAD6 a6, alpha_i, a4, c02
+
+ ST b4, 0 * SIZE(C1)
+ ST c01, 1 * SIZE(C1)
+ ST b6, 2 * SIZE(C1)
+ ST c02, 3 * SIZE(C1)
+
+ vextf c05, 0, a1 # a1=C11R_ac
+ vextf c05, 1, a2 # a2=C11I_bc
+ vextf c05, 2, a3 # a3=C21R_ac
+ vextf c05, 3, a4 # a4=C21I_bc
+
+ vextf c06, 0, b1 # b1=C11I_ad
+ vextf c06, 1, b2 # b2=C11R_bd
+ vextf c06, 2, b3 # b3=C21I_ad
+ vextf c06, 3, b4 # b4=C21R_bd
+
+ ADD1 a1, b2, b5 # ac '+' bd
+ ADD1 a3, b4, a6
+ ADD2 a2, b1, a7 # bc '+' ad
+ ADD2 a4, b3, a8
+
+ LD a1, 4 * SIZE(C1)
+ LD a2, 5 * SIZE(C1)
+ LD a3, 6 * SIZE(C1)
+ LD a4, 7 * SIZE(C1)
+
+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1
+ FMAD5 a6, alpha_r, a3, a3
+ FMAD7 a7, alpha_r, a2, a2
+ FMAD7 a8, alpha_r, a4, a4
+
+ FMAD8 a7, alpha_i, a1, b4
+ FMAD8 a8, alpha_i, a3, b6
+ FMAD6 b5, alpha_i, a2, c01
+ FMAD6 a6, alpha_i, a4, c02
+
+ ST b4, 4 * SIZE(C1)
+ ST c01, 5 * SIZE(C1)
+ ST b6, 6 * SIZE(C1)
+ ST c02, 7 * SIZE(C1)
+
+ vextf c09, 0, a1 # a1=C11R_ac
+ vextf c09, 1, a2 # a2=C11I_bc
+ vextf c09, 2, a3 # a3=C21R_ac
+ vextf c09, 3, a4 # a4=C21I_bc
+
+ vextf c10, 0, b1 # b1=C11I_ad
+ vextf c10, 1, b2 # b2=C11R_bd
+ vextf c10, 2, b3 # b3=C21I_ad
+ vextf c10, 3, b4 # b4=C21R_bd
+
+ ADD1 a1, b2, b5 # ac '+' bd
+ ADD1 a3, b4, a6
+ ADD2 a2, b1, a7 # bc '+' ad
+ ADD2 a4, b3, a8
+
+ LD a1, 8 * SIZE(C1)
+ LD a2, 9 * SIZE(C1)
+ LD a3, 10 * SIZE(C1)
+ LD a4, 11 * SIZE(C1)
+
+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1
+ FMAD5 a6, alpha_r, a3, a3
+ FMAD7 a7, alpha_r, a2, a2
+ FMAD7 a8, alpha_r, a4, a4
+
+ FMAD8 a7, alpha_i, a1, b4
+ FMAD8 a8, alpha_i, a3, b6
+ FMAD6 b5, alpha_i, a2, c01
+ FMAD6 a6, alpha_i, a4, c02
+
+ ST b4, 8 * SIZE(C1)
+ ST c01, 9 * SIZE(C1)
+ ST b6, 10 * SIZE(C1)
+ ST c02, 11 * SIZE(C1)
+
+ vextf c13, 0, a1 # a1=C11R_ac
+ vextf c13, 1, a2 # a2=C11I_bc
+ vextf c13, 2, a3 # a3=C21R_ac
+ vextf c13, 3, a4 # a4=C21I_bc
+
+ vextf c14, 0, b1 # b1=C11I_ad
+ vextf c14, 1, b2 # b2=C11R_bd
+ vextf c14, 2, b3 # b3=C21I_ad
+ vextf c14, 3, b4 # b4=C21R_bd
+
+ ADD1 a1, b2, b5 # ac '+' bd
+ ADD1 a3, b4, a6
+ ADD2 a2, b1, a7 # bc '+' ad
+ ADD2 a4, b3, a8
+
+ LD a1, 12 * SIZE(C1)
+ LD a2, 13 * SIZE(C1)
+ LD a3, 14 * SIZE(C1)
+ LD a4, 15 * SIZE(C1)
+
+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1
+ FMAD5 a6, alpha_r, a3, a3
+ FMAD7 a7, alpha_r, a2, a2
+ FMAD7 a8, alpha_r, a4, a4
+
+ FMAD8 a7, alpha_i, a1, b4
+ FMAD8 a8, alpha_i, a3, b6
+ FMAD6 b5, alpha_i, a2, c01
+ FMAD6 a6, alpha_i, a4, c02
+
+ ST b4, 12 * SIZE(C1)
+ ST c01, 13 * SIZE(C1)
+ ST b6, 14 * SIZE(C1)
+ ST c02, 15 * SIZE(C1)
+
+
+ vextf c03, 0, a1 # a1=C11R_ac
+ vextf c03, 1, a2 # a2=C11I_bc
+ vextf c03, 2, a3 # a3=C21R_ac
+ vextf c03, 3, a4 # a4=C21I_bc
+
+ vextf c04, 0, b1 # b1=C11I_ad
+ vextf c04, 1, b2 # b2=C11R_bd
+ vextf c04, 2, b3 # b3=C21I_ad
+ vextf c04, 3, b4 # b4=C21R_bd
+
+ ADD1 a1, b2, b5 # ac '+' bd
+ ADD1 a3, b4, a6
+ ADD2 a2, b1, a7 # bc '+' ad
+ ADD2 a4, b3, a8
+
+ LD a1, 0 * SIZE(C2)
+ LD a2, 1 * SIZE(C2)
+ LD a3, 2 * SIZE(C2)
+ LD a4, 3 * SIZE(C2)
+
+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1
+ FMAD5 a6, alpha_r, a3, a3
+ FMAD7 a7, alpha_r, a2, a2
+ FMAD7 a8, alpha_r, a4, a4
+
+ FMAD8 a7, alpha_i, a1, c01
+ FMAD8 a8, alpha_i, a3, c05
+ FMAD6 b5, alpha_i, a2, c02
+ FMAD6 a6, alpha_i, a4, c06
+
+ ST c01, 0 * SIZE(C2)
+ ST c02, 1 * SIZE(C2)
+ ST c05, 2 * SIZE(C2)
+ ST c06, 3 * SIZE(C2)
+
+ vextf c07, 0, a1 # a1=C11R_ac
+ vextf c07, 1, a2 # a2=C11I_bc
+ vextf c07, 2, a3 # a3=C21R_ac
+ vextf c07, 3, a4 # a4=C21I_bc
+
+ vextf c08, 0, b1 # b1=C11I_ad
+ vextf c08, 1, b2 # b2=C11R_bd
+ vextf c08, 2, b3 # b3=C21I_ad
+ vextf c08, 3, b4 # b4=C21R_bd
+
+ ADD1 a1, b2, b5 # ac '+' bd
+ ADD1 a3, b4, a6
+ ADD2 a2, b1, a7 # bc '+' ad
+ ADD2 a4, b3, a8
+
+ LD a1, 4 * SIZE(C2)
+ LD a2, 5 * SIZE(C2)
+ LD a3, 6 * SIZE(C2)
+ LD a4, 7 * SIZE(C2)
+
+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1
+ FMAD5 a6, alpha_r, a3, a3
+ FMAD7 a7, alpha_r, a2, a2
+ FMAD7 a8, alpha_r, a4, a4
+
+ FMAD8 a7, alpha_i, a1, c01
+ FMAD8 a8, alpha_i, a3, c05
+ FMAD6 b5, alpha_i, a2, c02
+ FMAD6 a6, alpha_i, a4, c06
+
+ ST c01, 4 * SIZE(C2)
+ ST c02, 5 * SIZE(C2)
+ ST c05, 6 * SIZE(C2)
+ ST c06, 7 * SIZE(C2)
+
+ vextf c11, 0, a1 # a1=C11R_ac
+ vextf c11, 1, a2 # a2=C11I_bc
+ vextf c11, 2, a3 # a3=C21R_ac
+ vextf c11, 3, a4 # a4=C21I_bc
+
+ vextf c12, 0, b1 # b1=C11I_ad
+ vextf c12, 1, b2 # b2=C11R_bd
+ vextf c12, 2, b3 # b3=C21I_ad
+ vextf c12, 3, b4 # b4=C21R_bd
+
+ ADD1 a1, b2, b5 # ac '+' bd
+ ADD1 a3, b4, a6
+ ADD2 a2, b1, a7 # bc '+' ad
+ ADD2 a4, b3, a8
+
+ LD a1, 8 * SIZE(C2)
+ LD a2, 9 * SIZE(C2)
+ LD a3, 10 * SIZE(C2)
+ LD a4, 11 * SIZE(C2)
+
+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1
+ FMAD5 a6, alpha_r, a3, a3
+ FMAD7 a7, alpha_r, a2, a2
+ FMAD7 a8, alpha_r, a4, a4
+
+ FMAD8 a7, alpha_i, a1, c01
+ FMAD8 a8, alpha_i, a3, c05
+ FMAD6 b5, alpha_i, a2, c02
+ FMAD6 a6, alpha_i, a4, c06
+
+ ST c01, 8 * SIZE(C2)
+ ST c02, 9 * SIZE(C2)
+ ST c05, 10 * SIZE(C2)
+ ST c06, 11 * SIZE(C2)
+
+ vextf c15, 0, a1 # a1=C11R_ac
+ vextf c15, 1, a2 # a2=C11I_bc
+ vextf c15, 2, a3 # a3=C21R_ac
+ vextf c15, 3, a4 # a4=C21I_bc
+
+ vextf c16, 0, b1 # b1=C11I_ad
+ vextf c16, 1, b2 # b2=C11R_bd
+ vextf c16, 2, b3 # b3=C21I_ad
+ vextf c16, 3, b4 # b4=C21R_bd
+
+ ADD1 a1, b2, b5 # ac '+' bd
+ ADD1 a3, b4, a6
+ ADD2 a2, b1, a7 # bc '+' ad
+ ADD2 a4, b3, a8
+
+ LD a1, 12 * SIZE(C2)
+ LD a2, 13 * SIZE(C2)
+ LD a3, 14 * SIZE(C2)
+ LD a4, 15 * SIZE(C2)
+
+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1
+ FMAD5 a6, alpha_r, a3, a3
+ FMAD7 a7, alpha_r, a2, a2
+ FMAD7 a8, alpha_r, a4, a4
+
+ FMAD8 a7, alpha_i, a1, c01
+ FMAD8 a8, alpha_i, a3, c05
+ FMAD6 b5, alpha_i, a2, c02
+ FMAD6 a6, alpha_i, a4, c06
+
+ ST c01, 12 * SIZE(C2)
+ ST c02, 13 * SIZE(C2)
+ ST c05, 14 * SIZE(C2)
+ ST c06, 15 * SIZE(C2)
+
+#else
+ vextf c01, 0, a1 # a1=C11R_ac
+ vextf c01, 1, a2 # a2=C11I_bc
+ vextf c01, 2, a3 # a3=C21R_ac
+ vextf c01, 3, a4 # a4=C21I_bc
+
+ vextf c02, 0, b1 # b1=C11I_ad
+ vextf c02, 1, b2 # b2=C11R_bd
+ vextf c02, 2, b3 # b3=C21I_ad
+ vextf c02, 3, b4 # b4=C21R_bd
+
+ ADD1 a1, b2, b5 # ac '+' bd
+ ADD1 a3, b4, a6
+ ADD2 a2, b1, a7 # bc '+' ad
+ ADD2 a4, b3, a8
+
+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1
+ FMAD5 a6, alpha_r, $f31, a3
+ FMAD7 a7, alpha_r, $f31, a2
+ FMAD7 a8, alpha_r, $f31, a4
+
+ FMAD8 a7, alpha_i, a1, b4
+ FMAD8 a8, alpha_i, a3, b6
+ FMAD6 b5, alpha_i, a2, c01
+ FMAD6 a6, alpha_i, a4, c02
+
+ ST b4, 0 * SIZE(C1)
+ ST c01, 1 * SIZE(C1)
+ ST b6, 2 * SIZE(C1)
+ ST c02, 3 * SIZE(C1)
+
+ vextf c05, 0, a1 # a1=C11R_ac
+ vextf c05, 1, a2 # a2=C11I_bc
+ vextf c05, 2, a3 # a3=C21R_ac
+ vextf c05, 3, a4 # a4=C21I_bc
+
+ vextf c06, 0, b1 # b1=C11I_ad
+ vextf c06, 1, b2 # b2=C11R_bd
+ vextf c06, 2, b3 # b3=C21I_ad
+ vextf c06, 3, b4 # b4=C21R_bd
+
+ ADD1 a1, b2, b5 # ac '+' bd
+ ADD1 a3, b4, a6
+ ADD2 a2, b1, a7 # bc '+' ad
+ ADD2 a4, b3, a8
+
+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1
+ FMAD5 a6, alpha_r, $f31, a3
+ FMAD7 a7, alpha_r, $f31, a2
+ FMAD7 a8, alpha_r, $f31, a4
+
+ FMAD8 a7, alpha_i, a1, b4
+ FMAD8 a8, alpha_i, a3, b6
+ FMAD6 b5, alpha_i, a2, c01
+ FMAD6 a6, alpha_i, a4, c02
+
+ ST b4, 4 * SIZE(C1)
+ ST c01, 5 * SIZE(C1)
+ ST b6, 6 * SIZE(C1)
+ ST c02, 7 * SIZE(C1)
+
+ vextf c09, 0, a1 # a1=C11R_ac
+ vextf c09, 1, a2 # a2=C11I_bc
+ vextf c09, 2, a3 # a3=C21R_ac
+ vextf c09, 3, a4 # a4=C21I_bc
+
+ vextf c10, 0, b1 # b1=C11I_ad
+ vextf c10, 1, b2 # b2=C11R_bd
+ vextf c10, 2, b3 # b3=C21I_ad
+ vextf c10, 3, b4 # b4=C21R_bd
+
+ ADD1 a1, b2, b5 # ac '+' bd
+ ADD1 a3, b4, a6
+ ADD2 a2, b1, a7 # bc '+' ad
+ ADD2 a4, b3, a8
+
+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1
+ FMAD5 a6, alpha_r, $f31, a3
+ FMAD7 a7, alpha_r, $f31, a2
+ FMAD7 a8, alpha_r, $f31, a4
+
+ FMAD8 a7, alpha_i, a1, b4
+ FMAD8 a8, alpha_i, a3, b6
+ FMAD6 b5, alpha_i, a2, c01
+ FMAD6 a6, alpha_i, a4, c02
+
+ ST b4, 8 * SIZE(C1)
+ ST c01, 9 * SIZE(C1)
+ ST b6, 10 * SIZE(C1)
+ ST c02, 11 * SIZE(C1)
+
+ vextf c13, 0, a1 # a1=C11R_ac
+ vextf c13, 1, a2 # a2=C11I_bc
+ vextf c13, 2, a3 # a3=C21R_ac
+ vextf c13, 3, a4 # a4=C21I_bc
+
+ vextf c14, 0, b1 # b1=C11I_ad
+ vextf c14, 1, b2 # b2=C11R_bd
+ vextf c14, 2, b3 # b3=C21I_ad
+ vextf c14, 3, b4 # b4=C21R_bd
+
+ ADD1 a1, b2, b5 # ac '+' bd
+ ADD1 a3, b4, a6
+ ADD2 a2, b1, a7 # bc '+' ad
+ ADD2 a4, b3, a8
+
+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1
+ FMAD5 a6, alpha_r, $f31, a3
+ FMAD7 a7, alpha_r, $f31, a2
+ FMAD7 a8, alpha_r, $f31, a4
+
+ FMAD8 a7, alpha_i, a1, b4
+ FMAD8 a8, alpha_i, a3, b6
+ FMAD6 b5, alpha_i, a2, c01
+ FMAD6 a6, alpha_i, a4, c02
+
+ ST b4, 12 * SIZE(C1)
+ ST c01, 13 * SIZE(C1)
+ ST b6, 14 * SIZE(C1)
+ ST c02, 15 * SIZE(C1)
+
+
+ vextf c03, 0, a1 # a1=C11R_ac
+ vextf c03, 1, a2 # a2=C11I_bc
+ vextf c03, 2, a3 # a3=C21R_ac
+ vextf c03, 3, a4 # a4=C21I_bc
+
+ vextf c04, 0, b1 # b1=C11I_ad
+ vextf c04, 1, b2 # b2=C11R_bd
+ vextf c04, 2, b3 # b3=C21I_ad
+ vextf c04, 3, b4 # b4=C21R_bd
+
+ ADD1 a1, b2, b5 # ac '+' bd
+ ADD1 a3, b4, a6
+ ADD2 a2, b1, a7 # bc '+' ad
+ ADD2 a4, b3, a8
+
+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1
+ FMAD5 a6, alpha_r, $f31, a3
+ FMAD7 a7, alpha_r, $f31, a2
+ FMAD7 a8, alpha_r, $f31, a4
+
+ FMAD8 a7, alpha_i, a1, c01
+ FMAD8 a8, alpha_i, a3, c05
+ FMAD6 b5, alpha_i, a2, c02
+ FMAD6 a6, alpha_i, a4, c06
+
+ ST c01, 0 * SIZE(C2)
+ ST c02, 1 * SIZE(C2)
+ ST c05, 2 * SIZE(C2)
+ ST c06, 3 * SIZE(C2)
+
+ vextf c07, 0, a1 # a1=C11R_ac
+ vextf c07, 1, a2 # a2=C11I_bc
+ vextf c07, 2, a3 # a3=C21R_ac
+ vextf c07, 3, a4 # a4=C21I_bc
+
+ vextf c08, 0, b1 # b1=C11I_ad
+ vextf c08, 1, b2 # b2=C11R_bd
+ vextf c08, 2, b3 # b3=C21I_ad
+ vextf c08, 3, b4 # b4=C21R_bd
+
+ ADD1 a1, b2, b5 # ac '+' bd
+ ADD1 a3, b4, a6
+ ADD2 a2, b1, a7 # bc '+' ad
+ ADD2 a4, b3, a8
+
+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1
+ FMAD5 a6, alpha_r, $f31, a3
+ FMAD7 a7, alpha_r, $f31, a2
+ FMAD7 a8, alpha_r, $f31, a4
+
+ FMAD8 a7, alpha_i, a1, c01
+ FMAD8 a8, alpha_i, a3, c05
+ FMAD6 b5, alpha_i, a2, c02
+ FMAD6 a6, alpha_i, a4, c06
+
+ ST c01, 4 * SIZE(C2)
+ ST c02, 5 * SIZE(C2)
+ ST c05, 6 * SIZE(C2)
+ ST c06, 7 * SIZE(C2)
+
+ vextf c11, 0, a1 # a1=C11R_ac
+ vextf c11, 1, a2 # a2=C11I_bc
+ vextf c11, 2, a3 # a3=C21R_ac
+ vextf c11, 3, a4 # a4=C21I_bc
+
+ vextf c12, 0, b1 # b1=C11I_ad
+ vextf c12, 1, b2 # b2=C11R_bd
+ vextf c12, 2, b3 # b3=C21I_ad
+ vextf c12, 3, b4 # b4=C21R_bd
+
+ ADD1 a1, b2, b5 # ac '+' bd
+ ADD1 a3, b4, a6
+ ADD2 a2, b1, a7 # bc '+' ad
+ ADD2 a4, b3, a8
+
+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1
+ FMAD5 a6, alpha_r, $f31, a3
+ FMAD7 a7, alpha_r, $f31, a2
+ FMAD7 a8, alpha_r, $f31, a4
+
+ FMAD8 a7, alpha_i, a1, c01
+ FMAD8 a8, alpha_i, a3, c05
+ FMAD6 b5, alpha_i, a2, c02
+ FMAD6 a6, alpha_i, a4, c06
+
+ ST c01, 8 * SIZE(C2)
+ ST c02, 9 * SIZE(C2)
+ ST c05, 10 * SIZE(C2)
+ ST c06, 11 * SIZE(C2)
+
+ vextf c15, 0, a1 # a1=C11R_ac
+ vextf c15, 1, a2 # a2=C11I_bc
+ vextf c15, 2, a3 # a3=C21R_ac
+ vextf c15, 3, a4 # a4=C21I_bc
+
+ vextf c16, 0, b1 # b1=C11I_ad
+ vextf c16, 1, b2 # b2=C11R_bd
+ vextf c16, 2, b3 # b3=C21I_ad
+ vextf c16, 3, b4 # b4=C21R_bd
+
+ ADD1 a1, b2, b5 # ac '+' bd
+ ADD1 a3, b4, a6
+ ADD2 a2, b1, a7 # bc '+' ad
+ ADD2 a4, b3, a8
+
+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1
+ FMAD5 a6, alpha_r, $f31, a3
+ FMAD7 a7, alpha_r, $f31, a2
+ FMAD7 a8, alpha_r, $f31, a4
+
+ FMAD8 a7, alpha_i, a1, c01
+ FMAD8 a8, alpha_i, a3, c05
+ FMAD6 b5, alpha_i, a2, c02
+ FMAD6 a6, alpha_i, a4, c06
+
+ ST c01, 12 * SIZE(C2)
+ ST c02, 13 * SIZE(C2)
+ ST c05, 14 * SIZE(C2)
+ ST c06, 15 * SIZE(C2)
+
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ subl K, KK, TEMP
+#ifdef LEFT
+ subl TEMP, 8, TEMP
+#else
+ subl TEMP, 2, TEMP
+#endif
+
+ sll TEMP, 3 + ZBASE_SHIFT,L # mr=8
+ sll TEMP, 1 + ZBASE_SHIFT,TEMP # nr=2
+
+ addl AO, L, AO
+ addl BO, TEMP, BO
+#endif
+
+#ifdef LEFT
+ addl KK,8,KK
+#endif
+#endif
+
+ jmp $L09
+
+
+ .align 4
+
+$L20: # N=2, M=4
+ and M, 4, I # I=M&4
+ ble I, $L30
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov B, BO
+ nop
+#else
+ sll KK, 2 + ZBASE_SHIFT, L # mr=4
+ sll KK, 1 + ZBASE_SHIFT,TEMP # nr=2
+
+ addl AO, L, AO
+ addl B, TEMP, BO
+#endif
+ fillcs 0(C1)
+ fillcs 4*SIZE(C1)
+ fillcs 8*SIZE(C1)
+
+ vcpys $f31,$f31,c01 # Clear result regs
+ vcpys $f31,$f31,c02
+ vcpys $f31,$f31,c03
+ vcpys $f31,$f31,c04
+
+ fillcs 0(C2)
+ fillcs 4*SIZE(C2)
+ fillcs 8*SIZE(C2)
+
+ vcpys $f31,$f31,c05
+ vcpys $f31,$f31,c06
+ vcpys $f31,$f31,c07
+ vcpys $f31,$f31,c08
+
+ LDDE b1, 0 * SIZE(BO) # B1R
+ LDDE b2, 1 * SIZE(BO) # B1I
+ LDDE b3, 2 * SIZE(BO) # B2R
+ LDDE b4, 3 * SIZE(BO) # B2I
+
+ VLD a1, 0 * SIZE(AO) # A1, A2
+ VLD a2, 4 * SIZE(AO) # A3, A4
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ subl K, KK, TEMP
+#elif defined(LEFT)
+ addl KK, 4, TEMP # mr=4
+#else
+ addl KK, 2,TEMP # nr=2
+#endif
+ sra TEMP, 1, L
+ ble L, $L25
+
+#else
+ mov B, BO # Set B, (block A x panel Bj)
+ sra K, 1, L # Unroll K as 2
+
+ fillcs 0(C1)
+ fillcs 4*SIZE(C1)
+ fillcs 8*SIZE(C1)
+
+ vcpys $f31,$f31,c01 # Clear result regs
+ vcpys $f31,$f31,c02
+ vcpys $f31,$f31,c03
+ vcpys $f31,$f31,c04
+
+ fillcs 0(C2)
+ fillcs 4*SIZE(C2)
+ fillcs 8*SIZE(C2)
+
+ vcpys $f31,$f31,c05
+ vcpys $f31,$f31,c06
+ vcpys $f31,$f31,c07
+ vcpys $f31,$f31,c08
+
+ LDDE b1, 0 * SIZE(BO) # B1R
+ LDDE b2, 1 * SIZE(BO) # B1I
+ LDDE b3, 2 * SIZE(BO) # B2R
+ LDDE b4, 3 * SIZE(BO) # B2I
+
+ VLD a1, 0 * SIZE(AO) # A1, A2
+ VLD a2, 4 * SIZE(AO) # A3, A4
+
+ ble L, $L25
+#endif
+
+ .align 4
+$L22:
+ VMAD a1,b1,c01,c01 # C11(ac,bc), C21(ac,bc)
+ VMAD a1,b2,c02,c02 # C11(ad,bd), C21(ad,bd)
+ VMAD a1,b3,c03,c03 # C12(ac,bc), C22(ac,bc)
+ VMAD a1,b4,c04,c04 # C12(ad,bd), C22(ad,bd)
+
+ LDDE b5, 4 * SIZE(BO) # next B1R
+ LDDE b6, 5 * SIZE(BO) # next B1I
+ LDDE b7, 6 * SIZE(BO) # next B2R
+ LDDE b8, 7 * SIZE(BO) # next B2I
+
+ fillcs 0(PREB)
+ addl BO, 8*SIZE, BO # BO+=2nr*2kr*2cpx*SIZE
+ VMAD a2,b1,c05,c05 # C31, C41
+ VMAD a2,b2,c06,c06 # C31, C41
+
+ fillcs 0(PREA)
+ VMAD a2,b3,c07,c07 # C32, C42
+ VMAD a2,b4,c08,c08 # C32, C42
+
+ VLD a5, 8 * SIZE(AO) # next A1, A2, a5==a0
+ VLD a6, 12 * SIZE(AO) # next A3, A4
+
+ subl L, 1, L #
+
+ addl AO, 16*SIZE, AO # AO+=4mr*2kr*2px*SIZE
+ VMAD a5,b5,c01,c01
+ VMAD a5,b6,c02,c02
+
+ addl PREA, 16*SIZE, PREA
+ VMAD a5,b7,c03,c03
+ VMAD a5,b8,c04,c04
+
+ LDDE b1, 0 * SIZE(BO)
+ LDDE b2, 1 * SIZE(BO)
+ LDDE b3, 2 * SIZE(BO)
+ LDDE b4, 3 * SIZE(BO)
+
+ fillcs 4*SIZE(PREB)
+ VMAD a6,b5,c05,c05
+ VMAD a6,b6,c06,c06
+
+ fillcs 0(PREA)
+ VMAD a6,b7,c07,c07
+ VMAD a6,b8,c08,c08
+
+ VLD a1, 0 * SIZE(AO)
+ VLD a2, 4 * SIZE(AO)
+
+ addl PREB, 8*SIZE, PREB
+ addl PREA, 16*SIZE, PREA
+ bne L, $L22 # continue K
+
+$L25:
+ LD alpha_r, ALPHA_R # $f30==b8
+#ifndef TRMMKERNEL
+ blbc K, $L28 # if(K&1)
+#else
+ blbc TEMP, $L28
+#endif
+
+$L26:
+ addl AO, 8*SIZE, AO # AO+=4mr*1kr*2px*SIZE
+ VMAD a1,b1,c01,c01 # C11R C21R
+ VMAD a1,b2,c02,c02 # C11I C21I
+ VMAD a1,b3,c03,c03 # C12R c22R
+ VMAD a1,b4,c04,c04 # C12I C22I
+
+ addl BO, 4*SIZE, BO
+ VMAD a2,b1,c05,c05 # C31R C41R
+ VMAD a2,b2,c06,c06 # C31I C41I
+ VMAD a2,b3,c07,c07 # C32R C42R
+ VMAD a2,b4,c08,c08 # C32I C42I
+
+$L28: # Write back
+ LD alpha_i, ALPHA_I # $f29==b7
+#ifndef TRMMKERNEL
+ vextf c01, 0, a1 # a1=C11R_ac
+ vextf c01, 1, a2 # a2=C11I_bc
+ vextf c01, 2, a3 # a3=C21R_ac
+ vextf c01, 3, a4 # a4=C21I_bc
+
+ vextf c02, 0, b1 # b1=C11I_ad
+ vextf c02, 1, b2 # b2=C11R_bd
+ vextf c02, 2, b3 # b3=C21I_ad
+ vextf c02, 3, b4 # b4=C21R_bd
+
+ ADD1 a1, b2, b5 # ac '+' bd
+ ADD1 a3, b4, a6
+ ADD2 a2, b1, a7 # bc '+' ad
+ ADD2 a4, b3, a8
+
+ LD a1, 0 * SIZE(C1)
+ LD a2, 1 * SIZE(C1)
+ LD a3, 2 * SIZE(C1)
+ LD a4, 3 * SIZE(C1)
+
+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1
+ FMAD5 a6, alpha_r, a3, a3
+ FMAD7 a7, alpha_r, a2, a2
+ FMAD7 a8, alpha_r, a4, a4
+
+ FMAD8 a7, alpha_i, a1, b4
+ FMAD8 a8, alpha_i, a3, b6
+ FMAD6 b5, alpha_i, a2, c01
+ FMAD6 a6, alpha_i, a4, c02
+
+ ST b4, 0 * SIZE(C1)
+ ST c01, 1 * SIZE(C1)
+ ST b6, 2 * SIZE(C1)
+ ST c02, 3 * SIZE(C1)
+
+ vextf c05, 0, a1 # a1=C11R_ac
+ vextf c05, 1, a2 # a2=C11I_bc
+ vextf c05, 2, a3 # a3=C21R_ac
+ vextf c05, 3, a4 # a4=C21I_bc
+
+ vextf c06, 0, b1 # b1=C11I_ad
+ vextf c06, 1, b2 # b2=C11R_bd
+ vextf c06, 2, b3 # b3=C21I_ad
+ vextf c06, 3, b4 # b4=C21R_bd
+
+ ADD1 a1, b2, b5 # ac '+' bd
+ ADD1 a3, b4, a6
+ ADD2 a2, b1, a7 # bc '+' ad
+ ADD2 a4, b3, a8
+
+ LD a1, 4 * SIZE(C1)
+ LD a2, 5 * SIZE(C1)
+ LD a3, 6 * SIZE(C1)
+ LD a4, 7 * SIZE(C1)
+
+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1
+ FMAD5 a6, alpha_r, a3, a3
+ FMAD7 a7, alpha_r, a2, a2
+ FMAD7 a8, alpha_r, a4, a4
+
+ FMAD8 a7, alpha_i, a1, b4
+ FMAD8 a8, alpha_i, a3, b6
+ FMAD6 b5, alpha_i, a2, c01
+ FMAD6 a6, alpha_i, a4, c02
+
+ ST b4, 4 * SIZE(C1)
+ ST c01, 5 * SIZE(C1)
+ ST b6, 6 * SIZE(C1)
+ ST c02, 7 * SIZE(C1)
+
+
+ vextf c03, 0, a1 # a1=C11R_ac
+ vextf c03, 1, a2 # a2=C11I_bc
+ vextf c03, 2, a3 # a3=C21R_ac
+ vextf c03, 3, a4 # a4=C21I_bc
+
+ vextf c04, 0, b1 # b1=C11I_ad
+ vextf c04, 1, b2 # b2=C11R_bd
+ vextf c04, 2, b3 # b3=C21I_ad
+ vextf c04, 3, b4 # b4=C21R_bd
+
+ ADD1 a1, b2, b5 # ac '+' bd
+ ADD1 a3, b4, a6
+ ADD2 a2, b1, a7 # bc '+' ad
+ ADD2 a4, b3, a8
+
+ LD a1, 0 * SIZE(C2)
+ LD a2, 1 * SIZE(C2)
+ LD a3, 2 * SIZE(C2)
+ LD a4, 3 * SIZE(C2)
+
+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1
+ FMAD5 a6, alpha_r, a3, a3
+ FMAD7 a7, alpha_r, a2, a2
+ FMAD7 a8, alpha_r, a4, a4
+
+ FMAD8 a7, alpha_i, a1, c01
+ FMAD8 a8, alpha_i, a3, c05
+ FMAD6 b5, alpha_i, a2, c02
+ FMAD6 a6, alpha_i, a4, c06
+
+ ST c01, 0 * SIZE(C2)
+ ST c02, 1 * SIZE(C2)
+ ST c05, 2 * SIZE(C2)
+ ST c06, 3 * SIZE(C2)
+
+ vextf c07, 0, a1 # a1=C11R_ac
+ vextf c07, 1, a2 # a2=C11I_bc
+ vextf c07, 2, a3 # a3=C21R_ac
+ vextf c07, 3, a4 # a4=C21I_bc
+
+ vextf c08, 0, b1 # b1=C11I_ad
+ vextf c08, 1, b2 # b2=C11R_bd
+ vextf c08, 2, b3 # b3=C21I_ad
+ vextf c08, 3, b4 # b4=C21R_bd
+
+ ADD1 a1, b2, b5 # ac '+' bd
+ ADD1 a3, b4, a6
+ ADD2 a2, b1, a7 # bc '+' ad
+ ADD2 a4, b3, a8
+
+ LD a1, 4 * SIZE(C2)
+ LD a2, 5 * SIZE(C2)
+ LD a3, 6 * SIZE(C2)
+ LD a4, 7 * SIZE(C2)
+
+ FMAD5 b5, alpha_r, a1, a1 # a1=a5*alpha_r+a1
+ FMAD5 a6, alpha_r, a3, a3
+ FMAD7 a7, alpha_r, a2, a2
+ FMAD7 a8, alpha_r, a4, a4
+
+ FMAD8 a7, alpha_i, a1, c01
+ FMAD8 a8, alpha_i, a3, c05
+ FMAD6 b5, alpha_i, a2, c02
+ FMAD6 a6, alpha_i, a4, c06
+
+ ST c01, 4 * SIZE(C2)
+ ST c02, 5 * SIZE(C2)
+ ST c05, 6 * SIZE(C2)
+ ST c06, 7 * SIZE(C2)
+
+#else
+
+ vextf c01, 0, a1 # a1=C11R_ac
+ vextf c01, 1, a2 # a2=C11I_bc
+ vextf c01, 2, a3 # a3=C21R_ac
+ vextf c01, 3, a4 # a4=C21I_bc
+
+ vextf c02, 0, b1 # b1=C11I_ad
+ vextf c02, 1, b2 # b2=C11R_bd
+ vextf c02, 2, b3 # b3=C21I_ad
+ vextf c02, 3, b4 # b4=C21R_bd
+
+ ADD1 a1, b2, b5 # ac '+' bd
+ ADD1 a3, b4, a6
+ ADD2 a2, b1, a7 # bc '+' ad
+ ADD2 a4, b3, a8
+
+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1
+ FMAD5 a6, alpha_r, $f31, a3
+ FMAD7 a7, alpha_r, $f31, a2
+ FMAD7 a8, alpha_r, $f31, a4
+
+ FMAD8 a7, alpha_i, a1, b4
+ FMAD8 a8, alpha_i, a3, b6
+ FMAD6 b5, alpha_i, a2, c01
+ FMAD6 a6, alpha_i, a4, c02
+
+ ST b4, 0 * SIZE(C1)
+ ST c01, 1 * SIZE(C1)
+ ST b6, 2 * SIZE(C1)
+ ST c02, 3 * SIZE(C1)
+
+ vextf c05, 0, a1 # a1=C11R_ac
+ vextf c05, 1, a2 # a2=C11I_bc
+ vextf c05, 2, a3 # a3=C21R_ac
+ vextf c05, 3, a4 # a4=C21I_bc
+
+ vextf c06, 0, b1 # b1=C11I_ad
+ vextf c06, 1, b2 # b2=C11R_bd
+ vextf c06, 2, b3 # b3=C21I_ad
+ vextf c06, 3, b4 # b4=C21R_bd
+
+ ADD1 a1, b2, b5 # ac '+' bd
+ ADD1 a3, b4, a6
+ ADD2 a2, b1, a7 # bc '+' ad
+ ADD2 a4, b3, a8
+
+ FMAD5 b5, alpha_r, $f31, a1 # a1=a5*alpha_r+a1
+ FMAD5 a6, alpha_r, $f31, a3
+ FMAD7 a7, alpha_r, $f31, a2
+ FMAD7 a8, alpha_r, $f31, a4
+
+ FMAD8 a7, alpha_i, a1, b4
+ FMAD8 a8, alpha_i, a3, b6
+ FMAD6 b5, alpha_i, a2, c01
+ FMAD6 a6, alpha_i, a4, c02
+
+ ST b4, 4 * SIZE(C1)
+ ST c01, 5 * SIZE(C1)
+ ST b6, 6 * SIZE(C1)
+ ST c02, 7 * SIZE(C1)
+
+
+ vextf c03, 0, a1 # a1=C11R_ac
+ vextf c03, 1, a2 # a2=C11I_bc
+ vextf c03, 2, a3 # a3=C21R_ac
+ vextf c03, 3, a4 # a4=C21I_bc
+
+ vextf c04, 0, b1 # b1=C11I_ad
+ vextf c04, 1, b2 # b2=C11R_bd
+ vextf c04, 2, b3 # b3=C21I_ad
+ vextf c04, 3, b4 # b4=C21R_bd
+
+ ADD1 a1, b2, b5 # ac '+' bd
+ ADD1 a3, b4, a6
+ ADD2 a2, b1, a7 # bc '+' ad
+ ADD2 a4, b3, a8
+
+	FMAD5	b5, alpha_r, $f31, a1	# a1=b5*alpha_r
+ FMAD5 a6, alpha_r, $f31, a3
+ FMAD7 a7, alpha_r, $f31, a2
+ FMAD7 a8, alpha_r, $f31, a4
+
+ FMAD8 a7, alpha_i, a1, c01
+ FMAD8 a8, alpha_i, a3, c05
+ FMAD6 b5, alpha_i, a2, c02
+ FMAD6 a6, alpha_i, a4, c06
+
+ ST c01, 0 * SIZE(C2)
+ ST c02, 1 * SIZE(C2)
+ ST c05, 2 * SIZE(C2)
+ ST c06, 3 * SIZE(C2)
+
+ vextf c07, 0, a1 # a1=C11R_ac
+ vextf c07, 1, a2 # a2=C11I_bc
+ vextf c07, 2, a3 # a3=C21R_ac
+ vextf c07, 3, a4 # a4=C21I_bc
+
+ vextf c08, 0, b1 # b1=C11I_ad
+ vextf c08, 1, b2 # b2=C11R_bd
+ vextf c08, 2, b3 # b3=C21I_ad
+ vextf c08, 3, b4 # b4=C21R_bd
+
+ ADD1 a1, b2, b5 # ac '+' bd
+ ADD1 a3, b4, a6
+ ADD2 a2, b1, a7 # bc '+' ad
+ ADD2 a4, b3, a8
+
+	FMAD5	b5, alpha_r, $f31, a1	# a1=b5*alpha_r
+ FMAD5 a6, alpha_r, $f31, a3
+ FMAD7 a7, alpha_r, $f31, a2
+ FMAD7 a8, alpha_r, $f31, a4
+
+ FMAD8 a7, alpha_i, a1, c01
+ FMAD8 a8, alpha_i, a3, c05
+ FMAD6 b5, alpha_i, a2, c02
+ FMAD6 a6, alpha_i, a4, c06
+
+ ST c01, 4 * SIZE(C2)
+ ST c02, 5 * SIZE(C2)
+ ST c05, 6 * SIZE(C2)
+ ST c06, 7 * SIZE(C2)
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ subl K, KK, TEMP
+#ifdef LEFT
+ subl TEMP, 4, TEMP
+#else
+ subl TEMP, 2, TEMP
+#endif
+
+ sll TEMP, 2 + ZBASE_SHIFT, L
+ sll TEMP, 1 + ZBASE_SHIFT, TEMP
+
+ addl AO, L, AO
+ addl BO, TEMP,BO
+#endif
+
+#ifdef LEFT
+ addl KK, 4,KK
+#endif
+#endif
+
+ addl C1, 8*SIZE, C1
+ addl C2, 8*SIZE, C2
+
+
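The #if block just above is the TRMM bookkeeping for the MR=4 x NR=2 tile: once the tile has been written back, AO and BO are moved past the K iterations this tile never touches, and KK grows by MR in the LEFT case. A rough C sketch of the same arithmetic (illustration only, not the kernel): pointers here step in FLOAT elements, two per complex number, where the assembly uses byte offsets built with ZBASE_SHIFT, and mr, nr, kk, k stand in for the registers of the same names.

    /* Hypothetical helper mirroring the TRMM tail bookkeeping above. */
    static void trmm_advance(double **ao, double **bo, long *kk, long k,
                             int mr, int nr, int left, int transa)
    {
        if ((left && transa) || (!left && !transa)) {
            long temp = k - *kk - (left ? mr : nr); /* subl K, KK, TEMP; subl TEMP, mr|nr   */
            *ao += temp * mr * 2;                   /* skip the unused tail of the A panel  */
            *bo += temp * nr * 2;                   /* skip the unused tail of the B panel  */
        }
        if (left)
            *kk += mr;                              /* addl KK, 4, KK for this tile         */
    }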
+ .align 4
+$L30:
+ and M, 2, I # I=M&2
+ ble I, $L40
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov B, BO
+ nop
+#else
+ sll KK, 1 + ZBASE_SHIFT, L # mr=2
+ sll KK, 1 + ZBASE_SHIFT,TEMP # nr=2
+
+ addl AO, L, AO
+ addl B, TEMP, BO
+#endif
+
+ fclr c01
+ fclr c02
+ fclr c03
+ fclr c04
+ fclr c05
+ fclr c06
+ fclr c07
+	fclr	c08			# CLEAR 16 registers
+ fclr c09
+ fclr c10
+ fclr c11
+ fclr c12
+ fclr c13
+ fclr c14
+ fclr c15
+ fclr c16
+
+ fillcs 0*SIZE(C1)
+ fillcs 4*SIZE(C1)
+
+ LD b1, 0*SIZE(BO) # b1 real part
+ LD b2, 1*SIZE(BO) # b1 image part
+ LD b3, 2*SIZE(BO) # b2 real part
+ LD b4, 3*SIZE(BO) # b2 image part
+
+ fillcs 0*SIZE(C2)
+ fillcs 4*SIZE(C2)
+
+ LD a1, 0*SIZE(AO) # a1 real part
+ LD a2, 1*SIZE(AO) # a1 image part
+ LD a3, 2*SIZE(AO) # a2 real part
+ LD a4, 3*SIZE(AO) # a2 image part
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ subl K, KK, TEMP
+#elif defined(LEFT)
+ addl KK, 2, TEMP # mr=2
+#else
+ addl KK, 2, TEMP # nr=2
+#endif
+ sra TEMP, 1, L
+ ble L, $L35
+
+#else
+
+ mov B, BO # Set B, (block A x panel Bj)
+ sra K, 1, L # Unroll K as 2
+
+ fclr c01
+ fclr c02
+ fclr c03
+ fclr c04
+ fclr c05
+ fclr c06
+ fclr c07
+	fclr	c08			# CLEAR 16 registers
+ fclr c09
+ fclr c10
+ fclr c11
+ fclr c12
+ fclr c13
+ fclr c14
+ fclr c15
+ fclr c16
+
+ fillcs 0*SIZE(C1)
+ fillcs 4*SIZE(C1)
+
+ LD b1, 0*SIZE(BO) # b1 real part
+ LD b2, 1*SIZE(BO) # b1 image part
+ LD b3, 2*SIZE(BO) # b2 real part
+ LD b4, 3*SIZE(BO) # b2 image part
+
+ fillcs 0*SIZE(C2)
+ fillcs 4*SIZE(C2)
+
+ LD a1, 0*SIZE(AO) # a1 real part
+ LD a2, 1*SIZE(AO) # a1 image part
+ LD a3, 2*SIZE(AO) # a2 real part
+ LD a4, 3*SIZE(AO) # a2 image part
+
+ ble L, $L35
+#endif
+
+ .align 4
+$L32:
+ MAD a1,b1,c01,c01 # a1*c1
+ MAD a1,b2,c02,c02 # a1*d1
+ MAD a1,b3,c03,c03 # a1*c2
+ MAD a1,b4,c04,c04 # a1*d2
+
+ LD b5, 4 * SIZE(BO) # next B1R
+ LD b6, 5 * SIZE(BO) # next B1I
+ LD b7, 6 * SIZE(BO) # next B2R
+ LD b8, 7 * SIZE(BO) # next B2I
+
+ LD a5, 4 * SIZE(AO) # next A1-A4 real part
+ LD a6, 5 * SIZE(AO) # next A1-A4 image part
+ LD a7, 6 * SIZE(AO)
+ LD a8, 7 * SIZE(AO)
+
+ addl BO, 8*SIZE, BO # BO+=2nr*2kr*2cpx*SIZE
+ MAD a2,b1,c05,c05 # b1*c1
+ MAD a2,b2,c06,c06 # b1*d1
+ MAD a2,b3,c07,c07 # b1*c2
+ MAD a2,b4,c08,c08 # b1*d2
+
+ MAD a3,b1,c09,c09 # a2*c1
+ MAD a3,b2,c10,c10 # a2*d1
+ MAD a3,b3,c11,c11 # a2*c2
+ MAD a3,b4,c12,c12 # a2*d2
+
+ MAD a4,b1,c13,c13 # b2*c1
+ MAD a4,b2,c14,c14 # b2*d1
+ MAD a4,b3,c15,c15 # b2*c2
+ MAD a4,b4,c16,c16 # b2*d2
+
+ subl L, 1, L #
+
+	addl	AO, 8*SIZE, AO		# AO+=2mr*2kr*2px*SIZE
+ MAD a5,b5,c01,c01
+ MAD a5,b6,c02,c02
+ MAD a5,b7,c03,c03
+ MAD a5,b8,c04,c04
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ MAD a6,b5,c05,c05
+ MAD a6,b6,c06,c06
+ MAD a6,b7,c07,c07
+ MAD a6,b8,c08,c08
+
+ MAD a7,b5,c09,c09
+ MAD a7,b6,c10,c10
+ MAD a7,b7,c11,c11
+ MAD a7,b8,c12,c12
+
+ MAD a8,b5,c13,c13
+ MAD a8,b6,c14,c14
+ MAD a8,b7,c15,c15
+ MAD a8,b8,c16,c16
+
+ bne L, $L32 # continue K
+
+$L35:
+ LD alpha_r, ALPHA_R # $f30==b8
+#ifndef TRMMKERNEL
+ blbc K, $L38 # if(K&1)
+#else
+ blbc TEMP, $L38
+#endif
+
+$L36:
+ addl AO, 4*SIZE, AO # AO+=2mr*1kr*2px*SIZE
+ addl BO, 4*SIZE, BO
+
+ MAD a1,b1,c01,c01 # a1*c1
+ MAD a1,b2,c02,c02 # a1*d1
+ MAD a1,b3,c03,c03 # a1*c2
+ MAD a1,b4,c04,c04 # a1*d2
+
+ MAD a2,b1,c05,c05 # b1*c1
+ MAD a2,b2,c06,c06 # b1*d1
+ MAD a2,b3,c07,c07 # b1*c2
+ MAD a2,b4,c08,c08 # b1*d2
+
+ MAD a3,b1,c09,c09 # a2*c1
+ MAD a3,b2,c10,c10 # a2*d1
+ MAD a3,b3,c11,c11 # a2*c2
+ MAD a3,b4,c12,c12 # a2*d2
+
+ MAD a4,b1,c13,c13 # b2*c1
+ MAD a4,b2,c14,c14 # b2*d1
+ MAD a4,b3,c15,c15 # b2*c2
+ MAD a4,b4,c16,c16 # b2*d2
+
+
+
+$L38: # Write back
+ LD alpha_i, ALPHA_I # $f29==b7
+#ifndef TRMMKERNEL
+ ADD1 c01, c06, c01 # ac '+' bd
+ ADD1 c09, c14, c09
+ ADD1 c03, c08, c03 #
+ ADD1 c11, c16, c11
+
+ ADD2 c05, c02, c02 # bc '+' ad
+ ADD2 c13, c10, c10
+ ADD2 c07, c04, c04
+ ADD2 c15, c12, c12
+
+ LD b1, 0 * SIZE(C1)
+ LD b2, 1 * SIZE(C1)
+ LD b3, 2 * SIZE(C1)
+ LD b4, 3 * SIZE(C1)
+
+ LD a5, 0 * SIZE(C2)
+ LD a6, 1 * SIZE(C2)
+ LD a7, 2 * SIZE(C2)
+ LD a8, 3 * SIZE(C2)
+
+ FMAD5 c01, alpha_r, b1, b1
+ FMAD5 c09, alpha_r, b3, b3
+ FMAD5 c03, alpha_r, a5, a5
+ FMAD5 c11, alpha_r, a7, a7
+
+ FMAD7 c02, alpha_r, b2, b2
+ FMAD7 c10, alpha_r, b4, b4
+ FMAD7 c04, alpha_r, a6, a6
+ FMAD7 c12, alpha_r, a8, a8
+
+ FMAD8 c02, alpha_i, b1, b1
+ FMAD8 c10, alpha_i, b3, b3
+ FMAD8 c04, alpha_i, a5, a5
+ FMAD8 c12, alpha_i, a7, a7
+
+ FMAD6 c01, alpha_i, b2, b2
+ FMAD6 c09, alpha_i, b4, b4
+ FMAD6 c03, alpha_i, a6, a6
+ FMAD6 c11, alpha_i, a8, a8
+
+ ST b1, 0 * SIZE(C1)
+ ST b2, 1 * SIZE(C1)
+ ST b3, 2 * SIZE(C1)
+ ST b4, 3 * SIZE(C1)
+
+ ST a5, 0 * SIZE(C2)
+ ST a6, 1 * SIZE(C2)
+ ST a7, 2 * SIZE(C2)
+ ST a8, 3 * SIZE(C2)
+
+#else
+
+ ADD1 c01, c06, c01 # ac '+' bd
+ ADD1 c09, c14, c09
+ ADD1 c03, c08, c03 #
+ ADD1 c11, c16, c11
+
+ ADD2 c05, c02, c02 # bc '+' ad
+ ADD2 c13, c10, c10
+ ADD2 c07, c04, c04
+ ADD2 c15, c12, c12
+
+ FMAD5 c01, alpha_r, $f31, b1
+ FMAD5 c09, alpha_r, $f31, b3
+ FMAD5 c03, alpha_r, $f31, a5
+ FMAD5 c11, alpha_r, $f31, a7
+
+ FMAD7 c02, alpha_r, $f31, b2
+ FMAD7 c10, alpha_r, $f31, b4
+ FMAD7 c04, alpha_r, $f31, a6
+ FMAD7 c12, alpha_r, $f31, a8
+
+ FMAD8 c02, alpha_i, b1, b1
+ FMAD8 c10, alpha_i, b3, b3
+ FMAD8 c04, alpha_i, a5, a5
+ FMAD8 c12, alpha_i, a7, a7
+
+ FMAD6 c01, alpha_i, b2, b2
+ FMAD6 c09, alpha_i, b4, b4
+ FMAD6 c03, alpha_i, a6, a6
+ FMAD6 c11, alpha_i, a8, a8
+
+ ST b1, 0 * SIZE(C1)
+ ST b2, 1 * SIZE(C1)
+ ST b3, 2 * SIZE(C1)
+ ST b4, 3 * SIZE(C1)
+
+ ST a5, 0 * SIZE(C2)
+ ST a6, 1 * SIZE(C2)
+ ST a7, 2 * SIZE(C2)
+ ST a8, 3 * SIZE(C2)
+
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ subl K, KK, TEMP
+#ifdef LEFT
+ subl TEMP, 2, TEMP
+#else
+ subl TEMP, 2, TEMP
+#endif
+
+ sll TEMP, 1 + ZBASE_SHIFT, L
+ sll TEMP, 1 + ZBASE_SHIFT, TEMP
+
+ addl AO, L, AO
+ addl BO, TEMP, BO
+#endif
+
+#ifdef LEFT
+ addl KK, 2, KK
+#endif
+#endif
+
+ addl C1, 4*SIZE, C1
+ addl C2, 4*SIZE, C2
+
+
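The scalar MR=2 x NR=2 loop above ($L32) never forms a complex product directly: every a*b pair is kept as four separate partial products (the ac/ad/bc/bd naming in the register comments), and the sign-dependent combination is deferred to the ADD1/ADD2 macros in the write-back. A small C illustration of what one K step contributes for a single a, b pair, with all names invented here for the sketch:

    /* One K step of the partial-product accumulation (sketch only). */
    static void zmicro_step(const double a[2], const double b[2], double acc[4])
    {
        acc[0] += a[0] * b[0];   /* ac: real*real  (the c01-style accumulator) */
        acc[1] += a[0] * b[1];   /* ad: real*imag  (c02) */
        acc[2] += a[1] * b[0];   /* bc: imag*real  (c05) */
        acc[3] += a[1] * b[1];   /* bd: imag*imag  (c06) */
    }

The write-back then combines ac with bd for the real part and bc with ad for the imaginary part, with the signs supplied by ADD1/ADD2 according to the conjugation variant.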
+ .align 4
+$L40:
+ and M, 1, I # I=M&1
+ ble I, $L09
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov B, BO
+ nop
+#else
+ sll KK, ZBASE_SHIFT, L # mr=1
+ sll KK, 1 + ZBASE_SHIFT,TEMP # nr=2
+
+ addl AO, L, AO
+ addl B, TEMP, BO
+#endif
+
+ fillcs 0*SIZE(C1)
+ fillcs 0*SIZE(C2)
+
+ fclr c01
+ fclr c02
+ fclr c03
+ fclr c04
+ fclr c05
+ fclr c06
+ fclr c07
+ fclr c08
+
+ LD b1, 0*SIZE(BO) # b1 real part
+ LD b2, 1*SIZE(BO) # b1 image part
+ LD b3, 2*SIZE(BO) # b2 real part
+ LD b4, 3*SIZE(BO) # b2 image part
+
+ LD a1, 0*SIZE(AO) # a1 real part
+ LD a2, 1*SIZE(AO) # a1 image part
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ subl K, KK, TEMP
+#elif defined(LEFT)
+ addl KK, 1, TEMP # mr=1
+#else
+ addl KK, 2, TEMP # nr=2
+#endif
+ sra TEMP, 1, L
+
+ ble L, $L45
+
+#else
+ mov B, BO # Set B, (block A x panel Bj)
+ sra K, 1, L # Unroll K as 2
+
+ fillcs 0*SIZE(C1)
+ fillcs 0*SIZE(C2)
+
+ fclr c01
+ fclr c02
+ fclr c03
+ fclr c04
+ fclr c05
+ fclr c06
+ fclr c07
+ fclr c08
+
+ LD b1, 0*SIZE(BO) # b1 real part
+ LD b2, 1*SIZE(BO) # b1 image part
+ LD b3, 2*SIZE(BO) # b2 real part
+ LD b4, 3*SIZE(BO) # b2 image part
+
+ LD a1, 0*SIZE(AO) # a1 real part
+ LD a2, 1*SIZE(AO) # a1 image part
+
+ ble L, $L45
+#endif
+
+ .align 4
+$L42:
+ MAD a1,b1,c01,c01 # C11 real part
+ MAD a1,b2,c02,c02 # C11 imag part
+ MAD a1,b3,c03,c03 # C21 real part
+ MAD a1,b4,c04,c04 # C21 imag part
+
+ LD b5, 4 * SIZE(BO) # next B1R
+ LD b6, 5 * SIZE(BO) # next B1I
+ LD b7, 6 * SIZE(BO) # next B2R
+ LD b8, 7 * SIZE(BO) # next B2I
+
+ LD a5, 2 * SIZE(AO) # next A1-A4 real part
+ LD a6, 3 * SIZE(AO) # next A1-A4 image part
+
+ addl BO, 8*SIZE, BO # BO+=2nr*2kr*2cpx*SIZE
+ MAD a2,b1,c05,c05 # C11 image part
+ MAD a2,b2,c06,c06 # C11 real part
+ MAD a2,b3,c07,c07 # C21 image part
+ MAD a2,b4,c08,c08 # C21 real part
+
+ subl L, 1, L #
+
+ addl AO, 4*SIZE, AO # AO+=1mr*2kr*2px*SIZE
+ MAD a5,b5,c01,c01
+ MAD a5,b6,c02,c02
+ MAD a5,b7,c03,c03
+ MAD a5,b8,c04,c04
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+ LD b3, 2 * SIZE(BO)
+ LD b4, 3 * SIZE(BO)
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ MAD a6,b5,c05,c05
+ MAD a6,b6,c06,c06
+ MAD a6,b7,c07,c07
+ MAD a6,b8,c08,c08
+
+ bne L, $L42 # continue K
+
+$L45:
+ LD alpha_r, ALPHA_R # $f30==b8
+#ifndef TRMMKERNEL
+ blbc K, $L48 # if(K&1)
+#else
+ blbc TEMP, $L48
+#endif
+
+$L46:
+	addl	AO, 2*SIZE, AO		# AO+=1mr*1kr*2px*SIZE
+ MAD a1,b1,c01,c01 # C11 real part
+ MAD a1,b2,c02,c02 # C11 imag part
+ MAD a1,b3,c03,c03 # C21 real part
+ MAD a1,b4,c04,c04 # C21 imag part
+
+ addl BO, 4*SIZE, BO
+ MAD a2,b1,c05,c05 # C11 image part
+ MAD a2,b2,c06,c06 # C11 real part
+ MAD a2,b3,c07,c07 # C21 image part
+ MAD a2,b4,c08,c08 # C21 real part
+
+
+$L48: # Write back
+ LD alpha_i, ALPHA_I # $f29==b7
+#ifndef TRMMKERNEL
+ ADD1 c01, c06, c01
+ ADD1 c03, c08, c03
+ ADD2 c05, c02, c02
+ ADD2 c07, c04, c04
+
+ LD b1, 0 * SIZE(C1)
+ LD b2, 1 * SIZE(C1)
+
+ LD a5, 0 * SIZE(C2)
+ LD a6, 1 * SIZE(C2)
+
+ FMAD5 c01, alpha_r, b1, b1
+ FMAD5 c03, alpha_r, a5, a5
+
+ FMAD7 c02, alpha_r, b2, b2
+ FMAD7 c04, alpha_r, a6, a6
+
+ FMAD8 c02, alpha_i, b1, b1
+ FMAD8 c04, alpha_i, a5, a5
+
+ FMAD6 c01, alpha_i, b2, b2
+ FMAD6 c03, alpha_i, a6, a6
+
+ ST b1, 0 * SIZE(C1)
+ ST b2, 1 * SIZE(C1)
+
+ ST a5, 0 * SIZE(C2)
+ ST a6, 1 * SIZE(C2)
+
+#else
+
+ ADD1 c01, c06, c01
+ ADD1 c03, c08, c03
+ ADD2 c05, c02, c02
+ ADD2 c07, c04, c04
+
+ FMAD5 c01, alpha_r, $f31, b1
+ FMAD5 c03, alpha_r, $f31, a5
+
+ FMAD7 c02, alpha_r, $f31, b2
+ FMAD7 c04, alpha_r, $f31, a6
+
+ FMAD8 c02, alpha_i, b1, b1
+ FMAD8 c04, alpha_i, a5, a5
+
+ FMAD6 c01, alpha_i, b2, b2
+ FMAD6 c03, alpha_i, a6, a6
+
+ ST b1, 0 * SIZE(C1)
+ ST b2, 1 * SIZE(C1)
+
+ ST a5, 0 * SIZE(C2)
+ ST a6, 1 * SIZE(C2)
+
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ subl K, KK, TEMP
+#ifdef LEFT
+ subl TEMP, 1, TEMP
+#else
+ subl TEMP, 2, TEMP
+#endif
+
+ sll TEMP, ZBASE_SHIFT, L
+ sll TEMP, 1 + ZBASE_SHIFT, TEMP
+
+ addl AO, L, AO
+ addl BO, TEMP,BO
+#endif
+
+#ifdef LEFT
+ addl KK, 1, KK
+#endif
+#endif
+
+ addl C1, 2*SIZE, C1
+ addl C2, 2*SIZE, C2
+
+
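Every tile size in this kernel drives K the same way: sra K, 1, L runs the body unrolled by two, and blbc K (blbc TEMP in the TRMM build) falls through to a single tail step when the count is odd. Shown as a plain scalar dot product rather than the actual complex kernel, the shape is roughly:

    /* Illustration of the unroll-by-two K loop plus odd-K tail. */
    static double dot_unroll2(const double *a, const double *b, long k)
    {
        double acc = 0.0;
        long l = k >> 1;                  /* sra  K, 1, L */
        for (; l > 0; l--) {              /* bne  L, $L42 */
            acc += a[0] * b[0];
            acc += a[1] * b[1];
            a += 2;
            b += 2;
        }
        if (k & 1)                        /* blbc K skips the tail when K is even */
            acc += a[0] * b[0];
        return acc;
    }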
+ .align 4
+
+$L09:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addl KK, 2, KK # nr=2
+ nop
+#endif
+ mov BO, B # Change B to next panel
+ subl J, 1, J # J--
+ bgt J, $L01
+
+
+ .align 4
+$L50:
+ and N, 1, J
+ ble J, $L999 # Finish!
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ mov OFFSET, KK # reset KK
+#endif
+
+ sra M, 3, I # I=M/8
+ sll K, 1 + ZBASE_SHIFT, PREA
+
+ mov C, C1
+ mov A, AO # Reset A
+
+ addl A, PREA, PREA
+ beq I, $L60 # GEMM_MR=8
+
+
+$L51:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA))\
+ || (!defined(LEFT) && !defined(TRANSA))
+ mov B, BO
+#else
+ sll KK, 3 + ZBASE_SHIFT,L # mr=8
+ sll KK, ZBASE_SHIFT,TEMP # nr=1
+
+ addl AO, L, AO
+ addl B, TEMP, BO
+#endif
+
+ fillcs 0(C1)
+ fillcs 4*SIZE(C1)
+ fillcs 8*SIZE(C1)
+ fillcs 12*SIZE(C1)
+ fillcs 16*SIZE(C1)
+
+ vcpys $f31,$f31,c01 # Clear result regs
+ vcpys $f31,$f31,c02
+
+ vcpys $f31,$f31,c05
+ vcpys $f31,$f31,c06
+
+ vcpys $f31,$f31,c09
+ vcpys $f31,$f31,c10
+
+ vcpys $f31,$f31,c13
+ vcpys $f31,$f31,c14
+
+ LDDE b1, 0 * SIZE(BO) # B1R
+ LDDE b2, 1 * SIZE(BO) # B1I
+
+ VLD a1, 0 * SIZE(AO) # A1, A2
+ VLD a2, 4 * SIZE(AO) # A3, A4
+ VLD a3, 8 * SIZE(AO) # A5, A6
+ VLD a4,12 * SIZE(AO) # A7, A8
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ subl K, KK, TEMP
+#elif defined(LEFT)
+ addl KK, 8, TEMP # mr=8
+#else
+ addl KK, 1, TEMP # nr=1
+#endif
+ sra TEMP, 1, L
+ ble L, $L55
+
+#else
+ mov B, BO # Set B, (block A x panel Bj)
+ sra K, 1, L # Unroll K as 2
+
+ fillcs 0(C1)
+ fillcs 4*SIZE(C1)
+ fillcs 8*SIZE(C1)
+ fillcs 12*SIZE(C1)
+ fillcs 16*SIZE(C1)
+
+ vcpys $f31,$f31,c01 # Clear result regs
+ vcpys $f31,$f31,c02
+
+ vcpys $f31,$f31,c05
+ vcpys $f31,$f31,c06
+
+ vcpys $f31,$f31,c09
+ vcpys $f31,$f31,c10
+
+ vcpys $f31,$f31,c13
+ vcpys $f31,$f31,c14
+
+ LDDE b1, 0 * SIZE(BO) # B1R
+ LDDE b2, 1 * SIZE(BO) # B1I
+
+ VLD a1, 0 * SIZE(AO) # A1, A2
+ VLD a2, 4 * SIZE(AO) # A3, A4
+ VLD a3, 8 * SIZE(AO) # A5, A6
+ VLD a4,12 * SIZE(AO) # A7, A8
+
+ ble L, $L55
+#endif
+
+ .align 4
+$L52:
+ addl AO, 16*SIZE, AO # AO+=8mr*1kr*2px*SIZE
+ VMAD a1,b1,c01,c01 # C11(ac,bc), C21(ac,bc)
+ VMAD a1,b2,c02,c02 # C11(ad,bd), C21(ad,bd)
+
+ LDDE b5, 2 * SIZE(BO) # next B1R
+ LDDE b6, 3 * SIZE(BO) # next B1I
+
+ addl BO, 4*SIZE, BO # BO+=1nr*2kr*2cpx*SIZE
+ VMAD a2,b1,c05,c05 # C31, C41
+ VMAD a2,b2,c06,c06 # C31, C41
+
+ VLD a5, 0 * SIZE(AO) # next A1, A2, a5==a0
+ VLD a6, 4 * SIZE(AO) # next A3, A4
+ VLD a7, 8 * SIZE(AO) # next A5, A6
+ VLD a8,12 * SIZE(AO) # next A7, A8
+
+ VMAD a3,b1,c09,c09 # C51, C61
+ VMAD a3,b2,c10,c10 # C51, C61
+
+ fillcs 0(PREA)
+ VMAD a4,b1,c13,c13 # C71, C81
+ VMAD a4,b2,c14,c14 # C71, C81
+
+ subl L, 1, L #
+
+ addl AO, 16*SIZE, AO # AO+=8mr*1kr*2px*SIZE
+ VMAD a5,b5,c01,c01
+ VMAD a5,b6,c02,c02
+
+ addl PREA, 16*SIZE, PREA
+ LDDE b1, 0 * SIZE(BO)
+ LDDE b2, 1 * SIZE(BO)
+
+ VMAD a6,b5,c05,c05
+ VMAD a6,b6,c06,c06
+
+ VLD a1, 0 * SIZE(AO)
+ VLD a2, 4 * SIZE(AO)
+ VLD a3, 8 * SIZE(AO)
+ VLD a4,12 * SIZE(AO)
+
+ VMAD a7,b5,c09,c09
+ VMAD a7,b6,c10,c10
+
+ fillcs 0(PREA)
+ VMAD a8,b5,c13,c13
+ VMAD a8,b6,c14,c14
+
+ addl PREA, 16*SIZE, PREA
+ bne L, $L52 # continue K
+
+$L55:
+ LD alpha_r, ALPHA_R # $f30==b8
+#ifndef TRMMKERNEL
+ blbc K, $L58 # if(K&1)
+#else
+ blbc TEMP, $L58
+#endif
+
+$L56:
+ addl AO, 16*SIZE, AO # AO+=8mr*1kr*2px*SIZE
+ VMAD a1,b1,c01,c01 # C11R C21R
+ VMAD a1,b2,c02,c02 # C11I C21I
+
+ addl BO, 2*SIZE, BO
+ VMAD a2,b1,c05,c05 # C31R C41R
+ VMAD a2,b2,c06,c06 # C31I C41I
+
+ VMAD a3,b1,c09,c09 # C51R C61R
+ VMAD a3,b2,c10,c10 # C51I C61I
+
+ VMAD a4,b1,c13,c13 # C71R C81R
+ VMAD a4,b2,c14,c14 # C71I C81I
+
+$L58: # Write back
+ LD alpha_i, ALPHA_I # $f29==b7
+#ifndef TRMMKERNEL
+ vextf c01, 0, a1 # a1=C11R_ac
+ vextf c01, 1, a2 # a2=C11I_bc
+ vextf c01, 2, a3 # a3=C21R_ac
+ vextf c01, 3, a4 # a4=C21I_bc
+
+ vextf c02, 0, b1 # b1=C11I_ad
+ vextf c02, 1, b2 # b2=C11R_bd
+ vextf c02, 2, b3 # b3=C21I_ad
+ vextf c02, 3, b4 # b4=C21R_bd
+
+ ADD1 a1, b2, b5 # ac '+' bd
+ ADD1 a3, b4, a6
+ ADD2 a2, b1, a7 # bc '+' ad
+ ADD2 a4, b3, a8
+
+ LD a1, 0 * SIZE(C1)
+ LD a2, 1 * SIZE(C1)
+ LD a3, 2 * SIZE(C1)
+ LD a4, 3 * SIZE(C1)
+
+	FMAD5	b5, alpha_r, a1, a1	# a1=b5*alpha_r+a1
+ FMAD5 a6, alpha_r, a3, a3
+ FMAD7 a7, alpha_r, a2, a2
+ FMAD7 a8, alpha_r, a4, a4
+
+ FMAD8 a7, alpha_i, a1, b4
+ FMAD8 a8, alpha_i, a3, b6
+ FMAD6 b5, alpha_i, a2, c01
+ FMAD6 a6, alpha_i, a4, c02
+
+ ST b4, 0 * SIZE(C1)
+ ST c01, 1 * SIZE(C1)
+ ST b6, 2 * SIZE(C1)
+ ST c02, 3 * SIZE(C1)
+
+ vextf c05, 0, a1 # a1=C11R_ac
+ vextf c05, 1, a2 # a2=C11I_bc
+ vextf c05, 2, a3 # a3=C21R_ac
+ vextf c05, 3, a4 # a4=C21I_bc
+
+ vextf c06, 0, b1 # b1=C11I_ad
+ vextf c06, 1, b2 # b2=C11R_bd
+ vextf c06, 2, b3 # b3=C21I_ad
+ vextf c06, 3, b4 # b4=C21R_bd
+
+ ADD1 a1, b2, b5 # ac '+' bd
+ ADD1 a3, b4, a6
+ ADD2 a2, b1, a7 # bc '+' ad
+ ADD2 a4, b3, a8
+
+ LD a1, 4 * SIZE(C1)
+ LD a2, 5 * SIZE(C1)
+ LD a3, 6 * SIZE(C1)
+ LD a4, 7 * SIZE(C1)
+
+	FMAD5	b5, alpha_r, a1, a1	# a1=b5*alpha_r+a1
+ FMAD5 a6, alpha_r, a3, a3
+ FMAD7 a7, alpha_r, a2, a2
+ FMAD7 a8, alpha_r, a4, a4
+
+ FMAD8 a7, alpha_i, a1, b4
+ FMAD8 a8, alpha_i, a3, b6
+ FMAD6 b5, alpha_i, a2, c01
+ FMAD6 a6, alpha_i, a4, c02
+
+ ST b4, 4 * SIZE(C1)
+ ST c01, 5 * SIZE(C1)
+ ST b6, 6 * SIZE(C1)
+ ST c02, 7 * SIZE(C1)
+
+ vextf c09, 0, a1 # a1=C11R_ac
+ vextf c09, 1, a2 # a2=C11I_bc
+ vextf c09, 2, a3 # a3=C21R_ac
+ vextf c09, 3, a4 # a4=C21I_bc
+
+ vextf c10, 0, b1 # b1=C11I_ad
+ vextf c10, 1, b2 # b2=C11R_bd
+ vextf c10, 2, b3 # b3=C21I_ad
+ vextf c10, 3, b4 # b4=C21R_bd
+
+ ADD1 a1, b2, b5 # ac '+' bd
+ ADD1 a3, b4, a6
+ ADD2 a2, b1, a7 # bc '+' ad
+ ADD2 a4, b3, a8
+
+ LD a1, 8 * SIZE(C1)
+ LD a2, 9 * SIZE(C1)
+ LD a3, 10 * SIZE(C1)
+ LD a4, 11 * SIZE(C1)
+
+	FMAD5	b5, alpha_r, a1, a1	# a1=b5*alpha_r+a1
+ FMAD5 a6, alpha_r, a3, a3
+ FMAD7 a7, alpha_r, a2, a2
+ FMAD7 a8, alpha_r, a4, a4
+
+ FMAD8 a7, alpha_i, a1, b4
+ FMAD8 a8, alpha_i, a3, b6
+ FMAD6 b5, alpha_i, a2, c01
+ FMAD6 a6, alpha_i, a4, c02
+
+ ST b4, 8 * SIZE(C1)
+ ST c01, 9 * SIZE(C1)
+ ST b6, 10 * SIZE(C1)
+ ST c02, 11 * SIZE(C1)
+
+ vextf c13, 0, a1 # a1=C11R_ac
+ vextf c13, 1, a2 # a2=C11I_bc
+ vextf c13, 2, a3 # a3=C21R_ac
+ vextf c13, 3, a4 # a4=C21I_bc
+
+ vextf c14, 0, b1 # b1=C11I_ad
+ vextf c14, 1, b2 # b2=C11R_bd
+ vextf c14, 2, b3 # b3=C21I_ad
+ vextf c14, 3, b4 # b4=C21R_bd
+
+ ADD1 a1, b2, b5 # ac '+' bd
+ ADD1 a3, b4, a6
+ ADD2 a2, b1, a7 # bc '+' ad
+ ADD2 a4, b3, a8
+
+ LD a1, 12 * SIZE(C1)
+ LD a2, 13 * SIZE(C1)
+ LD a3, 14 * SIZE(C1)
+ LD a4, 15 * SIZE(C1)
+
+	FMAD5	b5, alpha_r, a1, a1	# a1=b5*alpha_r+a1
+ FMAD5 a6, alpha_r, a3, a3
+ FMAD7 a7, alpha_r, a2, a2
+ FMAD7 a8, alpha_r, a4, a4
+
+ FMAD8 a7, alpha_i, a1, b4
+ FMAD8 a8, alpha_i, a3, b6
+ FMAD6 b5, alpha_i, a2, c01
+ FMAD6 a6, alpha_i, a4, c02
+
+ ST b4, 12 * SIZE(C1)
+ ST c01, 13 * SIZE(C1)
+ ST b6, 14 * SIZE(C1)
+ ST c02, 15 * SIZE(C1)
+
+#else
+
+ vextf c01, 0, a1 # a1=C11R_ac
+ vextf c01, 1, a2 # a2=C11I_bc
+ vextf c01, 2, a3 # a3=C21R_ac
+ vextf c01, 3, a4 # a4=C21I_bc
+
+ vextf c02, 0, b1 # b1=C11I_ad
+ vextf c02, 1, b2 # b2=C11R_bd
+ vextf c02, 2, b3 # b3=C21I_ad
+ vextf c02, 3, b4 # b4=C21R_bd
+
+ ADD1 a1, b2, b5 # ac '+' bd
+ ADD1 a3, b4, a6
+ ADD2 a2, b1, a7 # bc '+' ad
+ ADD2 a4, b3, a8
+
+	FMAD5	b5, alpha_r, $f31, a1	# a1=b5*alpha_r
+ FMAD5 a6, alpha_r, $f31, a3
+ FMAD7 a7, alpha_r, $f31, a2
+ FMAD7 a8, alpha_r, $f31, a4
+
+ FMAD8 a7, alpha_i, a1, b4
+ FMAD8 a8, alpha_i, a3, b6
+ FMAD6 b5, alpha_i, a2, c01
+ FMAD6 a6, alpha_i, a4, c02
+
+ ST b4, 0 * SIZE(C1)
+ ST c01, 1 * SIZE(C1)
+ ST b6, 2 * SIZE(C1)
+ ST c02, 3 * SIZE(C1)
+
+ vextf c05, 0, a1 # a1=C11R_ac
+ vextf c05, 1, a2 # a2=C11I_bc
+ vextf c05, 2, a3 # a3=C21R_ac
+ vextf c05, 3, a4 # a4=C21I_bc
+
+ vextf c06, 0, b1 # b1=C11I_ad
+ vextf c06, 1, b2 # b2=C11R_bd
+ vextf c06, 2, b3 # b3=C21I_ad
+ vextf c06, 3, b4 # b4=C21R_bd
+
+ ADD1 a1, b2, b5 # ac '+' bd
+ ADD1 a3, b4, a6
+ ADD2 a2, b1, a7 # bc '+' ad
+ ADD2 a4, b3, a8
+
+	FMAD5	b5, alpha_r, $f31, a1	# a1=b5*alpha_r
+ FMAD5 a6, alpha_r, $f31, a3
+ FMAD7 a7, alpha_r, $f31, a2
+ FMAD7 a8, alpha_r, $f31, a4
+
+ FMAD8 a7, alpha_i, a1, b4
+ FMAD8 a8, alpha_i, a3, b6
+ FMAD6 b5, alpha_i, a2, c01
+ FMAD6 a6, alpha_i, a4, c02
+
+ ST b4, 4 * SIZE(C1)
+ ST c01, 5 * SIZE(C1)
+ ST b6, 6 * SIZE(C1)
+ ST c02, 7 * SIZE(C1)
+
+ vextf c09, 0, a1 # a1=C11R_ac
+ vextf c09, 1, a2 # a2=C11I_bc
+ vextf c09, 2, a3 # a3=C21R_ac
+ vextf c09, 3, a4 # a4=C21I_bc
+
+ vextf c10, 0, b1 # b1=C11I_ad
+ vextf c10, 1, b2 # b2=C11R_bd
+ vextf c10, 2, b3 # b3=C21I_ad
+ vextf c10, 3, b4 # b4=C21R_bd
+
+ ADD1 a1, b2, b5 # ac '+' bd
+ ADD1 a3, b4, a6
+ ADD2 a2, b1, a7 # bc '+' ad
+ ADD2 a4, b3, a8
+
+	FMAD5	b5, alpha_r, $f31, a1	# a1=b5*alpha_r
+ FMAD5 a6, alpha_r, $f31, a3
+ FMAD7 a7, alpha_r, $f31, a2
+ FMAD7 a8, alpha_r, $f31, a4
+
+ FMAD8 a7, alpha_i, a1, b4
+ FMAD8 a8, alpha_i, a3, b6
+ FMAD6 b5, alpha_i, a2, c01
+ FMAD6 a6, alpha_i, a4, c02
+
+ ST b4, 8 * SIZE(C1)
+ ST c01, 9 * SIZE(C1)
+ ST b6, 10 * SIZE(C1)
+ ST c02, 11 * SIZE(C1)
+
+ vextf c13, 0, a1 # a1=C11R_ac
+ vextf c13, 1, a2 # a2=C11I_bc
+ vextf c13, 2, a3 # a3=C21R_ac
+ vextf c13, 3, a4 # a4=C21I_bc
+
+ vextf c14, 0, b1 # b1=C11I_ad
+ vextf c14, 1, b2 # b2=C11R_bd
+ vextf c14, 2, b3 # b3=C21I_ad
+ vextf c14, 3, b4 # b4=C21R_bd
+
+ ADD1 a1, b2, b5 # ac '+' bd
+ ADD1 a3, b4, a6
+ ADD2 a2, b1, a7 # bc '+' ad
+ ADD2 a4, b3, a8
+
+	FMAD5	b5, alpha_r, $f31, a1	# a1=b5*alpha_r
+ FMAD5 a6, alpha_r, $f31, a3
+ FMAD7 a7, alpha_r, $f31, a2
+ FMAD7 a8, alpha_r, $f31, a4
+
+ FMAD8 a7, alpha_i, a1, b4
+ FMAD8 a8, alpha_i, a3, b6
+ FMAD6 b5, alpha_i, a2, c01
+ FMAD6 a6, alpha_i, a4, c02
+
+ ST b4, 12 * SIZE(C1)
+ ST c01, 13 * SIZE(C1)
+ ST b6, 14 * SIZE(C1)
+ ST c02, 15 * SIZE(C1)
+
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ subl K, KK, TEMP
+#ifdef LEFT
+ subl TEMP, 8, TEMP
+#else
+ subl TEMP, 1, TEMP
+#endif
+
+ sll TEMP, 3 + ZBASE_SHIFT,L
+ sll TEMP, ZBASE_SHIFT,TEMP
+
+ addl AO, L, AO
+ addl BO, TEMP, BO
+#endif
+
+#ifdef LEFT
+ addl KK, 8, KK
+#endif
+#endif
+
+ jmp $L999
+
+
+ .align 4
+$L60:
+ and M, 4, I
+ ble I, $L70
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA))\
+ || (!defined(LEFT) && !defined(TRANSA))
+ mov B, BO
+#else
+ sll KK, 2 + ZBASE_SHIFT,L # mr=4
+ sll KK, ZBASE_SHIFT,TEMP # nr=1
+
+ addl AO, L, AO
+ addl B, TEMP, BO
+#endif
+
+ fillcs 0(C1)
+ fillcs 4*SIZE(C1)
+ fillcs 8*SIZE(C1)
+
+ vcpys $f31,$f31,c01 # Clear result regs
+ vcpys $f31,$f31,c02
+
+ vcpys $f31,$f31,c05
+ vcpys $f31,$f31,c06
+
+ LDDE b1, 0 * SIZE(BO) # B1R
+ LDDE b2, 1 * SIZE(BO) # B1I
+
+ VLD a1, 0 * SIZE(AO) # A1, A2
+ VLD a2, 4 * SIZE(AO) # A3, A4
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ subl K, KK, TEMP
+#elif defined(LEFT)
+ addl KK, 4, TEMP # mr=4
+#else
+ addl KK, 1, TEMP # nr=1
+#endif
+ sra TEMP, 1, L
+ ble L, $L65
+
+#else
+
+ mov B, BO # Set B, (block A x panel Bj)
+ sra K, 1, L # Unroll K as 2
+
+ fillcs 0(C1)
+ fillcs 4*SIZE(C1)
+ fillcs 8*SIZE(C1)
+
+ vcpys $f31,$f31,c01 # Clear result regs
+ vcpys $f31,$f31,c02
+
+ vcpys $f31,$f31,c05
+ vcpys $f31,$f31,c06
+
+ LDDE b1, 0 * SIZE(BO) # B1R
+ LDDE b2, 1 * SIZE(BO) # B1I
+
+ VLD a1, 0 * SIZE(AO) # A1, A2
+ VLD a2, 4 * SIZE(AO) # A3, A4
+
+ ble L, $L65
+#endif
+
+ .align 4
+$L62:
+ VMAD a1,b1,c01,c01 # C11(ac,bc), C21(ac,bc)
+ VMAD a1,b2,c02,c02 # C11(ad,bd), C21(ad,bd)
+
+ LDDE b5, 2 * SIZE(BO) # next B1R
+ LDDE b6, 3 * SIZE(BO) # next B1I
+
+	addl	BO, 4*SIZE, BO		# BO+=1nr*2kr*2cpx*SIZE
+ VMAD a2,b1,c05,c05 # C31, C41
+ VMAD a2,b2,c06,c06 # C31, C41
+
+ fillcs 0(PREA)
+ VLD a5, 8 * SIZE(AO) # next A1, A2, a5==a0
+ VLD a6, 12 * SIZE(AO) # next A3, A4
+
+ subl L, 1, L #
+
+ addl AO, 16*SIZE, AO # AO+=4mr*2kr*2px*SIZE
+ VMAD a5,b5,c01,c01
+ VMAD a5,b6,c02,c02
+
+ addl PREA, 16*SIZE, PREA
+ LDDE b1, 0 * SIZE(BO)
+ LDDE b2, 1 * SIZE(BO)
+
+ fillcs 0(PREA)
+ VMAD a6,b5,c05,c05
+ VMAD a6,b6,c06,c06
+
+ VLD a1, 0 * SIZE(AO)
+ VLD a2, 4 * SIZE(AO)
+
+ addl PREA, 16*SIZE, PREA
+ bne L, $L62 # continue K
+
+$L65:
+ LD alpha_r, ALPHA_R # $f30==b8
+#ifndef TRMMKERNEL
+ blbc K, $L68 # if(K&1)
+#else
+ blbc TEMP, $L68
+#endif
+
+$L66:
+ addl AO, 8*SIZE, AO # AO+=4mr*1kr*2px*SIZE
+ VMAD a1,b1,c01,c01 # C11R C21R
+ VMAD a1,b2,c02,c02 # C11I C21I
+
+ addl BO, 2*SIZE, BO
+ VMAD a2,b1,c05,c05 # C31R C41R
+ VMAD a2,b2,c06,c06 # C31I C41I
+
+$L68: # Write back
+ LD alpha_i, ALPHA_I # $f29==b7
+#ifndef TRMMKERNEL
+ vextf c01, 0, a1 # a1=C11R_ac
+ vextf c01, 1, a2 # a2=C11I_bc
+ vextf c01, 2, a3 # a3=C21R_ac
+ vextf c01, 3, a4 # a4=C21I_bc
+
+ vextf c02, 0, b1 # b1=C11I_ad
+ vextf c02, 1, b2 # b2=C11R_bd
+ vextf c02, 2, b3 # b3=C21I_ad
+ vextf c02, 3, b4 # b4=C21R_bd
+
+ ADD1 a1, b2, b5 # ac '+' bd
+ ADD1 a3, b4, a6
+ ADD2 a2, b1, a7 # bc '+' ad
+ ADD2 a4, b3, a8
+
+ LD a1, 0 * SIZE(C1)
+ LD a2, 1 * SIZE(C1)
+ LD a3, 2 * SIZE(C1)
+ LD a4, 3 * SIZE(C1)
+
+	FMAD5	b5, alpha_r, a1, a1	# a1=b5*alpha_r+a1
+ FMAD5 a6, alpha_r, a3, a3
+ FMAD7 a7, alpha_r, a2, a2
+ FMAD7 a8, alpha_r, a4, a4
+
+ FMAD8 a7, alpha_i, a1, b4
+ FMAD8 a8, alpha_i, a3, b6
+ FMAD6 b5, alpha_i, a2, c01
+ FMAD6 a6, alpha_i, a4, c02
+
+ ST b4, 0 * SIZE(C1)
+ ST c01, 1 * SIZE(C1)
+ ST b6, 2 * SIZE(C1)
+ ST c02, 3 * SIZE(C1)
+
+ vextf c05, 0, a1 # a1=C11R_ac
+ vextf c05, 1, a2 # a2=C11I_bc
+ vextf c05, 2, a3 # a3=C21R_ac
+ vextf c05, 3, a4 # a4=C21I_bc
+
+ vextf c06, 0, b1 # b1=C11I_ad
+ vextf c06, 1, b2 # b2=C11R_bd
+ vextf c06, 2, b3 # b3=C21I_ad
+ vextf c06, 3, b4 # b4=C21R_bd
+
+ ADD1 a1, b2, b5 # ac '+' bd
+ ADD1 a3, b4, a6
+ ADD2 a2, b1, a7 # bc '+' ad
+ ADD2 a4, b3, a8
+
+ LD a1, 4 * SIZE(C1)
+ LD a2, 5 * SIZE(C1)
+ LD a3, 6 * SIZE(C1)
+ LD a4, 7 * SIZE(C1)
+
+	FMAD5	b5, alpha_r, a1, a1	# a1=b5*alpha_r+a1
+ FMAD5 a6, alpha_r, a3, a3
+ FMAD7 a7, alpha_r, a2, a2
+ FMAD7 a8, alpha_r, a4, a4
+
+ FMAD8 a7, alpha_i, a1, b4
+ FMAD8 a8, alpha_i, a3, b6
+ FMAD6 b5, alpha_i, a2, c01
+ FMAD6 a6, alpha_i, a4, c02
+
+ ST b4, 4 * SIZE(C1)
+ ST c01, 5 * SIZE(C1)
+ ST b6, 6 * SIZE(C1)
+ ST c02, 7 * SIZE(C1)
+
+#else
+
+ vextf c01, 0, a1 # a1=C11R_ac
+ vextf c01, 1, a2 # a2=C11I_bc
+ vextf c01, 2, a3 # a3=C21R_ac
+ vextf c01, 3, a4 # a4=C21I_bc
+
+ vextf c02, 0, b1 # b1=C11I_ad
+ vextf c02, 1, b2 # b2=C11R_bd
+ vextf c02, 2, b3 # b3=C21I_ad
+ vextf c02, 3, b4 # b4=C21R_bd
+
+ ADD1 a1, b2, b5 # ac '+' bd
+ ADD1 a3, b4, a6
+ ADD2 a2, b1, a7 # bc '+' ad
+ ADD2 a4, b3, a8
+
+	FMAD5	b5, alpha_r, $f31, a1	# a1=b5*alpha_r
+ FMAD5 a6, alpha_r, $f31, a3
+ FMAD7 a7, alpha_r, $f31, a2
+ FMAD7 a8, alpha_r, $f31, a4
+
+ FMAD8 a7, alpha_i, a1, b4
+ FMAD8 a8, alpha_i, a3, b6
+ FMAD6 b5, alpha_i, a2, c01
+ FMAD6 a6, alpha_i, a4, c02
+
+ ST b4, 0 * SIZE(C1)
+ ST c01, 1 * SIZE(C1)
+ ST b6, 2 * SIZE(C1)
+ ST c02, 3 * SIZE(C1)
+
+ vextf c05, 0, a1 # a1=C11R_ac
+ vextf c05, 1, a2 # a2=C11I_bc
+ vextf c05, 2, a3 # a3=C21R_ac
+ vextf c05, 3, a4 # a4=C21I_bc
+
+ vextf c06, 0, b1 # b1=C11I_ad
+ vextf c06, 1, b2 # b2=C11R_bd
+ vextf c06, 2, b3 # b3=C21I_ad
+ vextf c06, 3, b4 # b4=C21R_bd
+
+ ADD1 a1, b2, b5 # ac '+' bd
+ ADD1 a3, b4, a6
+ ADD2 a2, b1, a7 # bc '+' ad
+ ADD2 a4, b3, a8
+
+	FMAD5	b5, alpha_r, $f31, a1	# a1=b5*alpha_r
+ FMAD5 a6, alpha_r, $f31, a3
+ FMAD7 a7, alpha_r, $f31, a2
+ FMAD7 a8, alpha_r, $f31, a4
+
+ FMAD8 a7, alpha_i, a1, b4
+ FMAD8 a8, alpha_i, a3, b6
+ FMAD6 b5, alpha_i, a2, c01
+ FMAD6 a6, alpha_i, a4, c02
+
+ ST b4, 4 * SIZE(C1)
+ ST c01, 5 * SIZE(C1)
+ ST b6, 6 * SIZE(C1)
+ ST c02, 7 * SIZE(C1)
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ subl K, KK,TEMP
+#ifdef LEFT
+ subl TEMP, 4, TEMP # mr=4
+#else
+ subl TEMP, 1, TEMP # nr=1
+#endif
+
+ sll TEMP, 2 + ZBASE_SHIFT, L
+ sll TEMP, ZBASE_SHIFT,TEMP
+
+ addl AO, L, AO
+ addl BO,TEMP, BO
+#endif
+
+#ifdef LEFT
+ addl KK,4,KK
+#endif
+#endif
+
+ addl C1, 8*SIZE, C1
+
+
+ .align 4
+$L70:
+ and M, 2, I # I=M&2
+ ble I, $L80
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov B, BO
+ nop
+#else
+ sll KK, 1 + ZBASE_SHIFT, L # mr=2
+ sll KK, ZBASE_SHIFT,TEMP # nr=1
+
+ addl AO, L, AO
+ addl B, TEMP, BO
+#endif
+
+ fillcs 0*SIZE(C1)
+ fillcs 4*SIZE(C1)
+
+ fclr c01
+ fclr c02 # CLEAR 8 register
+ fclr c03
+ fclr c04
+ fclr c05
+ fclr c06
+ fclr c07
+ fclr c08
+
+ LD b1, 0*SIZE(BO) # b1 real part
+ LD b2, 1*SIZE(BO) # b1 image part
+
+ LD a1, 0*SIZE(AO) # a1 real part
+ LD a2, 1*SIZE(AO) # a1 image part
+ LD a3, 2*SIZE(AO) # a2 real part
+ LD a4, 3*SIZE(AO) # a2 image part
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ subl K, KK, TEMP
+#elif defined(LEFT)
+ addl KK, 2, TEMP # mr=2
+#else
+ addl KK, 1, TEMP # nr=1
+#endif
+ sra TEMP, 1, L
+ ble L, $L75
+
+#else
+ mov B, BO # Set B, (block A x panel Bj)
+ sra K, 1, L # Unroll K as 2
+
+ fillcs 0*SIZE(C1)
+ fillcs 4*SIZE(C1)
+
+ fclr c01
+ fclr c02 # CLEAR 8 register
+ fclr c03
+ fclr c04
+ fclr c05
+ fclr c06
+ fclr c07
+ fclr c08
+
+ LD b1, 0*SIZE(BO) # b1 real part
+ LD b2, 1*SIZE(BO) # b1 image part
+
+ LD a1, 0*SIZE(AO) # a1 real part
+ LD a2, 1*SIZE(AO) # a1 image part
+ LD a3, 2*SIZE(AO) # a2 real part
+ LD a4, 3*SIZE(AO) # a2 image part
+
+ ble L, $L75
+#endif
+
+ .align 4
+$L72:
+ MAD a1,b1,c01,c01 # C11 real part
+ MAD a1,b2,c02,c02 # C11 imag part
+
+ LD b5, 2 * SIZE(BO) # next B1R
+ LD b6, 3 * SIZE(BO) # next B1I
+
+ LD a5, 4 * SIZE(AO) # next A1-A4 real part
+ LD a6, 5 * SIZE(AO) # next A1-A4 image part
+ LD a7, 6 * SIZE(AO)
+ LD a8, 7 * SIZE(AO)
+
+ addl BO, 4*SIZE, BO # BO+=1nr*2kr*2cpx*SIZE
+ MAD a2,b1,c03,c03 # C11 image part
+ MAD a2,b2,c04,c04 # C11 real part
+
+ MAD a3,b1,c05,c05 # C12 real part
+ MAD a3,b2,c06,c06 # C12 imag part
+
+ MAD a4,b1,c07,c07 # C12 image part
+ MAD a4,b2,c08,c08 # C12 real part
+
+ subl L, 1, L #
+
+	addl	AO, 8*SIZE, AO		# AO+=2mr*2kr*2px*SIZE
+ MAD a5,b5,c01,c01
+ MAD a5,b6,c02,c02
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ MAD a6,b5,c03,c03
+ MAD a6,b6,c04,c04
+
+ MAD a7,b5,c05,c05
+ MAD a7,b6,c06,c06
+
+ MAD a8,b5,c07,c07
+ MAD a8,b6,c08,c08
+
+ bne L, $L72 # continue K
+
+$L75:
+ LD alpha_r, ALPHA_R # $f30==b8
+#ifndef TRMMKERNEL
+ blbc K, $L78 # if(K&1)
+#else
+ blbc TEMP, $L78
+#endif
+
+$L76:
+ addl AO, 4*SIZE, AO # AO+=2mr*1kr*2px*SIZE
+ MAD a1,b1,c01,c01 # C11 real part
+ MAD a1,b2,c02,c02 # C11 imag part
+
+ addl BO, 4*SIZE, BO
+ MAD a2,b1,c03,c03 # C11 image part
+ MAD a2,b2,c04,c04 # C11 real part
+
+ MAD a3,b1,c05,c05 # C12 real part
+ MAD a3,b2,c06,c06 # C12 imag part
+
+ MAD a4,b1,c07,c07 # C12 image part
+ MAD a4,b2,c08,c08 # C12 real part
+
+
+
+$L78: # Write back
+ LD alpha_i, ALPHA_I # $f29==b7
+#ifndef TRMMKERNEL
+ ADD1 c01, c04, c01
+ ADD1 c05, c08, c05
+ ADD2 c03, c02, c02
+ ADD2 c07, c06, c06
+
+ LD b1, 0 * SIZE(C1)
+ LD b2, 1 * SIZE(C1)
+ LD b3, 2 * SIZE(C1)
+ LD b4, 3 * SIZE(C1)
+
+ FMAD5 c01, alpha_r, b1, b1
+ FMAD5 c05, alpha_r, b3, b3
+ FMAD7 c02, alpha_r, b2, b2
+ FMAD7 c06, alpha_r, b4, b4
+
+ FMAD8 c02, alpha_i, b1, b1
+ FMAD8 c06, alpha_i, b3, b3
+ FMAD6 c01, alpha_i, b2, b2
+ FMAD6 c05, alpha_i, b4, b4
+
+ ST b1, 0 * SIZE(C1)
+ ST b2, 1 * SIZE(C1)
+ ST b3, 2 * SIZE(C1)
+ ST b4, 3 * SIZE(C1)
+
+#else
+
+ ADD1 c01, c04, c01
+ ADD1 c05, c08, c05
+ ADD2 c03, c02, c02
+ ADD2 c07, c06, c06
+
+ FMAD5 c01, alpha_r, $f31, b1
+ FMAD5 c05, alpha_r, $f31, b3
+ FMAD7 c02, alpha_r, $f31, b2
+ FMAD7 c06, alpha_r, $f31, b4
+
+ FMAD8 c02, alpha_i, b1, b1
+ FMAD8 c06, alpha_i, b3, b3
+ FMAD6 c01, alpha_i, b2, b2
+ FMAD6 c05, alpha_i, b4, b4
+
+ ST b1, 0 * SIZE(C1)
+ ST b2, 1 * SIZE(C1)
+ ST b3, 2 * SIZE(C1)
+ ST b4, 3 * SIZE(C1)
+
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ subl K, KK, TEMP
+#ifdef LEFT
+ subl TEMP, 2, TEMP
+#else
+ subl TEMP, 1, TEMP
+#endif
+
+ sll TEMP, 1 + ZBASE_SHIFT, L
+ sll TEMP, ZBASE_SHIFT, TEMP
+
+ addl AO, L, AO
+ addl BO, TEMP, BO
+#endif
+
+#ifdef LEFT
+ addl KK, 2, KK
+#endif
+#endif
+
+ addl C1, 4*SIZE, C1
+
+
+ .align 4
+$L80:
+ and M, 1, I # I=M&1
+ ble I, $L999
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov B, BO
+ nop
+#else
+ sll KK, ZBASE_SHIFT, L # mr=1
+ sll KK, ZBASE_SHIFT,TEMP # nr=1
+
+ addl AO, L, AO
+ addl B, TEMP, BO
+#endif
+
+ fillcs 0*SIZE(C1)
+
+	fclr	c01			# CLEAR 4 registers
+ fclr c02
+ fclr c03
+ fclr c04
+
+ LD b1, 0*SIZE(BO) # b1 real part
+ LD b2, 1*SIZE(BO) # b1 image part
+
+ LD a1, 0*SIZE(AO) # a1 real part
+ LD a2, 1*SIZE(AO) # a1 image part
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ subl K, KK, TEMP
+#elif defined(LEFT)
+ addl KK, 1, TEMP # mr=1
+#else
+ addl KK, 1, TEMP # nr=1
+#endif
+ sra TEMP, 1, L
+ ble L, $L85
+
+#else
+ mov B, BO # Set B, (block A x panel Bj)
+ sra K, 1, L # Unroll K as 2
+
+ fillcs 0*SIZE(C1)
+
+	fclr	c01			# CLEAR 4 registers
+ fclr c02
+ fclr c03
+ fclr c04
+
+ LD b1, 0*SIZE(BO) # b1 real part
+ LD b2, 1*SIZE(BO) # b1 image part
+
+ LD a1, 0*SIZE(AO) # a1 real part
+ LD a2, 1*SIZE(AO) # a1 image part
+
+ ble L, $L85
+#endif
+
+ .align 4
+$L82:
+ MAD a1,b1,c01,c01 # C11 real part
+ MAD a1,b2,c02,c02 # C11 imag part
+
+ LD b5, 2 * SIZE(BO) # next B1R
+ LD b6, 3 * SIZE(BO) # next B1I
+
+ LD a5, 2 * SIZE(AO) # next A1-A4 real part
+ LD a6, 3 * SIZE(AO) # next A1-A4 image part
+
+ addl BO, 4*SIZE, BO # BO+=1nr*2kr*2cpx*SIZE
+ MAD a2,b1,c03,c03 # C11 image part
+ MAD a2,b2,c04,c04 # C11 real part
+
+ subl L, 1, L #
+
+ addl AO, 4*SIZE, AO # AO+=1mr*2kr*2px*SIZE
+ MAD a5,b5,c01,c01
+ MAD a5,b6,c02,c02
+
+ LD b1, 0 * SIZE(BO)
+ LD b2, 1 * SIZE(BO)
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ MAD a6,b5,c03,c03
+ MAD a6,b6,c04,c04
+
+ bne L, $L82 # continue K
+
+$L85:
+ LD alpha_r, ALPHA_R # $f30==b8
+#ifndef TRMMKERNEL
+ blbc K, $L88 # if(K&1)
+#else
+ blbc TEMP, $L88
+#endif
+
+$L86:
+	addl	AO, 2*SIZE, AO		# AO+=1mr*1kr*2px*SIZE
+ MAD a1,b1,c01,c01 # C11 real part
+ MAD a1,b2,c02,c02 # C11 imag part
+
+ addl BO, 2*SIZE, BO
+ MAD a2,b1,c03,c03 # C11 image part
+ MAD a2,b2,c04,c04 # C11 real part
+
+$L88: # Write back
+ LD alpha_i, ALPHA_I # $f29==b7
+#ifndef TRMMKERNEL
+ ADD1 c01, c04, c01
+ ADD2 c03, c02, c02
+
+ LD b1, 0 * SIZE(C1)
+ LD b2, 1 * SIZE(C1)
+
+ FMAD5 c01, alpha_r, b1, b1
+ FMAD7 c02, alpha_r, b2, b2
+ FMAD8 c02, alpha_i, b1, b1
+ FMAD6 c01, alpha_i, b2, b2
+
+ ST b1, 0 * SIZE(C1)
+ ST b2, 1 * SIZE(C1)
+
+#else
+
+ ADD1 c01, c04, c01
+ ADD2 c03, c02, c02
+
+ FMAD5 c01, alpha_r, $f31, b1
+ FMAD7 c02, alpha_r, $f31, b2
+
+ FMAD8 c02, alpha_i, b1, b1
+ FMAD6 c01, alpha_i, b2, b2
+
+ ST b1, 0 * SIZE(C1)
+ ST b2, 1 * SIZE(C1)
+
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ subl K, KK, TEMP
+#ifdef LEFT
+ subl TEMP, 1, TEMP
+#else
+ subl TEMP, 1, TEMP
+#endif
+
+ sll TEMP, ZBASE_SHIFT, L
+ sll TEMP, ZBASE_SHIFT, TEMP
+
+ addl AO, L, AO
+ addl BO, TEMP,BO
+#endif
+
+#ifdef LEFT
+ addl KK, 1, KK
+#endif
+#endif
+
+ addl C1, 2*SIZE, C1
+
+
+$L999:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+ ldl $9, 80($sp)
+ ldl $10,88($sp)
+ ldl $11,96($sp)
+ ldl $12,104($sp)
+ ldl $13,112($sp)
+ ldl $14,120($sp)
+
+ clr $0
+
+ ldi $sp, STACKSIZE($sp)
+ ret $31,($26),1 #
+
+ EPILOGUE
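Throughout the write-back blocks above, the FMAD5-FMAD8 macros fold alpha into the combined accumulators and merge the previous C values. Those macros are defined elsewhere in the sw_64 kernel headers and their add/subtract choices change with the conjugation variant, so the following C sketch shows only the plain, non-conjugated update of a single complex element, with acc_r/acc_i standing for the ADD1/ADD2 results:

    /* Sketch of C[i] += alpha * acc for one complex element (plain case only). */
    static void store_element(double *c, double alpha_r, double alpha_i,
                              double acc_r, double acc_i)
    {
        double cr = c[0], ci = c[1];
        cr += acc_r * alpha_r - acc_i * alpha_i;   /* FMAD5 followed by FMAD8 */
        ci += acc_i * alpha_r + acc_r * alpha_i;   /* FMAD7 followed by FMAD6 */
        c[0] = cr;
        c[1] = ci;
    }

The TRMM branches run the same sequence with $f31 (the floating-point zero register) in place of the loaded C value, which is why those FMAD forms take $f31 as their third operand.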
diff --git a/kernel/sw_64/zgemv_n.S b/kernel/sw_64/zgemv_n.S
new file mode 100644
index 0000000..03d71ee
--- /dev/null
+++ b/kernel/sw_64/zgemv_n.S
@@ -0,0 +1,1040 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define STACKSIZE 64
+#define PREFETCHSIZE 32
+
+#define M $16
+#define N $17
+#define A $21
+#define LDA $18
+
+#define X $19
+#define INCX $20
+#define Y $22
+#define INCY $23
+
+#define BUFFER $24
+
+#define I $25
+#define J $27
+
+#define Y1 $4
+#define A1 $5
+#define A2 $6
+
+#define alpha_r $f19
+#define alpha_i $f20
+
+#define alpha1 $f0
+#define alpha2 $f1
+#define alpha3 $f10
+#define alpha4 $f11
+
+#define y0 $f12
+#define y1 $f13
+#define y2 $f14
+#define y3 $f15
+
+#define y4 $f16
+#define y5 $f17
+#define y6 $f18
+#define y7 $f21
+
+#define a0 $f22
+#define a1 $f23
+#define a2 $f24
+#define a3 $f25
+#define a4 $f26
+#define a5 $f27
+#define a6 $f28
+#define a7 $f29
+
+#define t0 $f2
+#define t1 $f3
+#define t2 $f4
+#define t3 $f5
+
+#if !defined(CONJ) && !defined(XCONJ)
+#define ADD1 ADD
+#define ADD2 ADD
+#define ADD3 SUB
+#define ADD4 ADD
+#elif defined(CONJ) && !defined(XCONJ)
+#define ADD1 ADD
+#define ADD2 SUB
+#define ADD3 ADD
+#define ADD4 ADD
+#elif !defined(CONJ) && defined(XCONJ)
+#define ADD1 ADD
+#define ADD2 ADD
+#define ADD3 ADD
+#define ADD4 SUB
+#else
+#define ADD1 ADD
+#define ADD2 SUB
+#define ADD3 SUB
+#define ADD4 SUB
+#endif
+
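The ADD1-ADD4 macros above decide where the update y += (alpha*x) * A adds and where it subtracts for the four CONJ/XCONJ combinations. A compact C rendering of the per-element update driven by the same selection, with conj/xconj as 0/1 flags, xr/xi the already alpha-scaled vector element (alpha1/alpha2 in the code), and all names invented for the sketch:

    /* Per-element zgemv_n update; s2/s3/s4 are -1 exactly where the
     * corresponding ADD macro becomes SUB in the table above. */
    static void zgemv_n_update(double *yr, double *yi,
                               double ar, double ai,
                               double xr, double xi,
                               int conj, int xconj)
    {
        double s2 = conj ? -1.0 : 1.0;                 /* ADD2: SUB when CONJ          */
        double s3 = (!!conj == !!xconj) ? -1.0 : 1.0;  /* ADD3: SUB when CONJ == XCONJ */
        double s4 = xconj ? -1.0 : 1.0;                /* ADD4: SUB when XCONJ         */

        *yr += xr * ar;            /* ADD1: always an add */
        *yi += s2 * (xr * ai);     /* ADD2 */
        *yr += s3 * (xi * ai);     /* ADD3 */
        *yi += s4 * (xi * ar);     /* ADD4 */
    }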
+ PROLOGUE
+
+ ldi $sp, -STACKSIZE($sp)
+ ldl LDA, 0 + STACKSIZE($sp)
+ ldl X, 8 + STACKSIZE($sp)
+ ldl INCX, 16 + STACKSIZE($sp)
+ ldl Y, 24 + STACKSIZE($sp)
+ ldl INCY, 32 + STACKSIZE($sp)
+ ldl BUFFER, 40 + STACKSIZE($sp)
+
+ fstd $f2, 0($sp)
+ fstd $f3, 8($sp)
+ fstd $f4, 16($sp)
+ fstd $f5, 24($sp)
+ fstd $f6, 32($sp)
+ fstd $f7, 40($sp)
+ fstd $f8, 48($sp)
+ fstd $f9, 56($sp)
+
+ PROFCODE
+
+ cmple M, 0, $0
+ sll INCX, ZBASE_SHIFT, INCX
+ cmple N, 0, $1
+ sll INCY, ZBASE_SHIFT, INCY
+
+ or $0, $1, $0
+ bne $0, $L999
+
+ cmpeq INCY, 2 * SIZE, $0
+ sll LDA, ZBASE_SHIFT,LDA
+ bne $0, $L10
+
+ mov BUFFER, Y1
+
+ mov Y, BUFFER
+ mov Y1, Y
+
+ sra M, 2, I
+ ble I, $L05
+ .align 4
+
+$L02:
+ ST $f31, 0 * SIZE(Y1)
+ ST $f31, 1 * SIZE(Y1)
+ ST $f31, 2 * SIZE(Y1)
+ ST $f31, 3 * SIZE(Y1)
+ ST $f31, 4 * SIZE(Y1)
+ ST $f31, 5 * SIZE(Y1)
+ ST $f31, 6 * SIZE(Y1)
+ ST $f31, 7 * SIZE(Y1)
+
+ ldi Y1, 8 * SIZE(Y1)
+ ldi I, -1(I)
+ bgt I, $L02
+ .align 4
+
+$L05:
+ and M, 3, I
+ ble I, $L10
+ .align 4
+
+$L06:
+ ST $f31, 0 * SIZE(Y1)
+ ST $f31, 1 * SIZE(Y1)
+ addl Y1, 2 * SIZE, Y1
+
+ ldi I, -1(I)
+ bgt I, $L06
+ .align 4
+
+$L10:
+ sra N, 1, J
+ ble J, $L20
+ .align 4
+
+$L11:
+ LD alpha1, 0 * SIZE(X)
+ LD alpha2, 1 * SIZE(X)
+ addl X, INCX, X
+ LD alpha3, 0 * SIZE(X)
+ LD alpha4, 1 * SIZE(X)
+ addl X, INCX, X
+
+ MUL alpha_r, alpha1, y0
+ MUL alpha_r, alpha2, y1
+ MUL alpha_r, alpha3, y2
+ MUL alpha_r, alpha4, y3
+
+ MUL alpha_i, alpha2, t0
+ mov A, A1
+ MUL alpha_i, alpha1, t1
+ addl A, LDA, A2
+ MUL alpha_i, alpha4, t2
+ addl A2, LDA, A
+ MUL alpha_i, alpha3, t3
+ mov Y, Y1
+
+#ifndef XCONJ
+ SUB y0, t0, alpha1
+ ADD y1, t1, alpha2
+ SUB y2, t2, alpha3
+ ADD y3, t3, alpha4
+#else
+ ADD y0, t0, alpha1
+ SUB y1, t1, alpha2
+ ADD y2, t2, alpha3
+ SUB y3, t3, alpha4
+#endif
+
+ fillcs 4 * SIZE(X)
+
+ sra M, 2, I
+ ble I, $L15
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+ LD a2, 2 * SIZE(A1)
+ LD a3, 3 * SIZE(A1)
+
+ LD a4, 0 * SIZE(A2)
+ LD a5, 1 * SIZE(A2)
+ LD a6, 2 * SIZE(A2)
+ LD a7, 3 * SIZE(A2)
+
+ MUL alpha1, a0, t0
+ LD y0, 0 * SIZE(Y1)
+ MUL alpha1, a1, t1
+ LD y1, 1 * SIZE(Y1)
+
+ MUL alpha1, a2, t2
+ LD y2, 2 * SIZE(Y1)
+ MUL alpha1, a3, t3
+ LD y3, 3 * SIZE(Y1)
+
+ ADD1 y0, t0, $f6
+ unop
+ MUL alpha3, a4, t0
+ LD y4, 4 * SIZE(Y1)
+
+ ADD2 y1, t1, $f7
+ unop
+ MUL alpha3, a5, t1
+ LD y5, 5 * SIZE(Y1)
+
+ ADD1 y2, t2, $f8
+ unop
+ MUL alpha3, a6, t2
+ LD y6, 6 * SIZE(Y1)
+
+ ADD2 y3, t3, $f9
+ unop
+ MUL alpha3, a7, t3
+ LD y7, 7 * SIZE(Y1)
+
+ ADD1 $f6, t0, y0
+ unop
+ MUL alpha2, a1, t0
+ LD a1, 5 * SIZE(A1)
+
+ ADD2 $f7, t1, y1
+ unop
+ MUL alpha2, a0, t1
+ LD a0, 4 * SIZE(A1)
+
+ ADD1 $f8, t2, y2
+ unop
+ MUL alpha2, a3, t2
+ LD a3, 7 * SIZE(A1)
+
+ ADD2 $f9, t3, y3
+ unop
+ MUL alpha2, a2, t3
+ LD a2, 6 * SIZE(A1)
+
+ ADD3 y0, t0, $f6
+ unop
+ MUL alpha4, a5, t0
+ LD a5, 5 * SIZE(A2)
+
+ ADD4 y1, t1, $f7
+ unop
+ MUL alpha4, a4, t1
+ LD a4, 4 * SIZE(A2)
+
+ ADD3 y2, t2, $f8
+ unop
+ MUL alpha4, a7, t2
+ LD a7, 7 * SIZE(A2)
+
+ ADD4 y3, t3, $f9
+ unop
+ MUL alpha4, a6, t3
+ LD a6, 6 * SIZE(A2)
+
+ ADD3 $f6, t0, y0
+ MUL alpha1, a0, t0
+ ADD4 $f7, t1, y1
+ MUL alpha1, a1, t1
+
+ ADD3 $f8, t2, y2
+ unop
+ MUL alpha1, a2, t2
+ unop
+
+ ADD4 $f9, t3, y3
+ ldi I, -1(I)
+ MUL alpha1, a3, t3
+ ble I, $L13
+ .align 4
+
+$L12:
+ ADD1 y4, t0, $f6
+ ST y0, 0 * SIZE(Y1)
+ MUL alpha3, a4, t0
+ fillcs (PREFETCHSIZE + 0) * SIZE(A1)
+
+ ADD2 y5, t1, $f7
+ ST y1, 1 * SIZE(Y1)
+ MUL alpha3, a5, t1
+ ldi I, -1(I)
+
+ ADD1 y6, t2, $f8
+ ST y2, 2 * SIZE(Y1)
+ MUL alpha3, a6, t2
+ unop
+
+ ADD2 y7, t3, $f9
+ ST y3, 3 * SIZE(Y1)
+ MUL alpha3, a7, t3
+ unop
+
+ ADD1 $f6, t0, y4
+ unop
+ MUL alpha2, a1, t0
+ LD a1, 9 * SIZE(A1)
+
+ ADD2 $f7, t1, y5
+ unop
+ MUL alpha2, a0, t1
+ LD a0, 8 * SIZE(A1)
+
+ ADD1 $f8, t2, y6
+ unop
+ MUL alpha2, a3, t2
+ LD a3, 11 * SIZE(A1)
+
+ ADD2 $f9, t3, y7
+ unop
+ MUL alpha2, a2, t3
+ LD a2, 10 * SIZE(A1)
+
+ ADD3 y4, t0, $f6
+ fillcs (PREFETCHSIZE + 0) * SIZE(Y1)
+ MUL alpha4, a5, t0
+ LD a5, 9 * SIZE(A2)
+
+ ADD4 y5, t1, $f7
+ unop
+ MUL alpha4, a4, t1
+ LD a4, 8 * SIZE(A2)
+
+ ADD3 y6, t2, $f8
+ unop
+ MUL alpha4, a7, t2
+ LD a7, 11 * SIZE(A2)
+
+ ADD4 y7, t3, $f9
+ unop
+ MUL alpha4, a6, t3
+ LD a6, 10 * SIZE(A2)
+
+ ADD3 $f6, t0, y4
+ unop
+ MUL alpha1, a0, t0
+ LD y0, 8 * SIZE(Y1)
+
+ ADD4 $f7, t1, y5
+ unop
+ MUL alpha1, a1, t1
+ LD y1, 9 * SIZE(Y1)
+
+ ADD3 $f8, t2, y6
+ unop
+ MUL alpha1, a2, t2
+ LD y2, 10 * SIZE(Y1)
+
+ ADD4 $f9, t3, y7
+ unop
+ MUL alpha1, a3, t3
+ LD y3, 11 * SIZE(Y1)
+
+ ADD1 y0, t0, $f6
+ ST y4, 4 * SIZE(Y1)
+ MUL alpha3, a4, t0
+ fillcs (PREFETCHSIZE + 0) * SIZE(A2)
+
+ ADD2 y1, t1, $f7
+ ST y5, 5 * SIZE(Y1)
+ MUL alpha3, a5, t1
+ unop
+
+ ADD1 y2, t2, $f8
+ ST y6, 6 * SIZE(Y1)
+ MUL alpha3, a6, t2
+ unop
+
+ ADD2 y3, t3, $f9
+ ST y7, 7 * SIZE(Y1)
+ MUL alpha3, a7, t3
+ ldi Y1, 8 * SIZE(Y1)
+
+ ADD1 $f6, t0, y0
+ unop
+ MUL alpha2, a1, t0
+ LD a1, 13 * SIZE(A1)
+
+ ADD2 $f7, t1, y1
+ unop
+ MUL alpha2, a0, t1
+ LD a0, 12 * SIZE(A1)
+
+ ADD1 $f8, t2, y2
+ unop
+ MUL alpha2, a3, t2
+ LD a3, 15 * SIZE(A1)
+
+ ADD2 $f9, t3, y3
+ unop
+ MUL alpha2, a2, t3
+ LD a2, 14 * SIZE(A1)
+
+ ADD3 y0, t0, $f6
+ unop
+ MUL alpha4, a5, t0
+ LD a5, 13 * SIZE(A2)
+
+ ADD4 y1, t1, $f7
+ unop
+ MUL alpha4, a4, t1
+ LD a4, 12 * SIZE(A2)
+
+ ADD3 y2, t2, $f8
+ unop
+ MUL alpha4, a7, t2
+ LD a7, 15 * SIZE(A2)
+
+ ADD4 y3, t3, $f9
+ unop
+ MUL alpha4, a6, t3
+ LD a6, 14 * SIZE(A2)
+
+ ADD3 $f6, t0, y0
+ unop
+ MUL alpha1, a0, t0
+ LD y4, 4 * SIZE(Y1)
+
+ ADD4 $f7, t1, y1
+ ldi A2, 8 * SIZE(A2)
+ MUL alpha1, a1, t1
+ LD y5, 5 * SIZE(Y1)
+
+ ADD3 $f8, t2, y2
+ ldi A1, 8 * SIZE(A1)
+ MUL alpha1, a2, t2
+ LD y6, 6 * SIZE(Y1)
+
+ ADD4 $f9, t3, y3
+ MUL alpha1, a3, t3
+ LD y7, 7 * SIZE(Y1)
+ bgt I, $L12
+ .align 4
+
+$L13:
+ ADD1 y4, t0, $f6
+ ST y0, 0 * SIZE(Y1)
+ MUL alpha3, a4, t0
+ unop
+
+ ADD2 y5, t1, $f7
+ ST y1, 1 * SIZE(Y1)
+ MUL alpha3, a5, t1
+ unop
+
+ ADD1 y6, t2, $f8
+ ST y2, 2 * SIZE(Y1)
+ MUL alpha3, a6, t2
+ unop
+
+ ADD2 y7, t3, $f9
+ ST y3, 3 * SIZE(Y1)
+ MUL alpha3, a7, t3
+ unop
+
+ ADD1 $f6, t0, y4
+ MUL alpha2, a1, t0
+ ADD2 $f7, t1, y5
+ MUL alpha2, a0, t1
+
+ ADD1 $f8, t2, y6
+ MUL alpha2, a3, t2
+ ADD2 $f9, t3, y7
+ MUL alpha2, a2, t3
+
+ ADD3 y4, t0, $f6
+ MUL alpha4, a5, t0
+ ADD4 y5, t1, $f7
+ MUL alpha4, a4, t1
+
+ ADD3 y6, t2, $f8
+ MUL alpha4, a7, t2
+ ADD4 y7, t3, $f9
+ MUL alpha4, a6, t3
+
+ ADD3 $f6, t0, y4
+ ADD4 $f7, t1, y5
+ ADD3 $f8, t2, y6
+ ADD4 $f9, t3, y7
+
+ ST y4, 4 * SIZE(Y1)
+ ldi A1, 8 * SIZE(A1)
+ ST y5, 5 * SIZE(Y1)
+ ldi A2, 8 * SIZE(A2)
+
+ ST y6, 6 * SIZE(Y1)
+ unop
+ ST y7, 7 * SIZE(Y1)
+ ldi Y1, 8 * SIZE(Y1)
+ .align 4
+
+$L15:
+ and M, 2, I
+ ble I, $L17
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+ LD a2, 2 * SIZE(A1)
+ LD a3, 3 * SIZE(A1)
+
+ LD a4, 0 * SIZE(A2)
+ LD a5, 1 * SIZE(A2)
+ LD a6, 2 * SIZE(A2)
+ LD a7, 3 * SIZE(A2)
+
+ MUL alpha1, a0, t0
+ LD y0, 0 * SIZE(Y1)
+ MUL alpha1, a1, t1
+ LD y1, 1 * SIZE(Y1)
+ MUL alpha1, a2, t2
+ LD y2, 2 * SIZE(Y1)
+ MUL alpha1, a3, t3
+ LD y3, 3 * SIZE(Y1)
+
+ ADD1 y0, t0, $f6
+ MUL alpha3, a4, t0
+ ADD2 y1, t1, $f7
+ MUL alpha3, a5, t1
+ ADD1 y2, t2, $f8
+ MUL alpha3, a6, t2
+ ADD2 y3, t3, $f9
+ MUL alpha3, a7, t3
+
+ ADD1 $f6, t0, y0
+ MUL alpha2, a1, t0
+ ADD2 $f7, t1, y1
+ MUL alpha2, a0, t1
+
+ ADD1 $f8, t2, y2
+ MUL alpha2, a3, t2
+ ADD2 $f9, t3, y3
+ MUL alpha2, a2, t3
+
+ ADD3 y0, t0, $f6
+ MUL alpha4, a5, t0
+ ADD4 y1, t1, $f7
+ MUL alpha4, a4, t1
+
+ ADD3 y2, t2, $f8
+ MUL alpha4, a7, t2
+ ADD4 y3, t3, $f9
+ MUL alpha4, a6, t3
+
+ ADD3 $f6, t0, y0
+ ADD4 $f7, t1, y1
+ ADD3 $f8, t2, y2
+ ADD4 $f9, t3, y3
+
+ ST y0, 0 * SIZE(Y1)
+ ldi A1, 4 * SIZE(A1)
+ ST y1, 1 * SIZE(Y1)
+ ldi A2, 4 * SIZE(A2)
+
+ ST y2, 2 * SIZE(Y1)
+ unop
+ ST y3, 3 * SIZE(Y1)
+ ldi Y1, 4 * SIZE(Y1)
+ .align 4
+
+$L17:
+ blbc M, $L18
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+ LD a2, 0 * SIZE(A2)
+ LD a3, 1 * SIZE(A2)
+
+ LD y0, 0 * SIZE(Y1)
+ LD y1, 1 * SIZE(Y1)
+
+ MUL alpha1, a0, t0
+ MUL alpha1, a1, t1
+
+ ADD1 y0, t0, $f6
+ MUL alpha3, a2, t0
+ ADD2 y1, t1, $f7
+ MUL alpha3, a3, t1
+
+ ADD1 $f6, t0, y0
+ MUL alpha2, a1, t0
+ ADD2 $f7, t1, y1
+ MUL alpha2, a0, t1
+
+ ADD3 y0, t0, $f6
+ MUL alpha4, a3, t0
+ ADD4 y1, t1, $f7
+ MUL alpha4, a2, t1
+
+ ADD3 $f6, t0, y0
+ ADD4 $f7, t1, y1
+
+ ST y0, 0 * SIZE(Y1)
+ ST y1, 1 * SIZE(Y1)
+ .align 4
+
+$L18:
+ ldi J, -1(J)
+ bgt J, $L11
+ .align 4
+
+$L20:
+ blbc N, $L990
+
+ LD alpha1, 0 * SIZE(X)
+ LD alpha2, 1 * SIZE(X)
+
+ MUL alpha_r, alpha1, y0
+ MUL alpha_r, alpha2, y1
+
+ MUL alpha_i, alpha2, t0
+ mov A, A1
+ MUL alpha_i, alpha1, t1
+ mov Y, Y1
+
+#ifndef XCONJ
+ SUB y0, t0, alpha1
+ ADD y1, t1, alpha2
+#else
+ ADD y0, t0, alpha1
+ SUB y1, t1, alpha2
+#endif
+
+ sra M, 2, I
+ ble I, $L25
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+ LD a2, 2 * SIZE(A1)
+ LD a3, 3 * SIZE(A1)
+
+ LD y0, 0 * SIZE(Y1)
+ LD y1, 1 * SIZE(Y1)
+ LD y2, 2 * SIZE(Y1)
+ LD y3, 3 * SIZE(Y1)
+
+ MUL alpha1, a0, t0
+ LD a4, 4 * SIZE(A1)
+ MUL alpha1, a1, t1
+ LD a5, 5 * SIZE(A1)
+ MUL alpha1, a2, t2
+ LD a6, 6 * SIZE(A1)
+ MUL alpha1, a3, t3
+ LD a7, 7 * SIZE(A1)
+
+ ADD1 y0, t0, $f6
+ unop
+ MUL alpha2, a1, t0
+ LD a1, 9 * SIZE(A1)
+
+ ADD2 y1, t1, $f7
+ unop
+ MUL alpha2, a0, t1
+ LD a0, 8 * SIZE(A1)
+
+ ADD1 y2, t2, $f8
+ unop
+ MUL alpha2, a3, t2
+ LD a3, 11 * SIZE(A1)
+
+ ADD2 y3, t3, $f9
+ unop
+ MUL alpha2, a2, t3
+ LD a2, 10 * SIZE(A1)
+
+ ADD3 $f6, t0, y0
+ unop
+ LD y4, 4 * SIZE(Y1)
+ MUL alpha1, a4, t0
+
+ ADD4 $f7, t1, y1
+ unop
+ LD y5, 5 * SIZE(Y1)
+ MUL alpha1, a5, t1
+
+ ADD3 $f8, t2, y2
+ LD y6, 6 * SIZE(Y1)
+ MUL alpha1, a6, t2
+ ldi I, -1(I)
+
+ ADD4 $f9, t3, y3
+ LD y7, 7 * SIZE(Y1)
+ MUL alpha1, a7, t3
+ ble I, $L23
+ .align 4
+
+$L22:
+ ADD1 y4, t0, $f6
+ ST y0, 0 * SIZE(Y1)
+ MUL alpha2, a5, t0
+ LD a5, 13 * SIZE(A1)
+
+ ADD2 y5, t1, $f7
+ ST y1, 1 * SIZE(Y1)
+ MUL alpha2, a4, t1
+ LD a4, 12 * SIZE(A1)
+
+ ADD1 y6, t2, $f8
+ ST y2, 2 * SIZE(Y1)
+ MUL alpha2, a7, t2
+ LD a7, 15 * SIZE(A1)
+
+ ADD2 y7, t3, $f9
+ ST y3, 3 * SIZE(Y1)
+ MUL alpha2, a6, t3
+ LD a6, 14 * SIZE(A1)
+
+ ADD3 $f6, t0, y4
+ LD y0, 8 * SIZE(Y1)
+ MUL alpha1, a0, t0
+ fillcs (PREFETCHSIZE + 0) * SIZE(A1)
+
+ ADD4 $f7, t1, y5
+ LD y1, 9 * SIZE(Y1)
+ MUL alpha1, a1, t1
+ ldi I, -1(I)
+
+ ADD3 $f8, t2, y6
+ LD y2, 10 * SIZE(Y1)
+ MUL alpha1, a2, t2
+ unop
+
+ ADD4 $f9, t3, y7
+ LD y3, 11 * SIZE(Y1)
+ MUL alpha1, a3, t3
+ unop
+
+ ADD1 y0, t0, $f6
+ ST y4, 4 * SIZE(Y1)
+ MUL alpha2, a1, t0
+ LD a1, 17 * SIZE(A1)
+
+ ADD2 y1, t1, $f7
+ ST y5, 5 * SIZE(Y1)
+ MUL alpha2, a0, t1
+ LD a0, 16 * SIZE(A1)
+
+ ADD1 y2, t2, $f8
+ ST y6, 6 * SIZE(Y1)
+ MUL alpha2, a3, t2
+ LD a3, 19 * SIZE(A1)
+
+ ADD2 y3, t3, $f9
+ ST y7, 7 * SIZE(Y1)
+ MUL alpha2, a2, t3
+ LD a2, 18 * SIZE(A1)
+
+ ADD3 $f6, t0, y0
+ LD y4, 12 * SIZE(Y1)
+ MUL alpha1, a4, t0
+ fillcs (PREFETCHSIZE + 0) * SIZE(Y1)
+
+ ADD4 $f7, t1, y1
+ LD y5, 13 * SIZE(Y1)
+ MUL alpha1, a5, t1
+ ldi A1, 8 * SIZE(A1)
+
+ ADD3 $f8, t2, y2
+ LD y6, 14 * SIZE(Y1)
+ MUL alpha1, a6, t2
+ ldi Y1, 8 * SIZE(Y1)
+
+ ADD4 $f9, t3, y3
+ LD y7, 7 * SIZE(Y1)
+ MUL alpha1, a7, t3
+ bgt I, $L22
+ .align 4
+
+$L23:
+ ADD1 y4, t0, $f6
+ ST y0, 0 * SIZE(Y1)
+ MUL alpha2, a5, t0
+ unop
+
+ ADD2 y5, t1, $f7
+ ST y1, 1 * SIZE(Y1)
+ MUL alpha2, a4, t1
+ unop
+
+ ADD1 y6, t2, $f8
+ ST y2, 2 * SIZE(Y1)
+ MUL alpha2, a7, t2
+ unop
+
+ ADD2 y7, t3, $f9
+ ST y3, 3 * SIZE(Y1)
+ MUL alpha2, a6, t3
+ unop
+
+ ADD3 $f6, t0, y4
+ ADD4 $f7, t1, y5
+ ADD3 $f8, t2, y6
+ ADD4 $f9, t3, y7
+
+ ST y4, 4 * SIZE(Y1)
+ unop
+ ST y5, 5 * SIZE(Y1)
+ unop
+
+ ST y6, 6 * SIZE(Y1)
+ ldi A1, 8 * SIZE(A1)
+ ST y7, 7 * SIZE(Y1)
+ ldi Y1, 8 * SIZE(Y1)
+ .align 4
+
+$L25:
+ and M, 2, I
+ ble I, $L27
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+ LD a2, 2 * SIZE(A1)
+ LD a3, 3 * SIZE(A1)
+
+ MUL alpha1, a0, t0
+ LD y0, 0 * SIZE(Y1)
+ MUL alpha1, a1, t1
+ LD y1, 1 * SIZE(Y1)
+ MUL alpha1, a2, t2
+ LD y2, 2 * SIZE(Y1)
+ MUL alpha1, a3, t3
+ LD y3, 3 * SIZE(Y1)
+
+ ADD1 y0, t0, $f6
+ MUL alpha2, a1, t0
+ ADD2 y1, t1, $f7
+ MUL alpha2, a0, t1
+ ADD1 y2, t2, $f8
+ MUL alpha2, a3, t2
+ ADD2 y3, t3, $f9
+ MUL alpha2, a2, t3
+
+ ADD3 $f6, t0, y0
+ ADD4 $f7, t1, y1
+ ADD3 $f8, t2, y2
+ ADD4 $f9, t3, y3
+
+ ST y0, 0 * SIZE(Y1)
+ ST y1, 1 * SIZE(Y1)
+
+ ST y2, 2 * SIZE(Y1)
+ ldi A1, 4 * SIZE(A1)
+ ST y3, 3 * SIZE(Y1)
+ ldi Y1, 4 * SIZE(Y1)
+ .align 4
+
+$L27:
+ blbc M, $L990
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+
+ MUL alpha1, a0, t0
+ LD y0, 0 * SIZE(Y1)
+ MUL alpha1, a1, t1
+ LD y1, 1 * SIZE(Y1)
+
+ ADD1 y0, t0, $f6
+ MUL alpha2, a1, t0
+ ADD2 y1, t1, $f7
+ MUL alpha2, a0, t1
+
+ ADD3 $f6, t0, y0
+ ADD4 $f7, t1, y1
+
+ ST y0, 0 * SIZE(Y1)
+ ST y1, 1 * SIZE(Y1)
+ .align 4
+
+$L990:
+ cmpeq INCY, 2 * SIZE, $0
+ bne $0, $L999
+
+ mov BUFFER, Y1
+
+ sra M, 2, I
+ ble I, $L995
+ .align 4
+
+$L992:
+ LD a0, 0 * SIZE(BUFFER)
+ LD a1, 1 * SIZE(BUFFER)
+ addl BUFFER, INCY, BUFFER
+ LD a2, 0 * SIZE(BUFFER)
+ LD a3, 1 * SIZE(BUFFER)
+ addl BUFFER, INCY, BUFFER
+
+ LD y0, 0 * SIZE(Y)
+ LD y1, 1 * SIZE(Y)
+ LD y2, 2 * SIZE(Y)
+ LD y3, 3 * SIZE(Y)
+
+ LD a4, 0 * SIZE(BUFFER)
+ LD a5, 1 * SIZE(BUFFER)
+ addl BUFFER, INCY, BUFFER
+ LD a6, 0 * SIZE(BUFFER)
+ LD a7, 1 * SIZE(BUFFER)
+ addl BUFFER, INCY, BUFFER
+
+ LD y4, 4 * SIZE(Y)
+ LD y5, 5 * SIZE(Y)
+ LD y6, 6 * SIZE(Y)
+ LD y7, 7 * SIZE(Y)
+
+ ADD a0, y0, $f6
+ ADD a1, y1, $f7
+ ADD a2, y2, $f8
+ ADD a3, y3, $f9
+
+ fmov $f6, a0
+ fmov $f7, a1
+ fmov $f8, a2
+ fmov $f9, a3
+
+ ST a0, 0 * SIZE(Y1)
+ ADD a4, y4, $f6
+ ST a1, 1 * SIZE(Y1)
+ ADD a5, y5, $f7
+ addl Y1, INCY, Y1
+
+ ST a2, 0 * SIZE(Y1)
+ ADD a6, y6, $f8
+ ST a3, 1 * SIZE(Y1)
+ ADD a7, y7, $f9
+ addl Y1, INCY, Y1
+
+ fmov $f6, a4
+ fmov $f7, a5
+ fmov $f8, a6
+ fmov $f9, a7
+
+ ST a4, 0 * SIZE(Y1)
+ ST a5, 1 * SIZE(Y1)
+ addl Y1, INCY, Y1
+ ST a6, 0 * SIZE(Y1)
+ ST a7, 1 * SIZE(Y1)
+ addl Y1, INCY, Y1
+
+ ldi I, -1(I)
+ ldi Y, 8 * SIZE(Y)
+ bgt I, $L992
+ .align 4
+
+$L995:
+ and M, 3, I
+ ble I, $L999
+ .align 4
+
+$L996:
+ LD a0, 0 * SIZE(BUFFER)
+ LD a1, 1 * SIZE(BUFFER)
+ addl BUFFER, INCY, BUFFER
+
+ LD y0, 0 * SIZE(Y)
+ LD y1, 1 * SIZE(Y)
+ ldi Y, 2 * SIZE(Y)
+
+ ADD a0, y0, $f6
+ ADD a1, y1, $f7
+
+ fmov $f6, a0
+ fmov $f7, a1
+
+ ST a0, 0 * SIZE(Y1)
+ ST a1, 1 * SIZE(Y1)
+ addl Y1, INCY, Y1
+
+ ldi I, -1(I)
+ bgt I, $L996
+ .align 4
+
+$L999:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+
+ ldi $sp, STACKSIZE($sp)
+ ret
+ EPILOGUE
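When INCY is not one complex element, the routine above accumulates into the contiguous BUFFER (zeroed at $L02/$L06) and only at the end folds the result back into the strided y vector (the $L990-$L996 blocks). A simplified C version of that final pass, with incy counted in complex elements and the names chosen for the sketch:

    /* Add the contiguous scratch result back into the strided y. */
    static void add_back(double *y, long incy, const double *scratch, long m)
    {
        long i;
        for (i = 0; i < m; i++) {
            y[0] += scratch[2 * i];       /* real part      */
            y[1] += scratch[2 * i + 1];   /* imaginary part */
            y += 2 * incy;                /* stride in complex elements */
        }
    }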
diff --git a/kernel/sw_64/zgemv_n.S.bak b/kernel/sw_64/zgemv_n.S.bak
new file mode 100644
index 0000000..3dd482e
--- /dev/null
+++ b/kernel/sw_64/zgemv_n.S.bak
@@ -0,0 +1,1027 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define STACKSIZE 64
+#define PREFETCHSIZE 32
+
+#define M $16
+#define N $17
+#define A $21
+#define LDA $18
+
+#define X $19
+#define INCX $20
+#define Y $22
+#define INCY $23
+
+#define BUFFER $24
+
+#define I $25
+#define J $27
+
+#define Y1 $4
+#define A1 $5
+#define A2 $6
+
+#define alpha_r $f19
+#define alpha_i $f20
+
+#define alpha1 $f0
+#define alpha2 $f1
+#define alpha3 $f10
+#define alpha4 $f11
+
+#define y0 $f12
+#define y1 $f13
+#define y2 $f14
+#define y3 $f15
+
+#define y4 $f16
+#define y5 $f17
+#define y6 $f18
+#define y7 $f21
+
+#define a0 $f22
+#define a1 $f23
+#define a2 $f24
+#define a3 $f25
+#define a4 $f26
+#define a5 $f27
+#define a6 $f28
+#define a7 $f29
+
+#define t0 $f2
+#define t1 $f3
+#define t2 $f4
+#define t3 $f5
+
+#if !defined(CONJ) && !defined(XCONJ)
+#define ADD1 ADD
+#define ADD2 ADD
+#define ADD3 SUB
+#define ADD4 ADD
+#elif defined(CONJ) && !defined(XCONJ)
+#define ADD1 ADD
+#define ADD2 SUB
+#define ADD3 ADD
+#define ADD4 ADD
+#elif !defined(CONJ) && defined(XCONJ)
+#define ADD1 ADD
+#define ADD2 ADD
+#define ADD3 ADD
+#define ADD4 SUB
+#else
+#define ADD1 ADD
+#define ADD2 SUB
+#define ADD3 SUB
+#define ADD4 SUB
+#endif
+
+ PROLOGUE
+
+ ldi $sp, -STACKSIZE($sp)
+ ldl LDA, 0 + STACKSIZE($sp)
+ ldl X, 8 + STACKSIZE($sp)
+ ldl INCX, 16 + STACKSIZE($sp)
+ ldl Y, 24 + STACKSIZE($sp)
+ ldl INCY, 32 + STACKSIZE($sp)
+ ldl BUFFER, 40 + STACKSIZE($sp)
+
+ fstd $f2, 0($sp)
+ fstd $f3, 8($sp)
+ fstd $f4, 16($sp)
+ fstd $f5, 24($sp)
+ fstd $f6, 32($sp)
+ fstd $f7, 40($sp)
+ fstd $f8, 48($sp)
+ fstd $f9, 56($sp)
+
+ PROFCODE
+
+ cmple M, 0, $0
+ sll INCX, ZBASE_SHIFT, INCX
+ cmple N, 0, $1
+ sll INCY, ZBASE_SHIFT, INCY
+
+ or $0, $1, $0
+ bne $0, $L999
+
+ cmpeq INCY, 2 * SIZE, $0
+	sll	LDA, ZBASE_SHIFT, LDA
+ bne $0, $L10
+
+ mov BUFFER, Y1
+
+ mov Y, BUFFER
+ mov Y1, Y
+
+ sra M, 2, I
+ ble I, $L05
+ .align 4
+
+$L02:
+ ST $f31, 0 * SIZE(Y1)
+ ST $f31, 1 * SIZE(Y1)
+ ST $f31, 2 * SIZE(Y1)
+ ST $f31, 3 * SIZE(Y1)
+ ST $f31, 4 * SIZE(Y1)
+ ST $f31, 5 * SIZE(Y1)
+ ST $f31, 6 * SIZE(Y1)
+ ST $f31, 7 * SIZE(Y1)
+
+ ldi Y1, 8 * SIZE(Y1)
+ ldi I, -1(I)
+ bgt I, $L02
+ .align 4
+
+$L05:
+ and M, 3, I
+ ble I, $L10
+ .align 4
+
+$L06:
+ ST $f31, 0 * SIZE(Y1)
+ ST $f31, 1 * SIZE(Y1)
+ addl Y1, 2 * SIZE, Y1
+
+ ldi I, -1(I)
+ bgt I, $L06
+ .align 4
+
+$L10:
+ sra N, 1, J
+ ble J, $L20
+ .align 4
+
+$L11:
+ LD alpha1, 0 * SIZE(X)
+ LD alpha2, 1 * SIZE(X)
+ addl X, INCX, X
+ LD alpha3, 0 * SIZE(X)
+ LD alpha4, 1 * SIZE(X)
+ addl X, INCX, X
+
+ MUL alpha_r, alpha1, y0
+ MUL alpha_r, alpha2, y1
+ MUL alpha_r, alpha3, y2
+ MUL alpha_r, alpha4, y3
+
+ MUL alpha_i, alpha2, t0
+ mov A, A1
+ MUL alpha_i, alpha1, t1
+ addl A, LDA, A2
+ MUL alpha_i, alpha4, t2
+ addl A2, LDA, A
+ MUL alpha_i, alpha3, t3
+ mov Y, Y1
+
+#ifndef XCONJ
+ SUB y0, t0, alpha1
+ ADD y1, t1, alpha2
+ SUB y2, t2, alpha3
+ ADD y3, t3, alpha4
+#else
+ ADD y0, t0, alpha1
+ SUB y1, t1, alpha2
+ ADD y2, t2, alpha3
+ SUB y3, t3, alpha4
+#endif
+
+ fillcs 4 * SIZE(X)
+
+ sra M, 2, I
+ ble I, $L15
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+ LD a2, 2 * SIZE(A1)
+ LD a3, 3 * SIZE(A1)
+
+ LD a4, 0 * SIZE(A2)
+ LD a5, 1 * SIZE(A2)
+ LD a6, 2 * SIZE(A2)
+ LD a7, 3 * SIZE(A2)
+
+ MUL alpha1, a0, t0
+ LD y0, 0 * SIZE(Y1)
+ MUL alpha1, a1, t1
+ LD y1, 1 * SIZE(Y1)
+
+ MUL alpha1, a2, t2
+ LD y2, 2 * SIZE(Y1)
+ MUL alpha1, a3, t3
+ LD y3, 3 * SIZE(Y1)
+
+ ADD1 y0, t0, y0
+ unop
+ MUL alpha3, a4, t0
+ LD y4, 4 * SIZE(Y1)
+
+ ADD2 y1, t1, y1
+ unop
+ MUL alpha3, a5, t1
+ LD y5, 5 * SIZE(Y1)
+
+ ADD1 y2, t2, y2
+ unop
+ MUL alpha3, a6, t2
+ LD y6, 6 * SIZE(Y1)
+
+ ADD2 y3, t3, y3
+ unop
+ MUL alpha3, a7, t3
+ LD y7, 7 * SIZE(Y1)
+
+ ADD1 y0, t0, y0
+ unop
+ MUL alpha2, a1, t0
+ LD a1, 5 * SIZE(A1)
+
+ ADD2 y1, t1, y1
+ unop
+ MUL alpha2, a0, t1
+ LD a0, 4 * SIZE(A1)
+
+ ADD1 y2, t2, y2
+ unop
+ MUL alpha2, a3, t2
+ LD a3, 7 * SIZE(A1)
+
+ ADD2 y3, t3, y3
+ unop
+ MUL alpha2, a2, t3
+ LD a2, 6 * SIZE(A1)
+
+ ADD3 y0, t0, y0
+ unop
+ MUL alpha4, a5, t0
+ LD a5, 5 * SIZE(A2)
+
+ ADD4 y1, t1, y1
+ unop
+ MUL alpha4, a4, t1
+ LD a4, 4 * SIZE(A2)
+
+ ADD3 y2, t2, y2
+ unop
+ MUL alpha4, a7, t2
+ LD a7, 7 * SIZE(A2)
+
+ ADD4 y3, t3, y3
+ unop
+ MUL alpha4, a6, t3
+ LD a6, 6 * SIZE(A2)
+
+ ADD3 y0, t0, y0
+ MUL alpha1, a0, t0
+ ADD4 y1, t1, y1
+ MUL alpha1, a1, t1
+
+ ADD3 y2, t2, y2
+ unop
+ MUL alpha1, a2, t2
+ unop
+
+ ADD4 y3, t3, y3
+ ldi I, -1(I)
+ MUL alpha1, a3, t3
+ ble I, $L13
+ .align 4
+
+$L12:
+ ADD1 y4, t0, y4
+ ST y0, 0 * SIZE(Y1)
+ MUL alpha3, a4, t0
+ fillcs (PREFETCHSIZE + 0) * SIZE(A1)
+
+ ADD2 y5, t1, y5
+ ST y1, 1 * SIZE(Y1)
+ MUL alpha3, a5, t1
+ ldi I, -1(I)
+
+ ADD1 y6, t2, y6
+ ST y2, 2 * SIZE(Y1)
+ MUL alpha3, a6, t2
+ unop
+
+ ADD2 y7, t3, y7
+ ST y3, 3 * SIZE(Y1)
+ MUL alpha3, a7, t3
+ unop
+
+ ADD1 y4, t0, y4
+ unop
+ MUL alpha2, a1, t0
+ LD a1, 9 * SIZE(A1)
+
+ ADD2 y5, t1, y5
+ unop
+ MUL alpha2, a0, t1
+ LD a0, 8 * SIZE(A1)
+
+ ADD1 y6, t2, y6
+ unop
+ MUL alpha2, a3, t2
+ LD a3, 11 * SIZE(A1)
+
+ ADD2 y7, t3, y7
+ unop
+ MUL alpha2, a2, t3
+ LD a2, 10 * SIZE(A1)
+
+ ADD3 y4, t0, y4
+ fillcs (PREFETCHSIZE + 0) * SIZE(Y1)
+ MUL alpha4, a5, t0
+ LD a5, 9 * SIZE(A2)
+
+ ADD4 y5, t1, y5
+ unop
+ MUL alpha4, a4, t1
+ LD a4, 8 * SIZE(A2)
+
+ ADD3 y6, t2, y6
+ unop
+ MUL alpha4, a7, t2
+ LD a7, 11 * SIZE(A2)
+
+ ADD4 y7, t3, y7
+ unop
+ MUL alpha4, a6, t3
+ LD a6, 10 * SIZE(A2)
+
+ ADD3 y4, t0, y4
+ unop
+ MUL alpha1, a0, t0
+ LD y0, 8 * SIZE(Y1)
+
+ ADD4 y5, t1, y5
+ unop
+ MUL alpha1, a1, t1
+ LD y1, 9 * SIZE(Y1)
+
+ ADD3 y6, t2, y6
+ unop
+ MUL alpha1, a2, t2
+ LD y2, 10 * SIZE(Y1)
+
+ ADD4 y7, t3, y7
+ unop
+ MUL alpha1, a3, t3
+ LD y3, 11 * SIZE(Y1)
+
+ ADD1 y0, t0, y0
+ ST y4, 4 * SIZE(Y1)
+ MUL alpha3, a4, t0
+ fillcs (PREFETCHSIZE + 0) * SIZE(A2)
+
+ ADD2 y1, t1, y1
+ ST y5, 5 * SIZE(Y1)
+ MUL alpha3, a5, t1
+ unop
+
+ ADD1 y2, t2, y2
+ ST y6, 6 * SIZE(Y1)
+ MUL alpha3, a6, t2
+ unop
+
+ ADD2 y3, t3, y3
+ ST y7, 7 * SIZE(Y1)
+ MUL alpha3, a7, t3
+ ldi Y1, 8 * SIZE(Y1)
+
+ ADD1 y0, t0, y0
+ unop
+ MUL alpha2, a1, t0
+ LD a1, 13 * SIZE(A1)
+
+ ADD2 y1, t1, y1
+ unop
+ MUL alpha2, a0, t1
+ LD a0, 12 * SIZE(A1)
+
+ ADD1 y2, t2, y2
+ unop
+ MUL alpha2, a3, t2
+ LD a3, 15 * SIZE(A1)
+
+ ADD2 y3, t3, y3
+ unop
+ MUL alpha2, a2, t3
+ LD a2, 14 * SIZE(A1)
+
+ ADD3 y0, t0, y0
+ unop
+ MUL alpha4, a5, t0
+ LD a5, 13 * SIZE(A2)
+
+ ADD4 y1, t1, y1
+ unop
+ MUL alpha4, a4, t1
+ LD a4, 12 * SIZE(A2)
+
+ ADD3 y2, t2, y2
+ unop
+ MUL alpha4, a7, t2
+ LD a7, 15 * SIZE(A2)
+
+ ADD4 y3, t3, y3
+ unop
+ MUL alpha4, a6, t3
+ LD a6, 14 * SIZE(A2)
+
+ ADD3 y0, t0, y0
+ unop
+ MUL alpha1, a0, t0
+ LD y4, 4 * SIZE(Y1)
+
+ ADD4 y1, t1, y1
+ ldi A2, 8 * SIZE(A2)
+ MUL alpha1, a1, t1
+ LD y5, 5 * SIZE(Y1)
+
+ ADD3 y2, t2, y2
+ ldi A1, 8 * SIZE(A1)
+ MUL alpha1, a2, t2
+ LD y6, 6 * SIZE(Y1)
+
+ ADD4 y3, t3, y3
+ MUL alpha1, a3, t3
+ LD y7, 7 * SIZE(Y1)
+ bgt I, $L12
+ .align 4
+
+$L13:
+ ADD1 y4, t0, y4
+ ST y0, 0 * SIZE(Y1)
+ MUL alpha3, a4, t0
+ unop
+
+ ADD2 y5, t1, y5
+ ST y1, 1 * SIZE(Y1)
+ MUL alpha3, a5, t1
+ unop
+
+ ADD1 y6, t2, y6
+ ST y2, 2 * SIZE(Y1)
+ MUL alpha3, a6, t2
+ unop
+
+ ADD2 y7, t3, y7
+ ST y3, 3 * SIZE(Y1)
+ MUL alpha3, a7, t3
+ unop
+
+ ADD1 y4, t0, y4
+ MUL alpha2, a1, t0
+ ADD2 y5, t1, y5
+ MUL alpha2, a0, t1
+
+ ADD1 y6, t2, y6
+ MUL alpha2, a3, t2
+ ADD2 y7, t3, y7
+ MUL alpha2, a2, t3
+
+ ADD3 y4, t0, y4
+ MUL alpha4, a5, t0
+ ADD4 y5, t1, y5
+ MUL alpha4, a4, t1
+
+ ADD3 y6, t2, y6
+ MUL alpha4, a7, t2
+ ADD4 y7, t3, y7
+ MUL alpha4, a6, t3
+
+ ADD3 y4, t0, y4
+ ADD4 y5, t1, y5
+ ADD3 y6, t2, y6
+ ADD4 y7, t3, y7
+
+ ST y4, 4 * SIZE(Y1)
+ ldi A1, 8 * SIZE(A1)
+ ST y5, 5 * SIZE(Y1)
+ ldi A2, 8 * SIZE(A2)
+
+ ST y6, 6 * SIZE(Y1)
+ unop
+ ST y7, 7 * SIZE(Y1)
+ ldi Y1, 8 * SIZE(Y1)
+ .align 4
+
+$L15:
+ and M, 2, I
+ ble I, $L17
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+ LD a2, 2 * SIZE(A1)
+ LD a3, 3 * SIZE(A1)
+
+ LD a4, 0 * SIZE(A2)
+ LD a5, 1 * SIZE(A2)
+ LD a6, 2 * SIZE(A2)
+ LD a7, 3 * SIZE(A2)
+
+ MUL alpha1, a0, t0
+ LD y0, 0 * SIZE(Y1)
+ MUL alpha1, a1, t1
+ LD y1, 1 * SIZE(Y1)
+ MUL alpha1, a2, t2
+ LD y2, 2 * SIZE(Y1)
+ MUL alpha1, a3, t3
+ LD y3, 3 * SIZE(Y1)
+
+ ADD1 y0, t0, y0
+ MUL alpha3, a4, t0
+ ADD2 y1, t1, y1
+ MUL alpha3, a5, t1
+ ADD1 y2, t2, y2
+ MUL alpha3, a6, t2
+ ADD2 y3, t3, y3
+ MUL alpha3, a7, t3
+
+ ADD1 y0, t0, y0
+ MUL alpha2, a1, t0
+ ADD2 y1, t1, y1
+ MUL alpha2, a0, t1
+
+ ADD1 y2, t2, y2
+ MUL alpha2, a3, t2
+ ADD2 y3, t3, y3
+ MUL alpha2, a2, t3
+
+ ADD3 y0, t0, y0
+ MUL alpha4, a5, t0
+ ADD4 y1, t1, y1
+ MUL alpha4, a4, t1
+
+ ADD3 y2, t2, y2
+ MUL alpha4, a7, t2
+ ADD4 y3, t3, y3
+ MUL alpha4, a6, t3
+
+ ADD3 y0, t0, y0
+ ADD4 y1, t1, y1
+ ADD3 y2, t2, y2
+ ADD4 y3, t3, y3
+
+ ST y0, 0 * SIZE(Y1)
+ ldi A1, 4 * SIZE(A1)
+ ST y1, 1 * SIZE(Y1)
+ ldi A2, 4 * SIZE(A2)
+
+ ST y2, 2 * SIZE(Y1)
+ unop
+ ST y3, 3 * SIZE(Y1)
+ ldi Y1, 4 * SIZE(Y1)
+ .align 4
+
+$L17:
+ blbc M, $L18
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+ LD a2, 0 * SIZE(A2)
+ LD a3, 1 * SIZE(A2)
+
+ LD y0, 0 * SIZE(Y1)
+ LD y1, 1 * SIZE(Y1)
+
+ MUL alpha1, a0, t0
+ MUL alpha1, a1, t1
+
+ ADD1 y0, t0, y0
+ MUL alpha3, a2, t0
+ ADD2 y1, t1, y1
+ MUL alpha3, a3, t1
+
+ ADD1 y0, t0, y0
+ MUL alpha2, a1, t0
+ ADD2 y1, t1, y1
+ MUL alpha2, a0, t1
+
+ ADD3 y0, t0, y0
+ MUL alpha4, a3, t0
+ ADD4 y1, t1, y1
+ MUL alpha4, a2, t1
+
+ ADD3 y0, t0, y0
+ ADD4 y1, t1, y1
+
+ ST y0, 0 * SIZE(Y1)
+ ST y1, 1 * SIZE(Y1)
+ .align 4
+
+$L18:
+ ldi J, -1(J)
+ bgt J, $L11
+ .align 4
+
+$L20:
+ blbc N, $L990
+
+ LD alpha1, 0 * SIZE(X)
+ LD alpha2, 1 * SIZE(X)
+
+ MUL alpha_r, alpha1, y0
+ MUL alpha_r, alpha2, y1
+
+ MUL alpha_i, alpha2, t0
+ mov A, A1
+ MUL alpha_i, alpha1, t1
+ mov Y, Y1
+
+#ifndef XCONJ
+ SUB y0, t0, alpha1
+ ADD y1, t1, alpha2
+#else
+ ADD y0, t0, alpha1
+ SUB y1, t1, alpha2
+#endif
+
+ sra M, 2, I
+ ble I, $L25
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+ LD a2, 2 * SIZE(A1)
+ LD a3, 3 * SIZE(A1)
+
+ LD y0, 0 * SIZE(Y1)
+ LD y1, 1 * SIZE(Y1)
+ LD y2, 2 * SIZE(Y1)
+ LD y3, 3 * SIZE(Y1)
+
+ MUL alpha1, a0, t0
+ LD a4, 4 * SIZE(A1)
+ MUL alpha1, a1, t1
+ LD a5, 5 * SIZE(A1)
+ MUL alpha1, a2, t2
+ LD a6, 6 * SIZE(A1)
+ MUL alpha1, a3, t3
+ LD a7, 7 * SIZE(A1)
+
+ ADD1 y0, t0, y0
+ unop
+ MUL alpha2, a1, t0
+ LD a1, 9 * SIZE(A1)
+
+ ADD2 y1, t1, y1
+ unop
+ MUL alpha2, a0, t1
+ LD a0, 8 * SIZE(A1)
+
+ ADD1 y2, t2, y2
+ unop
+ MUL alpha2, a3, t2
+ LD a3, 11 * SIZE(A1)
+
+ ADD2 y3, t3, y3
+ unop
+ MUL alpha2, a2, t3
+ LD a2, 10 * SIZE(A1)
+
+ ADD3 y0, t0, y0
+ unop
+ LD y4, 4 * SIZE(Y1)
+ MUL alpha1, a4, t0
+
+ ADD4 y1, t1, y1
+ unop
+ LD y5, 5 * SIZE(Y1)
+ MUL alpha1, a5, t1
+
+ ADD3 y2, t2, y2
+ LD y6, 6 * SIZE(Y1)
+ MUL alpha1, a6, t2
+ ldi I, -1(I)
+
+ ADD4 y3, t3, y3
+ LD y7, 7 * SIZE(Y1)
+ MUL alpha1, a7, t3
+ ble I, $L23
+ .align 4
+
+$L22:
+ ADD1 y4, t0, y4
+ ST y0, 0 * SIZE(Y1)
+ MUL alpha2, a5, t0
+ LD a5, 13 * SIZE(A1)
+
+ ADD2 y5, t1, y5
+ ST y1, 1 * SIZE(Y1)
+ MUL alpha2, a4, t1
+ LD a4, 12 * SIZE(A1)
+
+ ADD1 y6, t2, y6
+ ST y2, 2 * SIZE(Y1)
+ MUL alpha2, a7, t2
+ LD a7, 15 * SIZE(A1)
+
+ ADD2 y7, t3, y7
+ ST y3, 3 * SIZE(Y1)
+ MUL alpha2, a6, t3
+ LD a6, 14 * SIZE(A1)
+
+ ADD3 y4, t0, y4
+ LD y0, 8 * SIZE(Y1)
+ MUL alpha1, a0, t0
+ fillcs (PREFETCHSIZE + 0) * SIZE(A1)
+
+ ADD4 y5, t1, y5
+ LD y1, 9 * SIZE(Y1)
+ MUL alpha1, a1, t1
+ ldi I, -1(I)
+
+ ADD3 y6, t2, y6
+ LD y2, 10 * SIZE(Y1)
+ MUL alpha1, a2, t2
+ unop
+
+ ADD4 y7, t3, y7
+ LD y3, 11 * SIZE(Y1)
+ MUL alpha1, a3, t3
+ unop
+
+ ADD1 y0, t0, y0
+ ST y4, 4 * SIZE(Y1)
+ MUL alpha2, a1, t0
+ LD a1, 17 * SIZE(A1)
+
+ ADD2 y1, t1, y1
+ ST y5, 5 * SIZE(Y1)
+ MUL alpha2, a0, t1
+ LD a0, 16 * SIZE(A1)
+
+ ADD1 y2, t2, y2
+ ST y6, 6 * SIZE(Y1)
+ MUL alpha2, a3, t2
+ LD a3, 19 * SIZE(A1)
+
+ ADD2 y3, t3, y3
+ ST y7, 7 * SIZE(Y1)
+ MUL alpha2, a2, t3
+ LD a2, 18 * SIZE(A1)
+
+ ADD3 y0, t0, y0
+ LD y4, 12 * SIZE(Y1)
+ MUL alpha1, a4, t0
+ fillcs (PREFETCHSIZE + 0) * SIZE(Y1)
+
+ ADD4 y1, t1, y1
+ LD y5, 13 * SIZE(Y1)
+ MUL alpha1, a5, t1
+ ldi A1, 8 * SIZE(A1)
+
+ ADD3 y2, t2, y2
+ LD y6, 14 * SIZE(Y1)
+ MUL alpha1, a6, t2
+ ldi Y1, 8 * SIZE(Y1)
+
+ ADD4 y3, t3, y3
+ LD y7, 7 * SIZE(Y1)
+ MUL alpha1, a7, t3
+ bgt I, $L22
+ .align 4
+
+$L23:
+ ADD1 y4, t0, y4
+ ST y0, 0 * SIZE(Y1)
+ MUL alpha2, a5, t0
+ unop
+
+ ADD2 y5, t1, y5
+ ST y1, 1 * SIZE(Y1)
+ MUL alpha2, a4, t1
+ unop
+
+ ADD1 y6, t2, y6
+ ST y2, 2 * SIZE(Y1)
+ MUL alpha2, a7, t2
+ unop
+
+ ADD2 y7, t3, y7
+ ST y3, 3 * SIZE(Y1)
+ MUL alpha2, a6, t3
+ unop
+
+ ADD3 y4, t0, y4
+ ADD4 y5, t1, y5
+ ADD3 y6, t2, y6
+ ADD4 y7, t3, y7
+
+ ST y4, 4 * SIZE(Y1)
+ unop
+ ST y5, 5 * SIZE(Y1)
+ unop
+
+ ST y6, 6 * SIZE(Y1)
+ ldi A1, 8 * SIZE(A1)
+ ST y7, 7 * SIZE(Y1)
+ ldi Y1, 8 * SIZE(Y1)
+ .align 4
+
+$L25:
+ and M, 2, I
+ ble I, $L27
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+ LD a2, 2 * SIZE(A1)
+ LD a3, 3 * SIZE(A1)
+
+ MUL alpha1, a0, t0
+ LD y0, 0 * SIZE(Y1)
+ MUL alpha1, a1, t1
+ LD y1, 1 * SIZE(Y1)
+ MUL alpha1, a2, t2
+ LD y2, 2 * SIZE(Y1)
+ MUL alpha1, a3, t3
+ LD y3, 3 * SIZE(Y1)
+
+ ADD1 y0, t0, y0
+ MUL alpha2, a1, t0
+ ADD2 y1, t1, y1
+ MUL alpha2, a0, t1
+ ADD1 y2, t2, y2
+ MUL alpha2, a3, t2
+ ADD2 y3, t3, y3
+ MUL alpha2, a2, t3
+
+ ADD3 y0, t0, y0
+ ADD4 y1, t1, y1
+ ADD3 y2, t2, y2
+ ADD4 y3, t3, y3
+
+ ST y0, 0 * SIZE(Y1)
+ ST y1, 1 * SIZE(Y1)
+
+ ST y2, 2 * SIZE(Y1)
+ ldi A1, 4 * SIZE(A1)
+ ST y3, 3 * SIZE(Y1)
+ ldi Y1, 4 * SIZE(Y1)
+ .align 4
+
+$L27:
+ blbc M, $L990
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+
+ MUL alpha1, a0, t0
+ LD y0, 0 * SIZE(Y1)
+ MUL alpha1, a1, t1
+ LD y1, 1 * SIZE(Y1)
+
+ ADD1 y0, t0, y0
+ MUL alpha2, a1, t0
+ ADD2 y1, t1, y1
+ MUL alpha2, a0, t1
+
+ ADD3 y0, t0, y0
+ ADD4 y1, t1, y1
+
+ ST y0, 0 * SIZE(Y1)
+ ST y1, 1 * SIZE(Y1)
+ .align 4
+
+$L990:
+ cmpeq INCY, 2 * SIZE, $0
+ bne $0, $L999
+
+ mov BUFFER, Y1
+
+ sra M, 2, I
+ ble I, $L995
+ .align 4
+
+$L992:
+ LD a0, 0 * SIZE(BUFFER)
+ LD a1, 1 * SIZE(BUFFER)
+ addl BUFFER, INCY, BUFFER
+ LD a2, 0 * SIZE(BUFFER)
+ LD a3, 1 * SIZE(BUFFER)
+ addl BUFFER, INCY, BUFFER
+
+ LD y0, 0 * SIZE(Y)
+ LD y1, 1 * SIZE(Y)
+ LD y2, 2 * SIZE(Y)
+ LD y3, 3 * SIZE(Y)
+
+ LD a4, 0 * SIZE(BUFFER)
+ LD a5, 1 * SIZE(BUFFER)
+ addl BUFFER, INCY, BUFFER
+ LD a6, 0 * SIZE(BUFFER)
+ LD a7, 1 * SIZE(BUFFER)
+ addl BUFFER, INCY, BUFFER
+
+ LD y4, 4 * SIZE(Y)
+ LD y5, 5 * SIZE(Y)
+ LD y6, 6 * SIZE(Y)
+ LD y7, 7 * SIZE(Y)
+
+ ADD a0, y0, a0
+ ADD a1, y1, a1
+ ADD a2, y2, a2
+ ADD a3, y3, a3
+
+ ST a0, 0 * SIZE(Y1)
+ ADD a4, y4, a4
+ ST a1, 1 * SIZE(Y1)
+ ADD a5, y5, a5
+ addl Y1, INCY, Y1
+
+ ST a2, 0 * SIZE(Y1)
+ ADD a6, y6, a6
+ ST a3, 1 * SIZE(Y1)
+ ADD a7, y7, a7
+ addl Y1, INCY, Y1
+
+ ST a4, 0 * SIZE(Y1)
+ ST a5, 1 * SIZE(Y1)
+ addl Y1, INCY, Y1
+ ST a6, 0 * SIZE(Y1)
+ ST a7, 1 * SIZE(Y1)
+ addl Y1, INCY, Y1
+
+ ldi I, -1(I)
+ ldi Y, 8 * SIZE(Y)
+ bgt I, $L992
+ .align 4
+
+$L995:
+ and M, 3, I
+ ble I, $L999
+ .align 4
+
+$L996:
+ LD a0, 0 * SIZE(BUFFER)
+ LD a1, 1 * SIZE(BUFFER)
+ addl BUFFER, INCY, BUFFER
+
+ LD y0, 0 * SIZE(Y)
+ LD y1, 1 * SIZE(Y)
+ ldi Y, 2 * SIZE(Y)
+
+ ADD a0, y0, a0
+ ADD a1, y1, a1
+
+ ST a0, 0 * SIZE(Y1)
+ ST a1, 1 * SIZE(Y1)
+ addl Y1, INCY, Y1
+
+ ldi I, -1(I)
+ bgt I, $L996
+ .align 4
+
+$L999:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+
+ ldi $sp, STACKSIZE($sp)
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/zgemv_t.S b/kernel/sw_64/zgemv_t.S
new file mode 100644
index 0000000..bf31cb4
--- /dev/null
+++ b/kernel/sw_64/zgemv_t.S
@@ -0,0 +1,1047 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define STACKSIZE 64
+#define PREFETCHSIZE 32
+
+#define M $16
+#define N $17
+#define A $21
+#define LDA $18
+
+#define X $19
+#define INCX $20
+#define Y $22
+#define INCY $23
+
+#define BUFFER $24
+
+#define I $25
+#define J $27
+
+#define X1 $3
+#define Y1 $4
+#define A1 $5
+#define A2 $6
+
+#define alpha_r $f19
+#define alpha_i $f20
+
+#define s0 $f0
+#define s1 $f1
+#define s2 $f10
+#define s3 $f11
+
+#define t0 $f12
+#define t1 $f13
+#define t2 $f14
+#define t3 $f15
+
+#define x0 $f16
+#define x1 $f17
+#define x2 $f18
+#define x3 $f21
+
+#define a0 $f22
+#define a1 $f23
+#define a2 $f24
+#define a3 $f25
+#define a4 $f26
+#define a5 $f27
+#define a6 $f28
+#define a7 $f29
+
+#define a8 $f2
+#define a9 $f3
+#define a10 $f4
+#define a11 $f5
+#define a12 $f6
+#define a13 $f7
+#define a14 $f8
+#define a15 $f9
+
+#if !defined(CONJ) && !defined(XCONJ)
+#define ADD1 ADD
+#define ADD2 ADD
+#define ADD3 SUB
+#define ADD4 ADD
+#elif !defined(CONJ) && defined(XCONJ)
+#define ADD1 ADD
+#define ADD2 ADD
+#define ADD3 ADD
+#define ADD4 SUB
+#elif defined(CONJ) && !defined(XCONJ)
+#define ADD1 ADD
+#define ADD2 SUB
+#define ADD3 ADD
+#define ADD4 ADD
+#else
+#define ADD1 ADD
+#define ADD2 SUB
+#define ADD3 SUB
+#define ADD4 SUB
+#endif
+
+ PROLOGUE
+
+ ldi $sp, -STACKSIZE($sp)
+ ldl LDA, 0 + STACKSIZE($sp)
+ ldl X, 8 + STACKSIZE($sp)
+ ldl INCX, 16 + STACKSIZE($sp)
+ ldl Y, 24 + STACKSIZE($sp)
+ ldl INCY, 32 + STACKSIZE($sp)
+ ldl BUFFER, 40 + STACKSIZE($sp)
+
+ fstd $f2, 0($sp)
+ fstd $f3, 8($sp)
+ fstd $f4, 16($sp)
+ fstd $f5, 24($sp)
+ fstd $f6, 32($sp)
+ fstd $f7, 40($sp)
+ fstd $f8, 48($sp)
+ fstd $f9, 56($sp)
+
+ PROFCODE
+
+ cmple M, 0, $0
+ sll INCX, ZBASE_SHIFT, INCX
+ cmple N, 0, $1
+ sll INCY, ZBASE_SHIFT, INCY
+
+ or $0, $1, $0
+ bne $0, $L999
+
+ cmpeq INCX, 2 * SIZE, $0
+ mov X, X1
+	sll	LDA, ZBASE_SHIFT, LDA
+ bne $0, $L10
+
+ sra M, 2, I
+ mov BUFFER, Y1
+ mov BUFFER, X
+ ble I, $L05
+ .align 4
+
+$L02:
+ fillcs (PREFETCHSIZE + 0) * SIZE(X1)
+ ldi I, -1(I)
+
+ LD a0, 0 * SIZE(X1)
+ LD a1, 1 * SIZE(X1)
+ addl X1, INCX, X1
+ LD a2, 0 * SIZE(X1)
+ LD a3, 1 * SIZE(X1)
+ addl X1, INCX, X1
+
+ ST a0, 0 * SIZE(Y1)
+ ST a1, 1 * SIZE(Y1)
+ ST a2, 2 * SIZE(Y1)
+ ST a3, 3 * SIZE(Y1)
+
+ LD a4, 0 * SIZE(X1)
+ LD a5, 1 * SIZE(X1)
+ addl X1, INCX, X1
+ LD a6, 0 * SIZE(X1)
+ LD a7, 1 * SIZE(X1)
+ addl X1, INCX, X1
+
+ ST a4, 4 * SIZE(Y1)
+ ST a5, 5 * SIZE(Y1)
+ ST a6, 6 * SIZE(Y1)
+ ST a7, 7 * SIZE(Y1)
+
+ ldi Y1, 8 * SIZE(Y1)
+ bgt I, $L02
+ .align 4
+
+$L05:
+ and M, 3, I
+ ble I, $L10
+ .align 4
+
+$L06:
+ LD a0, 0 * SIZE(X1)
+ LD a1, 1 * SIZE(X1)
+ addl X1, INCX, X1
+
+ ST a0, 0 * SIZE(Y1)
+ ST a1, 1 * SIZE(Y1)
+ ldi Y1, 2 * SIZE(Y1)
+
+ ldi I, -1(I)
+ bgt I, $L06
+ .align 4
+
+$L10:
+ mov Y, Y1
+ fclr t0
+ unop
+ fclr t1
+
+ sra N, 1, J
+ fclr t2
+ fclr t3
+ ble J, $L20
+ .align 4
+
+$L11:
+ mov A, A1
+ fclr s0
+ addl A, LDA, A2
+ fclr s1
+
+ addl A2, LDA, A
+ unop
+ mov X, X1
+ fillcs 3 * SIZE(Y)
+
+ sra M, 2, I
+ fclr s2
+ fclr s3
+ ble I, $L15
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+ LD a2, 0 * SIZE(A2)
+ LD a3, 1 * SIZE(A2)
+ LD a4, 2 * SIZE(A1)
+ LD a5, 3 * SIZE(A1)
+ LD a6, 2 * SIZE(A2)
+ LD a7, 3 * SIZE(A2)
+
+ LD a8, 4 * SIZE(A1)
+ LD a9, 5 * SIZE(A1)
+ LD a10, 4 * SIZE(A2)
+ LD a11, 5 * SIZE(A2)
+ LD a12, 6 * SIZE(A1)
+ LD a13, 7 * SIZE(A1)
+ LD a14, 6 * SIZE(A2)
+ LD a15, 7 * SIZE(A2)
+
+ LD x0, 0 * SIZE(X1)
+ LD x1, 1 * SIZE(X1)
+ LD x2, 2 * SIZE(X1)
+
+ ldi I, -1(I)
+ ble I, $L13
+ .align 4
+
+$L12:
+ ADD3 s0, t0, $f30
+ fmov $f30, s0
+ unop
+ MUL x0, a0, t0
+ LD x3, 3 * SIZE(X1)
+
+ ADD4 s1, t1, $f30
+ fmov $f30, s1
+ fillcs (PREFETCHSIZE + 0) * SIZE(A1)
+ MUL x0, a1, t1
+ unop
+
+ ADD3 s2, t2, $f30
+ fmov $f30, s2
+ unop
+ MUL x0, a2, t2
+ unop
+
+ ADD4 s3, t3, $f30
+ fmov $f30, s3
+ unop
+ MUL x0, a3, t3
+ LD x0, 4 * SIZE(X1)
+
+ ADD1 s0, t0, $f30
+ fmov $f30, s0
+ unop
+ MUL x1, a1, t0
+ LD a1, 9 * SIZE(A1)
+
+ ADD2 s1, t1, $f30
+ fmov $f30, s1
+ unop
+ MUL x1, a0, t1
+ LD a0, 8 * SIZE(A1)
+
+ ADD1 s2, t2, $f30
+ fmov $f30, s2
+ unop
+ MUL x1, a3, t2
+ LD a3, 9 * SIZE(A2)
+
+ ADD2 s3, t3, $f30
+ fmov $f30, s3
+ unop
+ MUL x1, a2, t3
+ LD a2, 8 * SIZE(A2)
+
+ ADD3 s0, t0, $f30
+ fmov $f30, s0
+ unop
+ MUL x2, a4, t0
+ LD x1, 5 * SIZE(X1)
+
+ ADD4 s1, t1, $f30
+ fmov $f30, s1
+ MUL x2, a5, t1
+ ADD3 s2, t2, $f30
+ fmov $f30, s2
+ MUL x2, a6, t2
+
+ ADD4 s3, t3, $f30
+ fmov $f30, s3
+ unop
+ MUL x2, a7, t3
+ LD x2, 6 * SIZE(X1)
+
+ ADD1 s0, t0, $f30
+ fmov $f30, s0
+ unop
+ MUL x3, a5, t0
+ LD a5, 11 * SIZE(A1)
+
+ ADD2 s1, t1, $f30
+ fmov $f30, s1
+ unop
+ MUL x3, a4, t1
+ LD a4, 10 * SIZE(A1)
+
+ ADD1 s2, t2, $f30
+ fmov $f30, s2
+ unop
+ MUL x3, a7, t2
+ LD a7, 11 * SIZE(A2)
+
+ ADD2 s3, t3, $f30
+ fmov $f30, s3
+ unop
+ MUL x3, a6, t3
+ LD a6, 10 * SIZE(A2)
+
+ ADD3 s0, t0, $f30
+ fmov $f30, s0
+ unop
+ MUL x0, a8, t0
+ LD x3, 7 * SIZE(X1)
+
+ ADD4 s1, t1, $f30
+ fmov $f30, s1
+ fillcs (PREFETCHSIZE + 0) * SIZE(A2)
+ MUL x0, a9, t1
+ unop
+
+ ADD3 s2, t2, $f30
+ fmov $f30, s2
+ ldi I, -1(I)
+ MUL x0, a10, t2
+ unop
+
+ ADD4 s3, t3, $f30
+ fmov $f30, s3
+ unop
+ MUL x0, a11, t3
+ LD x0, 8 * SIZE(X1)
+
+ ADD1 s0, t0, $f30
+ fmov $f30, s0
+ unop
+ MUL x1, a9, t0
+ LD a9, 13 * SIZE(A1)
+
+ ADD2 s1, t1, $f30
+ fmov $f30, s1
+ unop
+ MUL x1, a8, t1
+ LD a8, 12 * SIZE(A1)
+
+ ADD1 s2, t2, $f30
+ fmov $f30, s2
+ ldi A1, 8 * SIZE(A1)
+ MUL x1, a11, t2
+ LD a11, 13 * SIZE(A2)
+
+ ADD2 s3, t3, $f30
+ fmov $f30, s3
+ unop
+ MUL x1, a10, t3
+ LD a10, 12 * SIZE(A2)
+
+ ADD3 s0, t0, $f30
+ fmov $f30, s0
+ unop
+ MUL x2, a12, t0
+ LD x1, 9 * SIZE(X1)
+
+ ADD4 s1, t1, $f30
+ fmov $f30, s1
+ fillcs (PREFETCHSIZE + 0) * SIZE(X1)
+ MUL x2, a13, t1
+ ldi A2, 8 * SIZE(A2)
+
+ ADD3 s2, t2, $f30
+ fmov $f30, s2
+ unop
+ MUL x2, a14, t2
+ unop
+
+ ADD4 s3, t3, $f30
+ fmov $f30, s3
+ unop
+ MUL x2, a15, t3
+ LD x2, 10 * SIZE(X1)
+
+ ADD1 s0, t0, $f30
+ fmov $f30, s0
+ unop
+ MUL x3, a13, t0
+ LD a13, 7 * SIZE(A1)
+
+ ADD2 s1, t1, $f30
+ fmov $f30, s1
+ ldi X1, 8 * SIZE(X1)
+ MUL x3, a12, t1
+ LD a12, 6 * SIZE(A1)
+
+ ADD1 s2, t2, $f30
+ fmov $f30, s2
+ unop
+ MUL x3, a15, t2
+ LD a15, 7 * SIZE(A2)
+
+ ADD2 s3, t3, $f30
+ fmov $f30, s3
+ MUL x3, a14, t3
+ LD a14, 6 * SIZE(A2)
+ bgt I, $L12
+ .align 4
+
+$L13:
+ ADD3 s0, t0, $f30
+ fmov $f30, s0
+ unop
+ MUL x0, a0, t0
+ LD x3, 3 * SIZE(X1)
+
+ ADD4 s1, t1, $f30
+ fmov $f30, s1
+ MUL x0, a1, t1
+ ADD3 s2, t2, $f30
+ fmov $f30, s2
+ MUL x0, a2, t2
+
+ ADD4 s3, t3, $f30
+ fmov $f30, s3
+ unop
+ MUL x0, a3, t3
+ LD x0, 4 * SIZE(X1)
+
+ ADD1 s0, t0, $f30
+ fmov $f30, s0
+ MUL x1, a1, t0
+ ADD2 s1, t1, $f30
+ fmov $f30, s1
+ MUL x1, a0, t1
+
+ ADD1 s2, t2, $f30
+ fmov $f30, s2
+ unop
+ MUL x1, a3, t2
+ unop
+
+ ADD2 s3, t3, $f30
+ fmov $f30, s3
+ ldi A1, 8 * SIZE(A1)
+ MUL x1, a2, t3
+ LD x1, 5 * SIZE(X1)
+
+ ADD3 s0, t0, $f30
+ fmov $f30, s0
+ MUL x2, a4, t0
+ ADD4 s1, t1, $f30
+ fmov $f30, s1
+ MUL x2, a5, t1
+
+ ADD3 s2, t2, $f30
+ fmov $f30, s2
+ unop
+ MUL x2, a6, t2
+ unop
+
+ ADD4 s3, t3, $f30
+ fmov $f30, s3
+ ldi A2, 8 * SIZE(A2)
+ MUL x2, a7, t3
+ LD x2, 6 * SIZE(X1)
+
+ ADD1 s0, t0, $f30
+ fmov $f30, s0
+ MUL x3, a5, t0
+ ADD2 s1, t1, $f30
+ fmov $f30, s1
+ MUL x3, a4, t1
+
+ ADD1 s2, t2, $f30
+ fmov $f30, s2
+ unop
+ MUL x3, a7, t2
+ ldi X1, 8 * SIZE(X1)
+
+ ADD2 s3, t3, $f30
+ fmov $f30, s3
+ unop
+ MUL x3, a6, t3
+ LD x3, -1 * SIZE(X1)
+
+ ADD3 s0, t0, $f30
+ fmov $f30, s0
+ MUL x0, a8, t0
+ ADD4 s1, t1, $f30
+ fmov $f30, s1
+ MUL x0, a9, t1
+
+ ADD3 s2, t2, $f30
+ fmov $f30, s2
+ MUL x0, a10, t2
+ ADD4 s3, t3, $f30
+ fmov $f30, s3
+ MUL x0, a11, t3
+
+ ADD1 s0, t0, $f30
+ fmov $f30, s0
+ MUL x1, a9, t0
+ ADD2 s1, t1, $f30
+ fmov $f30, s1
+ MUL x1, a8, t1
+
+ ADD1 s2, t2, $f30
+ fmov $f30, s2
+ MUL x1, a11, t2
+ ADD2 s3, t3, $f30
+ fmov $f30, s3
+ MUL x1, a10, t3
+
+ ADD3 s0, t0, $f30
+ fmov $f30, s0
+ MUL x2, a12, t0
+ ADD4 s1, t1, $f30
+ fmov $f30, s1
+ MUL x2, a13, t1
+
+ ADD3 s2, t2, $f30
+ fmov $f30, s2
+ MUL x2, a14, t2
+ ADD4 s3, t3, $f30
+ fmov $f30, s3
+ MUL x2, a15, t3
+
+ ADD1 s0, t0, $f30
+ fmov $f30, s0
+ MUL x3, a13, t0
+ ADD2 s1, t1, $f30
+ fmov $f30, s1
+ MUL x3, a12, t1
+
+ ADD1 s2, t2, $f30
+ fmov $f30, s2
+ MUL x3, a15, t2
+ ADD2 s3, t3, $f30
+ fmov $f30, s3
+ MUL x3, a14, t3
+ .align 4
+
+$L15:
+ and M, 3, I
+ ble I, $L18
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+ LD a2, 0 * SIZE(A2)
+ LD a3, 1 * SIZE(A2)
+
+ LD x0, 0 * SIZE(X1)
+
+ ldi I, -1(I)
+ ble I, $L17
+ .align 4
+
+$L16:
+ ADD3 s0, t0, $f30
+ fmov $f30, s0
+ ldi I, -1(I)
+ MUL x0, a0, t0
+ LD x1, 1 * SIZE(X1)
+
+ ADD4 s1, t1, $f30
+ fmov $f30, s1
+ MUL x0, a1, t1
+ ADD3 s2, t2, $f30
+ fmov $f30, s2
+ MUL x0, a2, t2
+
+ ADD4 s3, t3, $f30
+ fmov $f30, s3
+ unop
+ MUL x0, a3, t3
+ LD x0, 2 * SIZE(X1)
+
+ ADD1 s0, t0, $f30
+ fmov $f30, s0
+ ldi A2, 2 * SIZE(A2)
+ MUL x1, a1, t0
+ LD a1, 3 * SIZE(A1)
+
+ ADD2 s1, t1, $f30
+ fmov $f30, s1
+ ldi X1, 2 * SIZE(X1)
+ MUL x1, a0, t1
+ LD a0, 2 * SIZE(A1)
+
+ ADD1 s2, t2, $f30
+ fmov $f30, s2
+ ldi A1, 2 * SIZE(A1)
+ MUL x1, a3, t2
+ LD a3, 1 * SIZE(A2)
+
+ ADD2 s3, t3, $f30
+ fmov $f30, s3
+ MUL x1, a2, t3
+ LD a2, 0 * SIZE(A2)
+ bgt I, $L16
+ .align 4
+
+$L17:
+ ADD3 s0, t0, $f30
+ fmov $f30, s0
+ unop
+ MUL x0, a0, t0
+ LD x1, 1 * SIZE(X1)
+
+ ADD4 s1, t1, $f30
+ fmov $f30, s1
+ unop
+ MUL x0, a1, t1
+ unop
+
+ ADD3 s2, t2, $f30
+ fmov $f30, s2
+ MUL x0, a2, t2
+ ADD4 s3, t3, $f30
+ fmov $f30, s3
+ MUL x0, a3, t3
+
+ ADD1 s0, t0, $f30
+ fmov $f30, s0
+ MUL x1, a1, t0
+ ADD2 s1, t1, $f30
+ fmov $f30, s1
+ MUL x1, a0, t1
+
+ ADD1 s2, t2, $f30
+ fmov $f30, s2
+ MUL x1, a3, t2
+ ADD2 s3, t3, $f30
+ fmov $f30, s3
+ MUL x1, a2, t3
+ .align 4
+
+$L18:
+ LD a0, 0 * SIZE(Y)
+ unop
+ LD a1, 1 * SIZE(Y)
+ addl Y, INCY, Y
+
+ LD a2, 0 * SIZE(Y)
+ unop
+ LD a3, 1 * SIZE(Y)
+ addl Y, INCY, Y
+
+ ADD3 s0, t0, a8
+ ADD4 s1, t1, a9
+ ADD3 s2, t2, a10
+ ADD4 s3, t3, a11
+
+ fmov a8, s0
+ fmov a9, s1
+ fmov a10, s2
+ fmov a11, s3
+
+ MUL alpha_r, s0, t0
+ MUL alpha_r, s1, t1
+ MUL alpha_r, s2, t2
+ MUL alpha_r, s3, t3
+
+ ADD a0, t0, a8
+ MUL alpha_i, s1, t0
+ ADD a1, t1, a9
+ MUL alpha_i, s0, t1
+ ADD a2, t2, a10
+ MUL alpha_i, s3, t2
+ ADD a3, t3, a11
+ MUL alpha_i, s2, t3
+
+ SUB a8, t0, a0
+ ADD a9, t1, a1
+ SUB a10, t2, a2
+ ADD a11, t3, a3
+
+ ST a0, 0 * SIZE(Y1)
+ fclr t0
+ ST a1, 1 * SIZE(Y1)
+ addl Y1, INCY, Y1
+
+ ST a2, 0 * SIZE(Y1)
+ fclr t1
+ ST a3, 1 * SIZE(Y1)
+ addl Y1, INCY, Y1
+
+ fclr t2
+ ldi J, -1(J)
+ fclr t3
+ bgt J, $L11
+ .align 4
+
+$L20:
+ blbc N, $L999
+
+ mov A, A1
+ fclr s0
+ fclr s1
+ mov X, X1
+
+ sra M, 2, I
+ fclr s2
+ fclr s3
+ ble I, $L25
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+ LD a4, 2 * SIZE(A1)
+ LD a5, 3 * SIZE(A1)
+ LD a8, 4 * SIZE(A1)
+ LD a9, 5 * SIZE(A1)
+ LD a12, 6 * SIZE(A1)
+ LD a13, 7 * SIZE(A1)
+
+ LD x0, 0 * SIZE(X1)
+ LD x1, 1 * SIZE(X1)
+ LD x2, 2 * SIZE(X1)
+
+ ldi I, -1(I)
+ ble I, $L23
+ .align 4
+
+$L22:
+ ADD3 s0, t0, $f30
+ fmov $f30, s0
+ fillcs (PREFETCHSIZE + 0) * SIZE(A1)
+ MUL x0, a0, t0
+ LD x3, 3 * SIZE(X1)
+
+ ADD4 s1, t1, $f30
+ fmov $f30, s1
+ unop
+ MUL x0, a1, t1
+ LD x0, 4 * SIZE(X1)
+
+ ADD1 s2, t0, $f30
+ fmov $f30, s2
+ ldi I, -1(I)
+ MUL x1, a1, t0
+ LD a1, 9 * SIZE(A1)
+
+ ADD2 s3, t1, $f30
+ fmov $f30, s3
+ unop
+ MUL x1, a0, t1
+ LD a0, 8 * SIZE(A1)
+
+ ADD3 s0, t0, $f30
+ fmov $f30, s0
+ unop
+ MUL x2, a4, t0
+ LD x1, 5 * SIZE(X1)
+
+ ADD4 s1, t1, $f30
+ fmov $f30, s1
+ unop
+ MUL x2, a5, t1
+ LD x2, 6 * SIZE(X1)
+
+ ADD1 s2, t0, $f30
+ fmov $f30, s2
+ unop
+ MUL x3, a5, t0
+ LD a5, 11 * SIZE(A1)
+
+ ADD2 s3, t1, $f30
+ fmov $f30, s3
+ unop
+ MUL x3, a4, t1
+ LD a4, 10 * SIZE(A1)
+
+ ADD3 s0, t0, $f30
+ fmov $f30, s0
+ unop
+ MUL x0, a8, t0
+ LD x3, 7 * SIZE(X1)
+
+ ADD4 s1, t1, $f30
+ fmov $f30, s1
+ unop
+ MUL x0, a9, t1
+ LD x0, 8 * SIZE(X1)
+
+ ADD1 s2, t0, $f30
+ fmov $f30, s2
+ unop
+ MUL x1, a9, t0
+ LD a9, 13 * SIZE(A1)
+
+ ADD2 s3, t1, $f30
+ fmov $f30, s3
+ unop
+ MUL x1, a8, t1
+ LD a8, 12 * SIZE(A1)
+
+ ADD3 s0, t0, $f30
+ fmov $f30, s0
+ unop
+ MUL x2, a12, t0
+ LD x1, 9 * SIZE(X1)
+
+ ADD4 s1, t1, $f30
+ fmov $f30, s1
+ ldi A1, 8 * SIZE(A1)
+ MUL x2, a13, t1
+ LD x2, 10 * SIZE(X1)
+
+ ADD1 s2, t0, $f30
+ fmov $f30, s2
+ ldi X1, 8 * SIZE(X1)
+ MUL x3, a13, t0
+ LD a13, 7 * SIZE(A1)
+
+ ADD2 s3, t1, $f30
+ fmov $f30, s3
+ MUL x3, a12, t1
+ LD a12, 6 * SIZE(A1)
+ bgt I, $L22
+ .align 4
+
+$L23:
+ ADD3 s0, t0, $f30
+ fmov $f30, s0
+ unop
+ MUL x0, a0, t0
+ LD x3, 3 * SIZE(X1)
+
+ ADD4 s1, t1, $f30
+ fmov $f30, s1
+ unop
+ MUL x0, a1, t1
+ LD x0, 4 * SIZE(X1)
+
+ ADD1 s2, t0, $f30
+ fmov $f30, s2
+ unop
+ MUL x1, a1, t0
+ ldi A1, 8 * SIZE(A1)
+
+ ADD2 s3, t1, $f30
+ fmov $f30, s3
+ unop
+ MUL x1, a0, t1
+ LD x1, 5 * SIZE(X1)
+
+ ADD3 s0, t0, $f30
+ fmov $f30, s0
+ unop
+ MUL x2, a4, t0
+ unop
+
+ ADD4 s1, t1, $f30
+ fmov $f30, s1
+ unop
+ MUL x2, a5, t1
+ LD x2, 6 * SIZE(X1)
+
+ ADD1 s2, t0, $f30
+ fmov $f30, s2
+ unop
+ MUL x3, a5, t0
+ ldi X1, 8 * SIZE(X1)
+
+ ADD2 s3, t1, $f30
+ fmov $f30, s3
+ unop
+ MUL x3, a4, t1
+ LD x3, -1 * SIZE(X1)
+
+ ADD3 s0, t0, $f30
+ fmov $f30, s0
+ MUL x0, a8, t0
+ ADD4 s1, t1, $f30
+ fmov $f30, s1
+ MUL x0, a9, t1
+
+ ADD1 s2, t0, $f30
+ fmov $f30, s2
+ MUL x1, a9, t0
+ ADD2 s3, t1, $f30
+ fmov $f30, s3
+ MUL x1, a8, t1
+
+ ADD3 s0, t0, $f30
+ fmov $f30, s0
+ MUL x2, a12, t0
+ ADD4 s1, t1, $f30
+ fmov $f30, s1
+ MUL x2, a13, t1
+
+ ADD1 s2, t0, $f30
+ fmov $f30, s2
+ MUL x3, a13, t0
+ ADD2 s3, t1, $f30
+ fmov $f30, s3
+ MUL x3, a12, t1
+ .align 4
+
+$L25:
+ and M, 3, I
+ ble I, $L28
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+
+ LD x0, 0 * SIZE(X1)
+
+ ldi I, -1(I)
+ ble I, $L27
+ .align 4
+
+$L26:
+ ADD3 s0, t0, $f30
+ fmov $f30, s0
+ ldi A1, 2 * SIZE(A1)
+ MUL x0, a0, t0
+ LD x1, 1 * SIZE(X1)
+
+ ADD4 s1, t1, $f30
+ fmov $f30, s1
+ ldi I, -1(I)
+ MUL x0, a1, t1
+ LD x0, 2 * SIZE(X1)
+
+ ADD1 s0, t0, $f30
+ fmov $f30, s0
+ ldi X1, 2 * SIZE(X1)
+ MUL x1, a1, t0
+ LD a1, 1 * SIZE(A1)
+
+ ADD2 s1, t1, $f30
+ fmov $f30, s1
+ MUL x1, a0, t1
+ LD a0, 0 * SIZE(A1)
+ bgt I, $L26
+ .align 4
+
+$L27:
+ ADD3 s0, t0, $f30
+ fmov $f30, s0
+ unop
+ MUL x0, a0, t0
+ LD x1, 1 * SIZE(X1)
+
+ ADD4 s1, t1, $f30
+ fmov $f30, s1
+ unop
+ MUL x0, a1, t1
+ unop
+
+ ADD1 s0, t0, $f30
+ fmov $f30, s0
+ MUL x1, a1, t0
+ ADD2 s1, t1, $f30
+ fmov $f30, s1
+ MUL x1, a0, t1
+ .align 4
+
+$L28:
+ LD a0, 0 * SIZE(Y)
+ LD a1, 1 * SIZE(Y)
+
+ ADD3 s0, t0, a8
+ ADD4 s1, t1, a9
+ ADD3 s2, t2, a10
+ ADD4 s3, t3, a11
+
+ ADD a8, a10, s0
+ ADD a9, a11, s1
+
+ MUL alpha_r, s0, t0
+ MUL alpha_r, s1, t1
+
+ ADD a0, t0, a8
+ MUL alpha_i, s1, t0
+ ADD a1, t1, a9
+ MUL alpha_i, s0, t1
+
+ SUB a8, t0, a0
+ ADD a9, t1, a1
+
+ ST a0, 0 * SIZE(Y1)
+ ST a1, 1 * SIZE(Y1)
+ .align 4
+
+$L999:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+
+ ldi $sp, STACKSIZE($sp)
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/zgemv_t.S.bak b/kernel/sw_64/zgemv_t.S.bak
new file mode 100644
index 0000000..f857fb7
--- /dev/null
+++ b/kernel/sw_64/zgemv_t.S.bak
@@ -0,0 +1,922 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define STACKSIZE 64
+#define PREFETCHSIZE 32
+
+#define M $16
+#define N $17
+#define A $21
+#define LDA $18
+
+#define X $19
+#define INCX $20
+#define Y $22
+#define INCY $23
+
+#define BUFFER $24
+
+#define I $25
+#define J $27
+
+#define X1 $3
+#define Y1 $4
+#define A1 $5
+#define A2 $6
+
+#define alpha_r $f19
+#define alpha_i $f20
+
+#define s0 $f0
+#define s1 $f1
+#define s2 $f10
+#define s3 $f11
+
+#define t0 $f12
+#define t1 $f13
+#define t2 $f14
+#define t3 $f15
+
+#define x0 $f16
+#define x1 $f17
+#define x2 $f18
+#define x3 $f21
+
+#define a0 $f22
+#define a1 $f23
+#define a2 $f24
+#define a3 $f25
+#define a4 $f26
+#define a5 $f27
+#define a6 $f28
+#define a7 $f29
+
+#define a8 $f2
+#define a9 $f3
+#define a10 $f4
+#define a11 $f5
+#define a12 $f6
+#define a13 $f7
+#define a14 $f8
+#define a15 $f9
+
+#if !defined(CONJ) && !defined(XCONJ)
+#define ADD1 ADD
+#define ADD2 ADD
+#define ADD3 SUB
+#define ADD4 ADD
+#elif !defined(CONJ) && defined(XCONJ)
+#define ADD1 ADD
+#define ADD2 ADD
+#define ADD3 ADD
+#define ADD4 SUB
+#elif defined(CONJ) && !defined(XCONJ)
+#define ADD1 ADD
+#define ADD2 SUB
+#define ADD3 ADD
+#define ADD4 ADD
+#else
+#define ADD1 ADD
+#define ADD2 SUB
+#define ADD3 SUB
+#define ADD4 SUB
+#endif
+
+ PROLOGUE
+
+ ldi $sp, -STACKSIZE($sp)
+ ldl LDA, 0 + STACKSIZE($sp)
+ ldl X, 8 + STACKSIZE($sp)
+ ldl INCX, 16 + STACKSIZE($sp)
+ ldl Y, 24 + STACKSIZE($sp)
+ ldl INCY, 32 + STACKSIZE($sp)
+ ldl BUFFER, 40 + STACKSIZE($sp)
+
+ fstd $f2, 0($sp)
+ fstd $f3, 8($sp)
+ fstd $f4, 16($sp)
+ fstd $f5, 24($sp)
+ fstd $f6, 32($sp)
+ fstd $f7, 40($sp)
+ fstd $f8, 48($sp)
+ fstd $f9, 56($sp)
+
+ PROFCODE
+
+ cmple M, 0, $0
+ sll INCX, ZBASE_SHIFT, INCX
+ cmple N, 0, $1
+ sll INCY, ZBASE_SHIFT, INCY
+
+ or $0, $1, $0
+ bne $0, $L999
+
+ cmpeq INCX, 2 * SIZE, $0
+ mov X, X1
+	sll	LDA, ZBASE_SHIFT, LDA
+ bne $0, $L10
+
+ sra M, 2, I
+ mov BUFFER, Y1
+ mov BUFFER, X
+ ble I, $L05
+ .align 4
+
+$L02:
+ fillcs (PREFETCHSIZE + 0) * SIZE(X1)
+ ldi I, -1(I)
+
+ LD a0, 0 * SIZE(X1)
+ LD a1, 1 * SIZE(X1)
+ addl X1, INCX, X1
+ LD a2, 0 * SIZE(X1)
+ LD a3, 1 * SIZE(X1)
+ addl X1, INCX, X1
+
+ ST a0, 0 * SIZE(Y1)
+ ST a1, 1 * SIZE(Y1)
+ ST a2, 2 * SIZE(Y1)
+ ST a3, 3 * SIZE(Y1)
+
+ LD a4, 0 * SIZE(X1)
+ LD a5, 1 * SIZE(X1)
+ addl X1, INCX, X1
+ LD a6, 0 * SIZE(X1)
+ LD a7, 1 * SIZE(X1)
+ addl X1, INCX, X1
+
+ ST a4, 4 * SIZE(Y1)
+ ST a5, 5 * SIZE(Y1)
+ ST a6, 6 * SIZE(Y1)
+ ST a7, 7 * SIZE(Y1)
+
+ ldi Y1, 8 * SIZE(Y1)
+ bgt I, $L02
+ .align 4
+
+$L05:
+ and M, 3, I
+ ble I, $L10
+ .align 4
+
+$L06:
+ LD a0, 0 * SIZE(X1)
+ LD a1, 1 * SIZE(X1)
+ addl X1, INCX, X1
+
+ ST a0, 0 * SIZE(Y1)
+ ST a1, 1 * SIZE(Y1)
+ ldi Y1, 2 * SIZE(Y1)
+
+ ldi I, -1(I)
+ bgt I, $L06
+ .align 4
+
+$L10:
+ mov Y, Y1
+ fclr t0
+ unop
+ fclr t1
+
+ sra N, 1, J
+ fclr t2
+ fclr t3
+ ble J, $L20
+ .align 4
+
+$L11:
+ mov A, A1
+ fclr s0
+ addl A, LDA, A2
+ fclr s1
+
+ addl A2, LDA, A
+ unop
+ mov X, X1
+ fillcs 3 * SIZE(Y)
+
+ sra M, 2, I
+ fclr s2
+ fclr s3
+ ble I, $L15
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+ LD a2, 0 * SIZE(A2)
+ LD a3, 1 * SIZE(A2)
+ LD a4, 2 * SIZE(A1)
+ LD a5, 3 * SIZE(A1)
+ LD a6, 2 * SIZE(A2)
+ LD a7, 3 * SIZE(A2)
+
+ LD a8, 4 * SIZE(A1)
+ LD a9, 5 * SIZE(A1)
+ LD a10, 4 * SIZE(A2)
+ LD a11, 5 * SIZE(A2)
+ LD a12, 6 * SIZE(A1)
+ LD a13, 7 * SIZE(A1)
+ LD a14, 6 * SIZE(A2)
+ LD a15, 7 * SIZE(A2)
+
+ LD x0, 0 * SIZE(X1)
+ LD x1, 1 * SIZE(X1)
+ LD x2, 2 * SIZE(X1)
+
+ ldi I, -1(I)
+ ble I, $L13
+ .align 4
+
+$L12:
+ ADD3 s0, t0, s0
+ unop
+ MUL x0, a0, t0
+ LD x3, 3 * SIZE(X1)
+
+ ADD4 s1, t1, s1
+ fillcs (PREFETCHSIZE + 0) * SIZE(A1)
+ MUL x0, a1, t1
+ unop
+
+ ADD3 s2, t2, s2
+ unop
+ MUL x0, a2, t2
+ unop
+
+ ADD4 s3, t3, s3
+ unop
+ MUL x0, a3, t3
+ LD x0, 4 * SIZE(X1)
+
+ ADD1 s0, t0, s0
+ unop
+ MUL x1, a1, t0
+ LD a1, 9 * SIZE(A1)
+
+ ADD2 s1, t1, s1
+ unop
+ MUL x1, a0, t1
+ LD a0, 8 * SIZE(A1)
+
+ ADD1 s2, t2, s2
+ unop
+ MUL x1, a3, t2
+ LD a3, 9 * SIZE(A2)
+
+ ADD2 s3, t3, s3
+ unop
+ MUL x1, a2, t3
+ LD a2, 8 * SIZE(A2)
+
+ ADD3 s0, t0, s0
+ unop
+ MUL x2, a4, t0
+ LD x1, 5 * SIZE(X1)
+
+ ADD4 s1, t1, s1
+ MUL x2, a5, t1
+ ADD3 s2, t2, s2
+ MUL x2, a6, t2
+
+ ADD4 s3, t3, s3
+ unop
+ MUL x2, a7, t3
+ LD x2, 6 * SIZE(X1)
+
+ ADD1 s0, t0, s0
+ unop
+ MUL x3, a5, t0
+ LD a5, 11 * SIZE(A1)
+
+ ADD2 s1, t1, s1
+ unop
+ MUL x3, a4, t1
+ LD a4, 10 * SIZE(A1)
+
+ ADD1 s2, t2, s2
+ unop
+ MUL x3, a7, t2
+ LD a7, 11 * SIZE(A2)
+
+ ADD2 s3, t3, s3
+ unop
+ MUL x3, a6, t3
+ LD a6, 10 * SIZE(A2)
+
+ ADD3 s0, t0, s0
+ unop
+ MUL x0, a8, t0
+ LD x3, 7 * SIZE(X1)
+
+ ADD4 s1, t1, s1
+ fillcs (PREFETCHSIZE + 0) * SIZE(A2)
+ MUL x0, a9, t1
+ unop
+
+ ADD3 s2, t2, s2
+ ldi I, -1(I)
+ MUL x0, a10, t2
+ unop
+
+ ADD4 s3, t3, s3
+ unop
+ MUL x0, a11, t3
+ LD x0, 8 * SIZE(X1)
+
+ ADD1 s0, t0, s0
+ unop
+ MUL x1, a9, t0
+ LD a9, 13 * SIZE(A1)
+
+ ADD2 s1, t1, s1
+ unop
+ MUL x1, a8, t1
+ LD a8, 12 * SIZE(A1)
+
+ ADD1 s2, t2, s2
+ ldi A1, 8 * SIZE(A1)
+ MUL x1, a11, t2
+ LD a11, 13 * SIZE(A2)
+
+ ADD2 s3, t3, s3
+ unop
+ MUL x1, a10, t3
+ LD a10, 12 * SIZE(A2)
+
+ ADD3 s0, t0, s0
+ unop
+ MUL x2, a12, t0
+ LD x1, 9 * SIZE(X1)
+
+ ADD4 s1, t1, s1
+ fillcs (PREFETCHSIZE + 0) * SIZE(X1)
+ MUL x2, a13, t1
+ ldi A2, 8 * SIZE(A2)
+
+ ADD3 s2, t2, s2
+ unop
+ MUL x2, a14, t2
+ unop
+
+ ADD4 s3, t3, s3
+ unop
+ MUL x2, a15, t3
+ LD x2, 10 * SIZE(X1)
+
+ ADD1 s0, t0, s0
+ unop
+ MUL x3, a13, t0
+ LD a13, 7 * SIZE(A1)
+
+ ADD2 s1, t1, s1
+ ldi X1, 8 * SIZE(X1)
+ MUL x3, a12, t1
+ LD a12, 6 * SIZE(A1)
+
+ ADD1 s2, t2, s2
+ unop
+ MUL x3, a15, t2
+ LD a15, 7 * SIZE(A2)
+
+ ADD2 s3, t3, s3
+ MUL x3, a14, t3
+ LD a14, 6 * SIZE(A2)
+ bgt I, $L12
+ .align 4
+
+$L13:
+ ADD3 s0, t0, s0
+ unop
+ MUL x0, a0, t0
+ LD x3, 3 * SIZE(X1)
+
+ ADD4 s1, t1, s1
+ MUL x0, a1, t1
+ ADD3 s2, t2, s2
+ MUL x0, a2, t2
+
+ ADD4 s3, t3, s3
+ unop
+ MUL x0, a3, t3
+ LD x0, 4 * SIZE(X1)
+
+ ADD1 s0, t0, s0
+ MUL x1, a1, t0
+ ADD2 s1, t1, s1
+ MUL x1, a0, t1
+
+ ADD1 s2, t2, s2
+ unop
+ MUL x1, a3, t2
+ unop
+
+ ADD2 s3, t3, s3
+ ldi A1, 8 * SIZE(A1)
+ MUL x1, a2, t3
+ LD x1, 5 * SIZE(X1)
+
+ ADD3 s0, t0, s0
+ MUL x2, a4, t0
+ ADD4 s1, t1, s1
+ MUL x2, a5, t1
+
+ ADD3 s2, t2, s2
+ unop
+ MUL x2, a6, t2
+ unop
+
+ ADD4 s3, t3, s3
+ ldi A2, 8 * SIZE(A2)
+ MUL x2, a7, t3
+ LD x2, 6 * SIZE(X1)
+
+ ADD1 s0, t0, s0
+ MUL x3, a5, t0
+ ADD2 s1, t1, s1
+ MUL x3, a4, t1
+
+ ADD1 s2, t2, s2
+ unop
+ MUL x3, a7, t2
+ ldi X1, 8 * SIZE(X1)
+
+ ADD2 s3, t3, s3
+ unop
+ MUL x3, a6, t3
+ LD x3, -1 * SIZE(X1)
+
+ ADD3 s0, t0, s0
+ MUL x0, a8, t0
+ ADD4 s1, t1, s1
+ MUL x0, a9, t1
+
+ ADD3 s2, t2, s2
+ MUL x0, a10, t2
+ ADD4 s3, t3, s3
+ MUL x0, a11, t3
+
+ ADD1 s0, t0, s0
+ MUL x1, a9, t0
+ ADD2 s1, t1, s1
+ MUL x1, a8, t1
+
+ ADD1 s2, t2, s2
+ MUL x1, a11, t2
+ ADD2 s3, t3, s3
+ MUL x1, a10, t3
+
+ ADD3 s0, t0, s0
+ MUL x2, a12, t0
+ ADD4 s1, t1, s1
+ MUL x2, a13, t1
+
+ ADD3 s2, t2, s2
+ MUL x2, a14, t2
+ ADD4 s3, t3, s3
+ MUL x2, a15, t3
+
+ ADD1 s0, t0, s0
+ MUL x3, a13, t0
+ ADD2 s1, t1, s1
+ MUL x3, a12, t1
+
+ ADD1 s2, t2, s2
+ MUL x3, a15, t2
+ ADD2 s3, t3, s3
+ MUL x3, a14, t3
+ .align 4
+
+$L15:
+ and M, 3, I
+ ble I, $L18
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+ LD a2, 0 * SIZE(A2)
+ LD a3, 1 * SIZE(A2)
+
+ LD x0, 0 * SIZE(X1)
+
+ ldi I, -1(I)
+ ble I, $L17
+ .align 4
+
+$L16:
+ ADD3 s0, t0, s0
+ ldi I, -1(I)
+ MUL x0, a0, t0
+ LD x1, 1 * SIZE(X1)
+
+ ADD4 s1, t1, s1
+ MUL x0, a1, t1
+ ADD3 s2, t2, s2
+ MUL x0, a2, t2
+
+ ADD4 s3, t3, s3
+ unop
+ MUL x0, a3, t3
+ LD x0, 2 * SIZE(X1)
+
+ ADD1 s0, t0, s0
+ ldi A2, 2 * SIZE(A2)
+ MUL x1, a1, t0
+ LD a1, 3 * SIZE(A1)
+
+ ADD2 s1, t1, s1
+ ldi X1, 2 * SIZE(X1)
+ MUL x1, a0, t1
+ LD a0, 2 * SIZE(A1)
+
+ ADD1 s2, t2, s2
+ ldi A1, 2 * SIZE(A1)
+ MUL x1, a3, t2
+ LD a3, 1 * SIZE(A2)
+
+ ADD2 s3, t3, s3
+ MUL x1, a2, t3
+ LD a2, 0 * SIZE(A2)
+ bgt I, $L16
+ .align 4
+
+$L17:
+ ADD3 s0, t0, s0
+ unop
+ MUL x0, a0, t0
+ LD x1, 1 * SIZE(X1)
+
+ ADD4 s1, t1, s1
+ unop
+ MUL x0, a1, t1
+ unop
+
+ ADD3 s2, t2, s2
+ MUL x0, a2, t2
+ ADD4 s3, t3, s3
+ MUL x0, a3, t3
+
+ ADD1 s0, t0, s0
+ MUL x1, a1, t0
+ ADD2 s1, t1, s1
+ MUL x1, a0, t1
+
+ ADD1 s2, t2, s2
+ MUL x1, a3, t2
+ ADD2 s3, t3, s3
+ MUL x1, a2, t3
+ .align 4
+
+$L18:
+ LD a0, 0 * SIZE(Y)
+ unop
+ LD a1, 1 * SIZE(Y)
+ addl Y, INCY, Y
+
+ LD a2, 0 * SIZE(Y)
+ unop
+ LD a3, 1 * SIZE(Y)
+ addl Y, INCY, Y
+
+ ADD3 s0, t0, s0
+ ADD4 s1, t1, s1
+ ADD3 s2, t2, s2
+ ADD4 s3, t3, s3
+
+ MUL alpha_r, s0, t0
+ MUL alpha_r, s1, t1
+ MUL alpha_r, s2, t2
+ MUL alpha_r, s3, t3
+
+ ADD a0, t0, a0
+ MUL alpha_i, s1, t0
+ ADD a1, t1, a1
+ MUL alpha_i, s0, t1
+ ADD a2, t2, a2
+ MUL alpha_i, s3, t2
+ ADD a3, t3, a3
+ MUL alpha_i, s2, t3
+
+ SUB a0, t0, a0
+ ADD a1, t1, a1
+ SUB a2, t2, a2
+ ADD a3, t3, a3
+
+ ST a0, 0 * SIZE(Y1)
+ fclr t0
+ ST a1, 1 * SIZE(Y1)
+ addl Y1, INCY, Y1
+
+ ST a2, 0 * SIZE(Y1)
+ fclr t1
+ ST a3, 1 * SIZE(Y1)
+ addl Y1, INCY, Y1
+
+ fclr t2
+ ldi J, -1(J)
+ fclr t3
+ bgt J, $L11
+ .align 4
+
+$L20:
+ blbc N, $L999
+
+ mov A, A1
+ fclr s0
+ fclr s1
+ mov X, X1
+
+ sra M, 2, I
+ fclr s2
+ fclr s3
+ ble I, $L25
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+ LD a4, 2 * SIZE(A1)
+ LD a5, 3 * SIZE(A1)
+ LD a8, 4 * SIZE(A1)
+ LD a9, 5 * SIZE(A1)
+ LD a12, 6 * SIZE(A1)
+ LD a13, 7 * SIZE(A1)
+
+ LD x0, 0 * SIZE(X1)
+ LD x1, 1 * SIZE(X1)
+ LD x2, 2 * SIZE(X1)
+
+ ldi I, -1(I)
+ ble I, $L23
+ .align 4
+
+$L22:
+ ADD3 s0, t0, s0
+ fillcs (PREFETCHSIZE + 0) * SIZE(A1)
+ MUL x0, a0, t0
+ LD x3, 3 * SIZE(X1)
+
+ ADD4 s1, t1, s1
+ unop
+ MUL x0, a1, t1
+ LD x0, 4 * SIZE(X1)
+
+ ADD1 s2, t0, s2
+ ldi I, -1(I)
+ MUL x1, a1, t0
+ LD a1, 9 * SIZE(A1)
+
+ ADD2 s3, t1, s3
+ unop
+ MUL x1, a0, t1
+ LD a0, 8 * SIZE(A1)
+
+ ADD3 s0, t0, s0
+ unop
+ MUL x2, a4, t0
+ LD x1, 5 * SIZE(X1)
+
+ ADD4 s1, t1, s1
+ unop
+ MUL x2, a5, t1
+ LD x2, 6 * SIZE(X1)
+
+ ADD1 s2, t0, s2
+ unop
+ MUL x3, a5, t0
+ LD a5, 11 * SIZE(A1)
+
+ ADD2 s3, t1, s3
+ unop
+ MUL x3, a4, t1
+ LD a4, 10 * SIZE(A1)
+
+ ADD3 s0, t0, s0
+ unop
+ MUL x0, a8, t0
+ LD x3, 7 * SIZE(X1)
+
+ ADD4 s1, t1, s1
+ unop
+ MUL x0, a9, t1
+ LD x0, 8 * SIZE(X1)
+
+ ADD1 s2, t0, s2
+ unop
+ MUL x1, a9, t0
+ LD a9, 13 * SIZE(A1)
+
+ ADD2 s3, t1, s3
+ unop
+ MUL x1, a8, t1
+ LD a8, 12 * SIZE(A1)
+
+ ADD3 s0, t0, s0
+ unop
+ MUL x2, a12, t0
+ LD x1, 9 * SIZE(X1)
+
+ ADD4 s1, t1, s1
+ ldi A1, 8 * SIZE(A1)
+ MUL x2, a13, t1
+ LD x2, 10 * SIZE(X1)
+
+ ADD1 s2, t0, s2
+ ldi X1, 8 * SIZE(X1)
+ MUL x3, a13, t0
+ LD a13, 7 * SIZE(A1)
+
+ ADD2 s3, t1, s3
+ MUL x3, a12, t1
+ LD a12, 6 * SIZE(A1)
+ bgt I, $L22
+ .align 4
+
+$L23:
+ ADD3 s0, t0, s0
+ unop
+ MUL x0, a0, t0
+ LD x3, 3 * SIZE(X1)
+
+ ADD4 s1, t1, s1
+ unop
+ MUL x0, a1, t1
+ LD x0, 4 * SIZE(X1)
+
+ ADD1 s2, t0, s2
+ unop
+ MUL x1, a1, t0
+ ldi A1, 8 * SIZE(A1)
+
+ ADD2 s3, t1, s3
+ unop
+ MUL x1, a0, t1
+ LD x1, 5 * SIZE(X1)
+
+ ADD3 s0, t0, s0
+ unop
+ MUL x2, a4, t0
+ unop
+
+ ADD4 s1, t1, s1
+ unop
+ MUL x2, a5, t1
+ LD x2, 6 * SIZE(X1)
+
+ ADD1 s2, t0, s2
+ unop
+ MUL x3, a5, t0
+ ldi X1, 8 * SIZE(X1)
+
+ ADD2 s3, t1, s3
+ unop
+ MUL x3, a4, t1
+ LD x3, -1 * SIZE(X1)
+
+ ADD3 s0, t0, s0
+ MUL x0, a8, t0
+ ADD4 s1, t1, s1
+ MUL x0, a9, t1
+
+ ADD1 s2, t0, s2
+ MUL x1, a9, t0
+ ADD2 s3, t1, s3
+ MUL x1, a8, t1
+
+ ADD3 s0, t0, s0
+ MUL x2, a12, t0
+ ADD4 s1, t1, s1
+ MUL x2, a13, t1
+
+ ADD1 s2, t0, s2
+ MUL x3, a13, t0
+ ADD2 s3, t1, s3
+ MUL x3, a12, t1
+ .align 4
+
+$L25:
+ and M, 3, I
+ ble I, $L28
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 1 * SIZE(A1)
+
+ LD x0, 0 * SIZE(X1)
+
+ ldi I, -1(I)
+ ble I, $L27
+ .align 4
+
+$L26:
+ ADD3 s0, t0, s0
+ ldi A1, 2 * SIZE(A1)
+ MUL x0, a0, t0
+ LD x1, 1 * SIZE(X1)
+
+ ADD4 s1, t1, s1
+ ldi I, -1(I)
+ MUL x0, a1, t1
+ LD x0, 2 * SIZE(X1)
+
+ ADD1 s0, t0, s0
+ ldi X1, 2 * SIZE(X1)
+ MUL x1, a1, t0
+ LD a1, 1 * SIZE(A1)
+
+ ADD2 s1, t1, s1
+ MUL x1, a0, t1
+ LD a0, 0 * SIZE(A1)
+ bgt I, $L26
+ .align 4
+
+$L27:
+ ADD3 s0, t0, s0
+ unop
+ MUL x0, a0, t0
+ LD x1, 1 * SIZE(X1)
+
+ ADD4 s1, t1, s1
+ unop
+ MUL x0, a1, t1
+ unop
+
+ ADD1 s0, t0, s0
+ MUL x1, a1, t0
+ ADD2 s1, t1, s1
+ MUL x1, a0, t1
+ .align 4
+
+$L28:
+ LD a0, 0 * SIZE(Y)
+ LD a1, 1 * SIZE(Y)
+
+ ADD3 s0, t0, s0
+ ADD4 s1, t1, s1
+ ADD3 s2, t2, s2
+ ADD4 s3, t3, s3
+
+ ADD s0, s2, s0
+ ADD s1, s3, s1
+
+ MUL alpha_r, s0, t0
+ MUL alpha_r, s1, t1
+
+ ADD a0, t0, a0
+ MUL alpha_i, s1, t0
+ ADD a1, t1, a1
+ MUL alpha_i, s0, t1
+
+ SUB a0, t0, a0
+ ADD a1, t1, a1
+
+ ST a0, 0 * SIZE(Y1)
+ ST a1, 1 * SIZE(Y1)
+ .align 4
+
+$L999:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+
+ ldi $sp, STACKSIZE($sp)
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/znrm2.S b/kernel/sw_64/znrm2.S
new file mode 100644
index 0000000..c1b7375
--- /dev/null
+++ b/kernel/sw_64/znrm2.S
@@ -0,0 +1,441 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+#include "version.h"
+
+#define PREFETCH_SIZE 80
+
+#define N $16
+#define X $17
+#define INCX $18
+#define XX $19
+
+#define I $0
+
+#define a0 $f0
+#define a1 $f1
+#define a2 $f10
+#define a3 $f11
+#define t0 $f12
+#define t1 $f13
+#define t2 $f14
+#define t3 $f15
+
+#define x0 $f16
+#define x1 $f17
+#define x2 $f18
+#define x3 $f19
+#define x4 $f20
+#define x5 $f21
+#define x6 $f22
+#define x7 $f23
+
+ PROLOGUE
+
+#if defined(EV4) || defined(EV5)
+ .frame $30,16,$26,0
+ .mask 0x4000000,-16
+ ldih $29, 0($27) !gpdisp!1
+ ldi $29, 0($29) !gpdisp!1
+
+ ldi $sp, -16($sp)
+ ldl $27, sqrt($29) !literal!2
+ stl $26, 0($sp)
+
+ PROFCODE
+ .prologue 1
+#else
+ PROFCODE
+#endif
+
+ fclr a0
+ sll INCX, ZBASE_SHIFT, INCX
+ fclr a1
+ ble N, $L999
+
+ fclr a2
+ cmpeq INCX, 2 * SIZE, $0
+ fclr a3
+ beq $0, $L20
+
+ fclr t0
+ sra N, 3, I
+ fclr t1
+ ble I, $L15
+
+ fclr t2
+ LD x0, 0 * SIZE(X)
+ fclr t3
+ LD x1, 1 * SIZE(X)
+
+ LD x2, 2 * SIZE(X)
+ LD x3, 3 * SIZE(X)
+ LD x4, 4 * SIZE(X)
+ LD x5, 5 * SIZE(X)
+ LD x6, 6 * SIZE(X)
+ LD x7, 7 * SIZE(X)
+
+ ldi I, -1(I)
+ ble I, $L12
+ .align 4
+
+$L11:
+ faddd a0, t0, $f25
+ fillcs (PREFETCH_SIZE) * SIZE(X)
+ fmuld x0, x0, t0
+ LD x0, 8 * SIZE(X)
+
+ faddd a1, t1, $f26
+ mov X, XX
+ fmuld x1, x1, t1
+ LD x1, 9 * SIZE(X)
+
+ faddd a2, t2, $f27
+ unop
+ fmuld x2, x2, t2
+ LD x2, 10 * SIZE(X)
+
+ faddd a3, t3, $f28
+ unop
+ fmuld x3, x3, t3
+ LD x3, 11 * SIZE(X)
+
+ faddd $f25, t0, a0
+ unop
+ fmuld x4, x4, t0
+ LD x4, 12 * SIZE(X)
+
+ faddd $f26, t1, a1
+ unop
+ fmuld x5, x5, t1
+ LD x5, 13 * SIZE(X)
+
+ faddd $f27, t2, a2
+ unop
+ fmuld x6, x6, t2
+ LD x6, 14 * SIZE(X)
+
+ faddd $f28, t3, a3
+ unop
+ fmuld x7, x7, t3
+ LD x7, 15 * SIZE(X)
+
+ faddd a0, t0, $f25
+ unop
+ fmuld x0, x0, t0
+ LD x0, 16 * SIZE(X)
+
+ faddd a1, t1, $f26
+ ldi X, 16 * SIZE(X)
+ fmuld x1, x1, t1
+ LD x1, 17 * SIZE(XX)
+
+ faddd a2, t2, $f27
+ unop
+ fmuld x2, x2, t2
+ LD x2, 18 * SIZE(XX)
+
+ faddd a3, t3, $f28
+ unop
+ fmuld x3, x3, t3
+ LD x3, 19 * SIZE(XX)
+
+ faddd $f25, t0, a0
+ unop
+ fmuld x4, x4, t0
+ LD x4, 20 * SIZE(XX)
+
+ faddd $f26, t1, a1
+ ldi I, -1(I)
+ fmuld x5, x5, t1
+ LD x5, 21 * SIZE(XX)
+
+ faddd $f27, t2, a2
+ unop
+ fmuld x6, x6, t2
+ LD x6, 22 * SIZE(XX)
+
+ faddd $f28, t3, a3
+ fmuld x7, x7, t3
+ LD x7, 23 * SIZE(XX)
+ bgt I, $L11
+ .align 4
+
+$L12:
+ faddd a0, t0, $f25
+ mov X, XX
+ fmuld x0, x0, t0
+ LD x0, 8 * SIZE(X)
+
+ faddd a1, t1, $f26
+ unop
+ fmuld x1, x1, t1
+ LD x1, 9 * SIZE(X)
+
+ faddd a2, t2, $f27
+ unop
+ fmuld x2, x2, t2
+ LD x2, 10 * SIZE(X)
+
+ faddd a3, t3, $f28
+ unop
+ fmuld x3, x3, t3
+ LD x3, 11 * SIZE(X)
+
+ faddd $f25, t0, a0
+ unop
+ fmuld x4, x4, t0
+ LD x4, 12 * SIZE(XX)
+
+ faddd $f26, t1, a1
+ unop
+ fmuld x5, x5, t1
+ LD x5, 13 * SIZE(XX)
+
+ faddd $f27, t2, a2
+ unop
+ fmuld x6, x6, t2
+ LD x6, 14 * SIZE(XX)
+
+ faddd $f28, t3, a3
+ ldi X, 16 * SIZE(X)
+ fmuld x7, x7, t3
+ LD x7, 15 * SIZE(XX)
+
+ faddd a0, t0, $f25
+ fmuld x0, x0, t0
+ faddd a1, t1, $f26
+ fmuld x1, x1, t1
+
+ faddd a2, t2, $f27
+ fmuld x2, x2, t2
+ faddd a3, t3, $f28
+ fmuld x3, x3, t3
+
+ faddd $f25, t0, a0
+ fmuld x4, x4, t0
+ faddd $f26, t1, a1
+ fmuld x5, x5, t1
+
+ faddd $f27, t2, a2
+ fmuld x6, x6, t2
+ faddd $f28, t3, a3
+ fmuld x7, x7, t3
+
+ faddd a2, t2, $f27
+ fmov $f27, a2
+ faddd a3, t3, $f28
+ fmov $f28, a3
+ .align 4
+
+$L15:
+ and N, 7, I
+ ble I, $L998
+ .align 4
+
+$L16:
+ LD x0, 0 * SIZE(X)
+ LD x1, 1 * SIZE(X)
+
+ ldi X, 2 * SIZE(X)
+
+ faddd a0, t0, $f25
+ fmov $f25, a0
+ fmuld x0, x0, t0
+ faddd a1, t1, $f26
+ fmov $f26, a1
+ fmuld x1, x1, t1
+
+ ldi I, -1(I)
+ bgt I, $L16
+ bsr $31, $L998
+ .align 4
+
+$L20:
+ fclr t0
+ sra N, 2, I
+ fclr t1
+ ble I, $L25
+
+ LD x0, 0 * SIZE(X)
+ fclr t2
+ LD x1, 1 * SIZE(X)
+ addl X, INCX, X
+ LD x2, 0 * SIZE(X)
+ fclr t3
+ LD x3, 1 * SIZE(X)
+ addl X, INCX, X
+
+ LD x4, 0 * SIZE(X)
+ ldi I, -1(I)
+ LD x5, 1 * SIZE(X)
+ addl X, INCX, X
+
+ LD x6, 0 * SIZE(X)
+ ble I, $L22
+ .align 4
+
+$L21:
+ faddd a0, t0, $f25
+ LD x7, 1 * SIZE(X)
+ fmuld x0, x0, t0
+ addl X, INCX, X
+
+ faddd a1, t1, $f26
+ LD x0, 0 * SIZE(X)
+ fmuld x1, x1, t1
+ unop
+
+ faddd a2, t2, $f27
+ LD x1, 1 * SIZE(X)
+ fmuld x2, x2, t2
+ addl X, INCX, X
+
+ faddd a3, t3, $f28
+ LD x2, 0 * SIZE(X)
+ fmuld x3, x3, t3
+ unop
+
+ faddd $f25, t0, a0
+ LD x3, 1 * SIZE(X)
+ fmuld x4, x4, t0
+ addl X, INCX, X
+
+ faddd $f26, t1, a1
+ LD x4, 0 * SIZE(X)
+ fmuld x5, x5, t1
+ ldi I, -1(I)
+
+ faddd $f27, t2, a2
+ LD x5, 1 * SIZE(X)
+ fmuld x6, x6, t2
+ addl X, INCX, X
+
+ faddd $f28, t3, a3
+ LD x6, 0 * SIZE(X)
+ fmuld x7, x7, t3
+ bgt I, $L21
+ .align 4
+
+$L22:
+ faddd a0, t0, $f25
+ LD x7, 1 * SIZE(X)
+ fmuld x0, x0, t0
+ addl X, INCX, X
+
+ faddd a1, t1, $f26
+ fmuld x1, x1, t1
+ faddd a2, t2, $f27
+ fmuld x2, x2, t2
+
+ faddd a3, t3, $f28
+ fmuld x3, x3, t3
+ faddd $f25, t0, a0
+ fmuld x4, x4, t0
+
+ faddd $f26, t1, a1
+ fmuld x5, x5, t1
+ faddd $f27, t2, a2
+ fmuld x6, x6, t2
+
+ faddd $f28, t3, a3
+ fmuld x7, x7, t3
+
+ faddd a2, t2, $f27
+ fmov $f27, a2
+ faddd a3, t3, $f28
+ fmov $f28, a3
+ .align 4
+
+$L25:
+ and N, 3, I
+ ble I, $L998
+ .align 4
+
+$L26:
+ LD x0, 0 * SIZE(X)
+ ldi I, -1(I)
+ LD x1, 1 * SIZE(X)
+ addl X, INCX, X
+
+ faddd a0, t0, $f25
+ fmov $f25, a0
+ fmuld x0, x0, t0
+ faddd a1, t1, $f26
+ fmov $f26, a1
+ fmuld x1, x1, t1
+
+ bgt I, $L26
+ .align 4
+
+
+$L998:
+ faddd a0, t0, $f25
+ faddd a1, t1, $f26
+ fmov $f25, a0
+ fmov $f26, a1
+
+ faddd a0, a1, $f25
+ fmov $f25, a0
+ faddd a2, a3, $f26
+ fmov $f26, a2
+
+#if defined(EV4) || defined(EV5)
+ faddd a0, a2, $f16
+ jsr $26, ($27), sqrt !lituse_jsr!2
+
+ ldih $29, 0($26) !gpdisp!3
+ ldi $29, 0($29) !gpdisp!3
+#else
+ faddd a0, a2, $f25
+ fmov $f25, a0
+ fsqrtd a0, $f25
+ fmov $f25, a0
+#endif
+ .align 4
+
+$L999:
+#if defined(EV4) || defined(EV5)
+ ldl $26, 0($sp)
+ ldi $sp, 16($sp)
+#endif
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/znrm2.S.bak b/kernel/sw_64/znrm2.S.bak
new file mode 100644
index 0000000..b2e80e0
--- /dev/null
+++ b/kernel/sw_64/znrm2.S.bak
@@ -0,0 +1,426 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+#include "version.h"
+
+#define PREFETCH_SIZE 80
+
+#define N $16
+#define X $17
+#define INCX $18
+#define XX $19
+
+#define I $0
+
+#define a0 $f0
+#define a1 $f1
+#define a2 $f10
+#define a3 $f11
+#define t0 $f12
+#define t1 $f13
+#define t2 $f14
+#define t3 $f15
+
+#define x0 $f16
+#define x1 $f17
+#define x2 $f18
+#define x3 $f19
+#define x4 $f20
+#define x5 $f21
+#define x6 $f22
+#define x7 $f23
+
+ PROLOGUE
+
+#if defined(EV4) || defined(EV5)
+ .frame $30,16,$26,0
+ .mask 0x4000000,-16
+ ldih $29, 0($27) !gpdisp!1
+ ldi $29, 0($29) !gpdisp!1
+
+ ldi $sp, -16($sp)
+ ldl $27, sqrt($29) !literal!2
+ stq $26, 0($sp)
+
+ PROFCODE
+ .prologue 1
+#else
+ PROFCODE
+#endif
+
+ fclr a0
+ sll INCX, ZBASE_SHIFT, INCX
+ fclr a1
+ ble N, $L999
+
+ fclr a2
+ cmpeq INCX, 2 * SIZE, $0
+ fclr a3
+ beq $0, $L20
+
+ fclr t0
+ sra N, 3, I
+ fclr t1
+ ble I, $L15
+
+ fclr t2
+ LD x0, 0 * SIZE(X)
+ fclr t3
+ LD x1, 1 * SIZE(X)
+
+ LD x2, 2 * SIZE(X)
+ LD x3, 3 * SIZE(X)
+ LD x4, 4 * SIZE(X)
+ LD x5, 5 * SIZE(X)
+ LD x6, 6 * SIZE(X)
+ LD x7, 7 * SIZE(X)
+
+ ldi I, -1(I)
+ ble I, $L12
+ .align 4
+
+$L11:
+ faddd a0, t0, a0
+ fillcs (PREFETCH_SIZE) * SIZE(X)
+ fmuld x0, x0, t0
+ LD x0, 8 * SIZE(X)
+
+ faddd a1, t1, a1
+ mov X, XX
+ fmuld x1, x1, t1
+ LD x1, 9 * SIZE(X)
+
+ faddd a2, t2, a2
+ unop
+ fmuld x2, x2, t2
+ LD x2, 10 * SIZE(X)
+
+ faddd a3, t3, a3
+ unop
+ fmuld x3, x3, t3
+ LD x3, 11 * SIZE(X)
+
+ faddd a0, t0, a0
+ unop
+ fmuld x4, x4, t0
+ LD x4, 12 * SIZE(X)
+
+ faddd a1, t1, a1
+ unop
+ fmuld x5, x5, t1
+ LD x5, 13 * SIZE(X)
+
+ faddd a2, t2, a2
+ unop
+ fmuld x6, x6, t2
+ LD x6, 14 * SIZE(X)
+
+ faddd a3, t3, a3
+ unop
+ fmuld x7, x7, t3
+ LD x7, 15 * SIZE(X)
+
+ faddd a0, t0, a0
+ unop
+ fmuld x0, x0, t0
+ LD x0, 16 * SIZE(X)
+
+ faddd a1, t1, a1
+ ldi X, 16 * SIZE(X)
+ fmuld x1, x1, t1
+ LD x1, 17 * SIZE(XX)
+
+ faddd a2, t2, a2
+ unop
+ fmuld x2, x2, t2
+ LD x2, 18 * SIZE(XX)
+
+ faddd a3, t3, a3
+ unop
+ fmuld x3, x3, t3
+ LD x3, 19 * SIZE(XX)
+
+ faddd a0, t0, a0
+ unop
+ fmuld x4, x4, t0
+ LD x4, 20 * SIZE(XX)
+
+ faddd a1, t1, a1
+ ldi I, -1(I)
+ fmuld x5, x5, t1
+ LD x5, 21 * SIZE(XX)
+
+ faddd a2, t2, a2
+ unop
+ fmuld x6, x6, t2
+ LD x6, 22 * SIZE(XX)
+
+ faddd a3, t3, a3
+ fmuld x7, x7, t3
+ LD x7, 23 * SIZE(XX)
+ bgt I, $L11
+ .align 4
+
+$L12:
+ faddd a0, t0, a0
+ mov X, XX
+ fmuld x0, x0, t0
+ LD x0, 8 * SIZE(X)
+
+ faddd a1, t1, a1
+ unop
+ fmuld x1, x1, t1
+ LD x1, 9 * SIZE(X)
+
+ faddd a2, t2, a2
+ unop
+ fmuld x2, x2, t2
+ LD x2, 10 * SIZE(X)
+
+ faddd a3, t3, a3
+ unop
+ fmuld x3, x3, t3
+ LD x3, 11 * SIZE(X)
+
+ faddd a0, t0, a0
+ unop
+ fmuld x4, x4, t0
+ LD x4, 12 * SIZE(XX)
+
+ faddd a1, t1, a1
+ unop
+ fmuld x5, x5, t1
+ LD x5, 13 * SIZE(XX)
+
+ faddd a2, t2, a2
+ unop
+ fmuld x6, x6, t2
+ LD x6, 14 * SIZE(XX)
+
+ faddd a3, t3, a3
+ ldi X, 16 * SIZE(X)
+ fmuld x7, x7, t3
+ LD x7, 15 * SIZE(XX)
+
+ faddd a0, t0, a0
+ fmuld x0, x0, t0
+ faddd a1, t1, a1
+ fmuld x1, x1, t1
+
+ faddd a2, t2, a2
+ fmuld x2, x2, t2
+ faddd a3, t3, a3
+ fmuld x3, x3, t3
+
+ faddd a0, t0, a0
+ fmuld x4, x4, t0
+ faddd a1, t1, a1
+ fmuld x5, x5, t1
+
+ faddd a2, t2, a2
+ fmuld x6, x6, t2
+ faddd a3, t3, a3
+ fmuld x7, x7, t3
+
+ faddd a2, t2, a2
+ faddd a3, t3, a3
+ .align 4
+
+$L15:
+ and N, 7, I
+ ble I, $L998
+ .align 4
+
+$L16:
+ LD x0, 0 * SIZE(X)
+ LD x1, 1 * SIZE(X)
+
+ ldi X, 2 * SIZE(X)
+
+ faddd a0, t0, a0
+ fmuld x0, x0, t0
+ faddd a1, t1, a1
+ fmuld x1, x1, t1
+
+ ldi I, -1(I)
+ bgt I, $L16
+ bsr $31, $L998
+ .align 4
+
+$L20:
+ fclr t0
+ sra N, 2, I
+ fclr t1
+ ble I, $L25
+
+ LD x0, 0 * SIZE(X)
+ fclr t2
+ LD x1, 1 * SIZE(X)
+ addl X, INCX, X
+ LD x2, 0 * SIZE(X)
+ fclr t3
+ LD x3, 1 * SIZE(X)
+ addl X, INCX, X
+
+ LD x4, 0 * SIZE(X)
+ ldi I, -1(I)
+ LD x5, 1 * SIZE(X)
+ addl X, INCX, X
+
+ LD x6, 0 * SIZE(X)
+ ble I, $L22
+ .align 4
+
+$L21:
+ faddd a0, t0, a0
+ LD x7, 1 * SIZE(X)
+ fmuld x0, x0, t0
+ addl X, INCX, X
+
+ faddd a1, t1, a1
+ LD x0, 0 * SIZE(X)
+ fmuld x1, x1, t1
+ unop
+
+ faddd a2, t2, a2
+ LD x1, 1 * SIZE(X)
+ fmuld x2, x2, t2
+ addl X, INCX, X
+
+ faddd a3, t3, a3
+ LD x2, 0 * SIZE(X)
+ fmuld x3, x3, t3
+ unop
+
+ faddd a0, t0, a0
+ LD x3, 1 * SIZE(X)
+ fmuld x4, x4, t0
+ addl X, INCX, X
+
+ faddd a1, t1, a1
+ LD x4, 0 * SIZE(X)
+ fmuld x5, x5, t1
+ ldi I, -1(I)
+
+ faddd a2, t2, a2
+ LD x5, 1 * SIZE(X)
+ fmuld x6, x6, t2
+ addl X, INCX, X
+
+ faddd a3, t3, a3
+ LD x6, 0 * SIZE(X)
+ fmuld x7, x7, t3
+ bgt I, $L21
+ .align 4
+
+$L22:
+ faddd a0, t0, a0
+ LD x7, 1 * SIZE(X)
+ fmuld x0, x0, t0
+ addl X, INCX, X
+
+ faddd a1, t1, a1
+ fmuld x1, x1, t1
+ faddd a2, t2, a2
+ fmuld x2, x2, t2
+
+ faddd a3, t3, a3
+ fmuld x3, x3, t3
+ faddd a0, t0, a0
+ fmuld x4, x4, t0
+
+ faddd a1, t1, a1
+ fmuld x5, x5, t1
+ faddd a2, t2, a2
+ fmuld x6, x6, t2
+
+ faddd a3, t3, a3
+ fmuld x7, x7, t3
+ faddd a2, t2, a2
+ faddd a3, t3, a3
+ .align 4
+
+$L25:
+ and N, 3, I
+ ble I, $L998
+ .align 4
+
+$L26:
+ LD x0, 0 * SIZE(X)
+ ldi I, -1(I)
+ LD x1, 1 * SIZE(X)
+ addl X, INCX, X
+
+ faddd a0, t0, a0
+ fmuld x0, x0, t0
+ faddd a1, t1, a1
+ fmuld x1, x1, t1
+
+ bgt I, $L26
+ .align 4
+
+
+$L998:
+ faddd a0, t0, a0
+ faddd a1, t1, a1
+
+ faddd a0, a1, a0
+ faddd a2, a3, a2
+
+#if defined(EV4) || defined(EV5)
+ faddd a0, a2, $f16
+ jsr $26, ($27), sqrt !lituse_jsr!2
+
+ ldih $29, 0($26) !gpdisp!3
+ ldi $29, 0($29) !gpdisp!3
+#else
+ faddd a0, a2, a0
+ fsqrtd a0, a0
+#endif
+ .align 4
+
+$L999:
+#if defined(EV4) || defined(EV5)
+ ldl $26, 0($sp)
+ ldi $sp, 16($sp)
+#endif
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/znrm2_simd.S b/kernel/sw_64/znrm2_simd.S
new file mode 100644
index 0000000..5a509d4
--- /dev/null
+++ b/kernel/sw_64/znrm2_simd.S
@@ -0,0 +1,492 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+#include "version.h"
+
+#define PREFETCHSIZE 80
+
+#define N $16
+#define X $17
+#define INCX $18
+#define XX $19
+
+#define I $0
+
+#define a0 $f0
+#define a1 $f1
+#define a2 $f10
+#define a3 $f11
+#define t0 $f12
+#define t1 $f13
+#define t2 $f14
+#define t3 $f15
+
+#define x0 $f16
+#define x1 $f17
+#define x2 $f18
+#define x3 $f19
+#define x4 $f20
+#define x5 $f21
+#define x6 $f22
+#define x7 $f23
+
+ PROLOGUE
+
+ PROFCODE
+
+ fclr a0
+ sll INCX, ZBASE_SHIFT, INCX
+ fclr a1
+ ble N, $L999
+
+ fclr a2
+ cmpeq INCX, 2 * SIZE, $0
+ fclr a3
+	beq	$0, $L20	# strided (non-unit increment) access
+
+
+/* test the address of X */
+ and X, (VEC_LEN*SIZE-1), $3
+ fclr t0
+ fclr t1
+ bne $3, $UnAlign_ACCESS
+/* Aligned access: use SIMD instructions, unrolled by 8 complex elements */
+ sra N, 3, I
+ ble I, $Remain
+
+ VLD a0, 0*VEC_LEN*SIZE(X)
+	vcpys	$f31, $f31, t0	# clear accumulator vector t0
+ VLD a1, 1*VEC_LEN*SIZE(X)
+ vcpys $f31, $f31, t1
+
+ VLD a2, 2*VEC_LEN*SIZE(X)
+ vcpys $f31, $f31, t2
+ VLD a3, 3*VEC_LEN*SIZE(X)
+ vcpys $f31, $f31, t3
+
+ addl X, 16 * SIZE, X
+ subl I, 1, I
+ nop
+ ble I, $MainLoopEnd
+$MainLoop:
+ fillcs PREFETCHSIZE * SIZE(X)
+ VMAD a0, a0, t0, t0
+ subl I, 1, I
+ VMAD a1, a1, t1, t1
+
+ addl X, 16 * SIZE, X
+ VMAD a2, a2, t2, t2
+ nop
+ VMAD a3, a3, t3, t3
+
+ VLD a0, -4*VEC_LEN*SIZE(X)
+ VLD a1, -3*VEC_LEN*SIZE(X)
+ VLD a2, -2*VEC_LEN*SIZE(X)
+ VLD a3, -1*VEC_LEN*SIZE(X)
+
+ bgt I, $MainLoop
+ .align 4
+$MainLoopEnd:
+ VMAD a0, a0, t0, t0
+ VMAD a1, a1, t1, t1
+ VMAD a2, a2, t2, t2
+ VMAD a3, a3, t3, t3
+
+ VADD t0, t1, a0
+ VADD t2, t3, a1
+ nop
+ VADD a0, a1, t0
+
+ vextf t0, 1, t1
+ vextf t0, 2, t2
+ vextf t0, 3, t3
+ nop
+
+ ADD t0, t1, a2
+ ADD t2, t3, a3
+ fclr t1
+ ADD a2, a3, t0
+
+ .align 4
+$Remain:
+ and N, 7, I
+ ble I, $End
+ .align 4
+$RemainLoop:
+ LD a0, 0 * SIZE(X)
+ LD a1, 1 * SIZE(X)
+
+ addl X, 2*SIZE, X
+ MAD a0, a0, t0, t0
+ subl I, 1, I
+ MAD a1, a1, t1, t1
+
+ bgt I, $RemainLoop
+ .align 4
+
+ ADD t0, t1, t0
+$End:
+ SQRT t0, a0
+ ret
+ .align 4
+
+$UnAlign_ACCESS:
+
+ fclr t0
+ sra N, 3, I
+ fclr t1
+ ble I, $L15
+
+ fclr t2
+ LD x0, 0 * SIZE(X)
+ fclr t3
+ LD x1, 1 * SIZE(X)
+
+ LD x2, 2 * SIZE(X)
+ LD x3, 3 * SIZE(X)
+ LD x4, 4 * SIZE(X)
+ LD x5, 5 * SIZE(X)
+ LD x6, 6 * SIZE(X)
+ LD x7, 7 * SIZE(X)
+
+ ldi I, -1(I)
+ ble I, $L12
+ .align 4
+
+$L11:
+ ADD a0, t0, a0
+ fillcs (PREFETCHSIZE) * SIZE(X)
+ MUL x0, x0, t0
+ LD x0, 8 * SIZE(X)
+
+ ADD a1, t1, a1
+ mov X, XX
+ MUL x1, x1, t1
+ LD x1, 9 * SIZE(X)
+
+ ADD a2, t2, a2
+ unop
+ MUL x2, x2, t2
+ LD x2, 10 * SIZE(X)
+
+ ADD a3, t3, a3
+ unop
+ MUL x3, x3, t3
+ LD x3, 11 * SIZE(X)
+
+ ADD a0, t0, a0
+ unop
+ MUL x4, x4, t0
+ LD x4, 12 * SIZE(X)
+
+ ADD a1, t1, a1
+ unop
+ MUL x5, x5, t1
+ LD x5, 13 * SIZE(X)
+
+ ADD a2, t2, a2
+ unop
+ MUL x6, x6, t2
+ LD x6, 14 * SIZE(X)
+
+ ADD a3, t3, a3
+ unop
+ MUL x7, x7, t3
+ LD x7, 15 * SIZE(X)
+
+ ADD a0, t0, a0
+ unop
+ MUL x0, x0, t0
+ LD x0, 16 * SIZE(X)
+
+ ADD a1, t1, a1
+ ldi X, 16 * SIZE(X)
+ MUL x1, x1, t1
+ LD x1, 17 * SIZE(XX)
+
+ ADD a2, t2, a2
+ unop
+ MUL x2, x2, t2
+ LD x2, 18 * SIZE(XX)
+
+ ADD a3, t3, a3
+ unop
+ MUL x3, x3, t3
+ LD x3, 19 * SIZE(XX)
+
+ ADD a0, t0, a0
+ unop
+ MUL x4, x4, t0
+ LD x4, 20 * SIZE(XX)
+
+ ADD a1, t1, a1
+ ldi I, -1(I)
+ MUL x5, x5, t1
+ LD x5, 21 * SIZE(XX)
+
+ ADD a2, t2, a2
+ unop
+ MUL x6, x6, t2
+ LD x6, 22 * SIZE(XX)
+
+ ADD a3, t3, a3
+ MUL x7, x7, t3
+ LD x7, 23 * SIZE(XX)
+ bgt I, $L11
+ .align 4
+
+$L12:
+ ADD a0, t0, a0
+ mov X, XX
+ MUL x0, x0, t0
+ LD x0, 8 * SIZE(X)
+
+ ADD a1, t1, a1
+ unop
+ MUL x1, x1, t1
+ LD x1, 9 * SIZE(X)
+
+ ADD a2, t2, a2
+ unop
+ MUL x2, x2, t2
+ LD x2, 10 * SIZE(X)
+
+ ADD a3, t3, a3
+ unop
+ MUL x3, x3, t3
+ LD x3, 11 * SIZE(X)
+
+ ADD a0, t0, a0
+ unop
+ MUL x4, x4, t0
+ LD x4, 12 * SIZE(XX)
+
+ ADD a1, t1, a1
+ unop
+ MUL x5, x5, t1
+ LD x5, 13 * SIZE(XX)
+
+ ADD a2, t2, a2
+ unop
+ MUL x6, x6, t2
+ LD x6, 14 * SIZE(XX)
+
+ ADD a3, t3, a3
+ ldi X, 16 * SIZE(X)
+ MUL x7, x7, t3
+ LD x7, 15 * SIZE(XX)
+
+ ADD a0, t0, a0
+ MUL x0, x0, t0
+ ADD a1, t1, a1
+ MUL x1, x1, t1
+
+ ADD a2, t2, a2
+ MUL x2, x2, t2
+ ADD a3, t3, a3
+ MUL x3, x3, t3
+
+ ADD a0, t0, a0
+ MUL x4, x4, t0
+ ADD a1, t1, a1
+ MUL x5, x5, t1
+
+ ADD a2, t2, a2
+ MUL x6, x6, t2
+ ADD a3, t3, a3
+ MUL x7, x7, t3
+
+ ADD a2, t2, a2
+ ADD a3, t3, a3
+ .align 4
+
+$L15:
+ and N, 7, I
+ ble I, $L998
+ .align 4
+
+$L16:
+ LD x0, 0 * SIZE(X)
+ LD x1, 1 * SIZE(X)
+
+ ldi X, 2 * SIZE(X)
+
+ ADD a0, t0, a0
+ MUL x0, x0, t0
+ ADD a1, t1, a1
+ MUL x1, x1, t1
+
+ ldi I, -1(I)
+ bgt I, $L16
+ bsr $31, $L998
+ .align 4
+
+$L20:
+ fclr t0
+ sra N, 2, I
+ fclr t1
+ ble I, $L25
+
+ LD x0, 0 * SIZE(X)
+ fclr t2
+ LD x1, 1 * SIZE(X)
+ addl X, INCX, X
+ LD x2, 0 * SIZE(X)
+ fclr t3
+ LD x3, 1 * SIZE(X)
+ addl X, INCX, X
+
+ LD x4, 0 * SIZE(X)
+ ldi I, -1(I)
+ LD x5, 1 * SIZE(X)
+ addl X, INCX, X
+
+ LD x6, 0 * SIZE(X)
+ ble I, $L22
+ .align 4
+
+$L21:
+ ADD a0, t0, a0
+ LD x7, 1 * SIZE(X)
+ MUL x0, x0, t0
+ addl X, INCX, X
+
+ ADD a1, t1, a1
+ LD x0, 0 * SIZE(X)
+ MUL x1, x1, t1
+ unop
+
+ ADD a2, t2, a2
+ LD x1, 1 * SIZE(X)
+ MUL x2, x2, t2
+ addl X, INCX, X
+
+ ADD a3, t3, a3
+ LD x2, 0 * SIZE(X)
+ MUL x3, x3, t3
+ unop
+
+ ADD a0, t0, a0
+ LD x3, 1 * SIZE(X)
+ MUL x4, x4, t0
+ addl X, INCX, X
+
+ ADD a1, t1, a1
+ LD x4, 0 * SIZE(X)
+ MUL x5, x5, t1
+ ldi I, -1(I)
+
+ ADD a2, t2, a2
+ LD x5, 1 * SIZE(X)
+ MUL x6, x6, t2
+ addl X, INCX, X
+
+ ADD a3, t3, a3
+ LD x6, 0 * SIZE(X)
+ MUL x7, x7, t3
+ bgt I, $L21
+ .align 4
+
+$L22:
+ ADD a0, t0, a0
+ LD x7, 1 * SIZE(X)
+ MUL x0, x0, t0
+ addl X, INCX, X
+
+ ADD a1, t1, a1
+ MUL x1, x1, t1
+ ADD a2, t2, a2
+ MUL x2, x2, t2
+
+ ADD a3, t3, a3
+ MUL x3, x3, t3
+ ADD a0, t0, a0
+ MUL x4, x4, t0
+
+ ADD a1, t1, a1
+ MUL x5, x5, t1
+ ADD a2, t2, a2
+ MUL x6, x6, t2
+
+ ADD a3, t3, a3
+ MUL x7, x7, t3
+ ADD a2, t2, a2
+ ADD a3, t3, a3
+ .align 4
+
+$L25:
+ and N, 3, I
+ ble I, $L998
+ .align 4
+
+$L26:
+ LD x0, 0 * SIZE(X)
+ ldi I, -1(I)
+ LD x1, 1 * SIZE(X)
+ addl X, INCX, X
+
+ ADD a0, t0, a0
+ MUL x0, x0, t0
+ ADD a1, t1, a1
+ MUL x1, x1, t1
+
+ bgt I, $L26
+ .align 4
+
+
+$L998:
+ ADD a0, t0, a0
+ ADD a1, t1, a1
+
+ ADD a0, a1, a0
+ ADD a2, a3, a2
+
+
+
+ ADD a0, a2, a0
+ SQRT a0, a0
+
+ .align 4
+
+$L999:
+
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/zrot.S b/kernel/sw_64/zrot.S
new file mode 100644
index 0000000..9016a00
--- /dev/null
+++ b/kernel/sw_64/zrot.S
@@ -0,0 +1,689 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define N $16
+#define X $17
+#define INCX $18
+#define Y $19
+#define INCY $20
+#define I $21
+#define XX $23
+#define YY $24
+
+#define b9 $f29
+
+#define C $f10
+#define S $f11
+
+#define PREFETCH_SIZE 80
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, 0, $26, 0
+
+#ifndef PROFILE
+ .prologue 0
+#else
+ .prologue 1
+#endif
+
+ fmov $f21, C
+ LD S, 0($sp)
+
+ addl INCX, INCX, INCX
+ addl INCY, INCY, INCY
+
+ cmpeq INCX, 2, $23
+ cmpeq INCY, 2, $24
+ ble N, $L998
+
+ and $23, $24, $23
+ beq $23, $L50
+
+ sra N, 2, I
+ ble I, $L15
+
+ LD $f12, 0*SIZE(X)
+ LD $f13, 0*SIZE(Y)
+ LD $f14, 1*SIZE(X)
+ LD $f15, 1*SIZE(Y)
+
+ LD $f16, 2*SIZE(X)
+ LD $f17, 2*SIZE(Y)
+ LD $f18, 3*SIZE(X)
+ LD $f19, 3*SIZE(Y)
+
+ MUL C, $f12, $f21
+ unop
+ MUL S, $f13, $f22
+ MUL C, $f13, $f23
+
+ LD $f13, 4*SIZE(Y)
+ MUL S, $f12, $f24
+ LD $f12, 4*SIZE(X)
+ MUL C, $f14, $f25
+
+ ldi I, -1(I)
+ MUL S, $f15, $f26
+ ADD $f21, $f22, b9
+ fmov b9, $f22
+ MUL C, $f15, $f27
+
+ LD $f15, 5*SIZE(Y)
+ MUL S, $f14, $f28
+ SUB $f23, $f24, b9
+ fmov b9, $f24
+ ble I, $L13
+ .align 4
+
+$L12:
+ MUL C, $f16, $f21
+ fillcs (PREFETCH_SIZE) * SIZE(X)
+ unop
+ LD $f14, 5*SIZE(X)
+
+ ST $f22, 0*SIZE(X)
+ MUL S, $f17, $f22
+ unop
+ ADD $f25, $f26, b9
+ fmov b9, $f26
+
+ MUL C, $f17, $f23
+ fillcs (PREFETCH_SIZE) * SIZE(Y)
+ unop
+ LD $f17, 6*SIZE(Y)
+
+ ST $f24, 0*SIZE(Y)
+ MUL S, $f16, $f24
+ unop
+ SUB $f27, $f28, b9
+ fmov b9, $f28
+
+ MUL C, $f18, $f25
+ LD $f16, 6*SIZE(X)
+ unop
+ unop
+
+ ST $f26, 1*SIZE(X)
+ MUL S, $f19, $f26
+ unop
+ ADD $f21, $f22, b9
+ fmov b9, $f22
+
+ MUL C, $f19, $f27
+ unop
+ unop
+ LD $f19, 7*SIZE(Y)
+
+ ST $f28, 1*SIZE(Y)
+ MUL S, $f18, $f28
+ unop
+ SUB $f23, $f24, b9
+ fmov b9, $f24
+
+ MUL C, $f12, $f21
+ LD $f18, 7*SIZE(X)
+ unop
+ unop
+
+ ST $f22, 2*SIZE(X)
+ unop
+ MUL S, $f13, $f22
+ ADD $f25, $f26, b9
+ fmov b9, $f26
+
+ MUL C, $f13, $f23
+ LD $f13, 8*SIZE(Y)
+ unop
+ unop
+
+ ST $f24, 2*SIZE(Y)
+ MUL S, $f12, $f24
+ unop
+ SUB $f27, $f28, b9
+ fmov b9, $f28
+
+ MUL C, $f14, $f25
+ LD $f12, 8*SIZE(X)
+ unop
+ unop
+
+ ST $f26, 3*SIZE(X)
+ MUL S, $f15, $f26
+ unop
+ ADD $f21, $f22, b9
+ fmov b9, $f22
+
+ MUL C, $f15, $f27
+ LD $f15, 9*SIZE(Y)
+ unop
+ unop
+
+ ST $f28, 3*SIZE(Y)
+ MUL S, $f14, $f28
+ unop
+ SUB $f23, $f24, b9
+ fmov b9, $f24
+
+ MUL C, $f16, $f21
+ LD $f14, 9*SIZE(X)
+ unop
+ unop
+
+ ST $f22, 4*SIZE(X)
+ MUL S, $f17, $f22
+ unop
+ ADD $f25, $f26, b9
+ fmov b9, $f26
+
+ MUL C, $f17, $f23
+ LD $f17, 10*SIZE(Y)
+ unop
+ unop
+
+ ST $f24, 4*SIZE(Y)
+ MUL S, $f16, $f24
+ unop
+ SUB $f27, $f28, b9
+ fmov b9, $f28
+
+ MUL C, $f18, $f25
+ LD $f16, 10*SIZE(X)
+ unop
+ unop
+
+ ST $f26, 5*SIZE(X)
+ MUL S, $f19, $f26
+ unop
+ ADD $f21, $f22, b9
+ fmov b9, $f22
+
+ MUL C, $f19, $f27
+ LD $f19, 11*SIZE(Y)
+ unop
+ unop
+
+ ST $f28, 5*SIZE(Y)
+ MUL S, $f18, $f28
+ ldi I, -1(I)
+ SUB $f23, $f24, b9
+ fmov b9, $f24
+
+ MUL C, $f12, $f21
+ LD $f18, 11*SIZE(X)
+ unop
+ unop
+
+ ST $f22, 6*SIZE(X)
+ MUL S, $f13, $f22
+ unop
+ ADD $f25, $f26, b9
+ fmov b9, $f26
+
+ MUL C, $f13, $f23
+ LD $f13, 12*SIZE(Y)
+ ldi X, 8*SIZE(X)
+ unop
+
+ ST $f24, 6*SIZE(Y)
+ MUL S, $f12, $f24
+ unop
+ SUB $f27, $f28, b9
+ fmov b9, $f28
+
+ MUL C, $f14, $f25
+ LD $f12, 4*SIZE(X)
+ ldi Y, 8*SIZE(Y)
+ unop
+
+ ST $f26, -1*SIZE(X)
+ MUL S, $f15, $f26
+ unop
+ ADD $f21, $f22, b9
+ fmov b9, $f22
+
+ MUL C, $f15, $f27
+ LD $f15, 5*SIZE(Y)
+ unop
+ unop
+
+ ST $f28, -1*SIZE(Y)
+ MUL S, $f14, $f28
+ SUB $f23, $f24, b9
+ fmov b9, $f24
+ bgt I, $L12
+ .align 4
+
+$L13:
+ MUL C, $f16, $f21
+ LD $f14, 5*SIZE(X)
+ unop
+ unop
+
+ ST $f22, 0*SIZE(X)
+ MUL S, $f17, $f22
+ unop
+ ADD $f25, $f26, b9
+ fmov b9, $f26
+
+ MUL C, $f17, $f23
+ unop
+ unop
+ LD $f17, 6*SIZE(Y)
+
+ ST $f24, 0*SIZE(Y)
+ MUL S, $f16, $f24
+ LD $f16, 6*SIZE(X)
+ SUB $f27, $f28, b9
+ fmov b9, $f28
+
+ MUL C, $f18, $f25
+ unop
+ unop
+ unop
+
+ ST $f26, 1*SIZE(X)
+ MUL S, $f19, $f26
+ unop
+ ADD $f21, $f22, b9
+ fmov b9, $f22
+
+ MUL C, $f19, $f27
+ unop
+ unop
+ LD $f19, 7*SIZE(Y)
+
+ ST $f28, 1*SIZE(Y)
+ MUL S, $f18, $f28
+ LD $f18, 7*SIZE(X)
+ SUB $f23, $f24, b9
+ fmov b9, $f24
+
+ MUL C, $f12, $f21
+ unop
+ unop
+ unop
+
+ ST $f22, 2*SIZE(X)
+ unop
+ MUL S, $f13, $f22
+ ADD $f25, $f26, b9
+ fmov b9, $f26
+
+ MUL C, $f13, $f23
+ unop
+ unop
+ unop
+
+ ST $f24, 2*SIZE(Y)
+ MUL S, $f12, $f24
+ unop
+ SUB $f27, $f28, b9
+ fmov b9, $f28
+
+ MUL C, $f14, $f25
+ unop
+ unop
+ unop
+
+ ST $f26, 3*SIZE(X)
+ MUL S, $f15, $f26
+ unop
+ ADD $f21, $f22, b9
+ fmov b9, $f22
+
+ MUL C, $f15, $f27
+ unop
+ unop
+ unop
+
+ ST $f28, 3*SIZE(Y)
+ MUL S, $f14, $f28
+ unop
+ SUB $f23, $f24, b9
+ fmov b9, $f24
+
+ MUL C, $f16, $f21
+ unop
+ unop
+ unop
+
+ ST $f22, 4*SIZE(X)
+ MUL S, $f17, $f22
+ unop
+ ADD $f25, $f26, b9
+ fmov b9, $f26
+
+ MUL C, $f17, $f23
+ unop
+ unop
+ unop
+
+ ST $f24, 4*SIZE(Y)
+ MUL S, $f16, $f24
+ unop
+ SUB $f27, $f28, b9
+ fmov b9, $f28
+
+ MUL C, $f18, $f25
+ unop
+ unop
+ unop
+
+ ST $f26, 5*SIZE(X)
+ MUL S, $f19, $f26
+ unop
+ ADD $f21, $f22, b9
+ fmov b9, $f22
+
+ MUL C, $f19, $f27
+ unop
+ unop
+ unop
+
+ ST $f28, 5*SIZE(Y)
+ MUL S, $f18, $f28
+ unop
+ SUB $f23, $f24, b9
+ fmov b9, $f24
+
+ ST $f22, 6*SIZE(X)
+ ADD $f25, $f26, b9
+ fmov b9, $f26
+ ST $f24, 6*SIZE(Y)
+ SUB $f27, $f28, b9
+ fmov b9, $f28
+
+ ST $f26, 7*SIZE(X)
+ ldi X, 8*SIZE(X)
+ ST $f28, 7*SIZE(Y)
+ ldi Y, 8*SIZE(Y)
+ .align 4
+
+
+$L15:
+ and N, 3, I
+ ble I, $L998
+ .align 4
+
+$L16:
+ LD $f12, 0*SIZE(X)
+ LD $f13, 0*SIZE(Y)
+ LD $f14, 1*SIZE(X)
+ LD $f15, 1*SIZE(Y)
+
+ MUL C, $f12, $f21
+ MUL S, $f13, $f22
+ MUL C, $f13, $f23
+ MUL S, $f12, $f24
+
+ ADD $f21, $f22, b9
+ fmov b9, $f22
+ SUB $f23, $f24, b9
+ fmov b9, $f24
+
+ MUL C, $f14, $f25
+ MUL S, $f15, $f26
+ MUL C, $f15, $f27
+ MUL S, $f14, $f28
+
+ ADD $f25, $f26, b9
+ fmov b9, $f26
+ SUB $f27, $f28, b9
+ fmov b9, $f28
+
+ ST $f22, 0*SIZE(X)
+ ST $f24, 0*SIZE(Y)
+ ldi I, -1(I)
+
+ ST $f26, 1*SIZE(X)
+ ldi X, 2 * SIZE(X)
+ ST $f28, 1*SIZE(Y)
+ ldi Y, 2 * SIZE(Y)
+
+ bgt I, $L16
+ .align 4
+
+$L998:
+ clr $0
+ ret
+ .align 4
+
+$L50:
+ mov X, XX
+ mov Y, YY
+
+ sra N, 2, I
+ ble I, $L55
+ .align 4
+
+$L51:
+ LD $f12, 0*SIZE(X)
+ LD $f13, 0*SIZE(Y)
+ LD $f14, 1*SIZE(X)
+ SXADDQ INCX, X, X
+ LD $f15, 1*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ MUL C, $f12, $f21
+ MUL S, $f13, $f22
+ MUL C, $f13, $f23
+ MUL S, $f12, $f24
+
+ ADD $f21, $f22, b9
+ fmov b9, $f22
+ SUB $f23, $f24, b9
+ fmov b9, $f24
+
+ MUL C, $f14, $f25
+ MUL S, $f15, $f26
+ MUL C, $f15, $f27
+ MUL S, $f14, $f28
+
+ ADD $f25, $f26, b9
+ fmov b9, $f26
+ SUB $f27, $f28, b9
+ fmov b9, $f28
+
+ ST $f22, 0*SIZE(XX)
+ ST $f24, 0*SIZE(YY)
+ ST $f26, 1*SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST $f28, 1*SIZE(YY)
+ SXADDQ INCY, YY, YY
+
+
+ LD $f12, 0*SIZE(X)
+ LD $f13, 0*SIZE(Y)
+ LD $f14, 1*SIZE(X)
+ SXADDQ INCX, X, X
+ LD $f15, 1*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ MUL C, $f12, $f21
+ MUL S, $f13, $f22
+ MUL C, $f13, $f23
+ MUL S, $f12, $f24
+
+ ADD $f21, $f22, b9
+ fmov b9, $f22
+ SUB $f23, $f24, b9
+ fmov b9, $f24
+
+ MUL C, $f14, $f25
+ MUL S, $f15, $f26
+ MUL C, $f15, $f27
+ MUL S, $f14, $f28
+
+ ADD $f25, $f26, b9
+ fmov b9, $f26
+ SUB $f27, $f28, b9
+ fmov b9, $f28
+
+ ST $f22, 0*SIZE(XX)
+ ST $f24, 0*SIZE(YY)
+ ST $f26, 1*SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST $f28, 1*SIZE(YY)
+ SXADDQ INCY, YY, YY
+
+
+ LD $f12, 0*SIZE(X)
+ LD $f13, 0*SIZE(Y)
+ LD $f14, 1*SIZE(X)
+ SXADDQ INCX, X, X
+ LD $f15, 1*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ MUL C, $f12, $f21
+ MUL S, $f13, $f22
+ MUL C, $f13, $f23
+ MUL S, $f12, $f24
+
+ ADD $f21, $f22, b9
+ fmov b9, $f22
+ SUB $f23, $f24, b9
+ fmov b9, $f24
+
+ MUL C, $f14, $f25
+ MUL S, $f15, $f26
+ MUL C, $f15, $f27
+ MUL S, $f14, $f28
+
+ ADD $f25, $f26, b9
+ fmov b9, $f26
+ SUB $f27, $f28, b9
+ fmov b9, $f28
+
+ ST $f22, 0*SIZE(XX)
+ ST $f24, 0*SIZE(YY)
+ ST $f26, 1*SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST $f28, 1*SIZE(YY)
+ SXADDQ INCY, YY, YY
+
+
+ LD $f12, 0*SIZE(X)
+ LD $f13, 0*SIZE(Y)
+ LD $f14, 1*SIZE(X)
+ SXADDQ INCX, X, X
+ LD $f15, 1*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ MUL C, $f12, $f21
+ MUL S, $f13, $f22
+ MUL C, $f13, $f23
+ MUL S, $f12, $f24
+
+ ADD $f21, $f22, b9
+ fmov b9, $f22
+ SUB $f23, $f24, b9
+ fmov b9, $f24
+
+ MUL C, $f14, $f25
+ MUL S, $f15, $f26
+ MUL C, $f15, $f27
+ MUL S, $f14, $f28
+
+ ADD $f25, $f26, b9
+ fmov b9, $f26
+ SUB $f27, $f28, b9
+ fmov b9, $f28
+
+ ST $f22, 0*SIZE(XX)
+ ST $f24, 0*SIZE(YY)
+ ST $f26, 1*SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST $f28, 1*SIZE(YY)
+ SXADDQ INCY, YY, YY
+
+ ldi I, -1(I)
+ bgt I, $L51
+ .align 4
+
+$L55:
+ and N, 3, I
+ ble I, $L999
+ .align 4
+
+$L56:
+ LD $f12, 0*SIZE(X)
+ LD $f13, 0*SIZE(Y)
+ LD $f14, 1*SIZE(X)
+ LD $f15, 1*SIZE(Y)
+
+ MUL C, $f12, $f21
+ MUL S, $f13, $f22
+ MUL C, $f13, $f23
+ MUL S, $f12, $f24
+
+ ADD $f21, $f22, b9
+ fmov b9, $f22
+ SUB $f23, $f24, b9
+ fmov b9, $f24
+
+ MUL C, $f14, $f25
+ MUL S, $f15, $f26
+ MUL C, $f15, $f27
+ MUL S, $f14, $f28
+
+ ADD $f25, $f26, b9
+ fmov b9, $f26
+ SUB $f27, $f28, b9
+ fmov b9, $f28
+
+ ST $f22, 0*SIZE(X)
+ ST $f24, 0*SIZE(Y)
+ ldi I, -1(I)
+
+ ST $f26, 1*SIZE(X)
+ ST $f28, 1*SIZE(Y)
+ SXADDQ INCX, X, X
+ SXADDQ INCY, Y, Y
+
+ bgt I, $L56
+ .align 4
+
+$L999:
+ clr $0
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/zrot.S.bak b/kernel/sw_64/zrot.S.bak
new file mode 100644
index 0000000..83dd2b1
--- /dev/null
+++ b/kernel/sw_64/zrot.S.bak
@@ -0,0 +1,631 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define N $16
+#define X $17
+#define INCX $18
+#define Y $19
+#define INCY $20
+#define I $21
+#define XX $23
+#define YY $24
+
+#define C $f10
+#define S $f11
+
+#define PREFETCH_SIZE 80
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, 0, $26, 0
+
+#ifndef PROFILE
+ .prologue 0
+#else
+ .prologue 1
+#endif
+
+ fmov $f21, C
+ LD S, 0($sp)
+
+ addl INCX, INCX, INCX
+ addl INCY, INCY, INCY
+
+ cmpeq INCX, 2, $23
+ cmpeq INCY, 2, $24
+ ble N, $L998
+
+ and $23, $24, $23
+ beq $23, $L50
+
+ sra N, 2, I
+ ble I, $L15
+
+ LD $f12, 0*SIZE(X)
+ LD $f13, 0*SIZE(Y)
+ LD $f14, 1*SIZE(X)
+ LD $f15, 1*SIZE(Y)
+
+ LD $f16, 2*SIZE(X)
+ LD $f17, 2*SIZE(Y)
+ LD $f18, 3*SIZE(X)
+ LD $f19, 3*SIZE(Y)
+
+ MUL C, $f12, $f21
+ unop
+ MUL S, $f13, $f22
+ MUL C, $f13, $f23
+
+ LD $f13, 4*SIZE(Y)
+ MUL S, $f12, $f24
+ LD $f12, 4*SIZE(X)
+ MUL C, $f14, $f25
+
+ ldi I, -1(I)
+ MUL S, $f15, $f26
+ ADD $f21, $f22, $f22
+ MUL C, $f15, $f27
+
+ LD $f15, 5*SIZE(Y)
+ MUL S, $f14, $f28
+ SUB $f23, $f24, $f24
+ ble I, $L13
+ .align 4
+
+$L12:
+ MUL C, $f16, $f21
+ fillcs (PREFETCH_SIZE) * SIZE(X)
+ unop
+ LD $f14, 5*SIZE(X)
+
+ ST $f22, 0*SIZE(X)
+ MUL S, $f17, $f22
+ unop
+ ADD $f25, $f26, $f26
+
+ MUL C, $f17, $f23
+ fillcs (PREFETCH_SIZE) * SIZE(Y)
+ unop
+ LD $f17, 6*SIZE(Y)
+
+ ST $f24, 0*SIZE(Y)
+ MUL S, $f16, $f24
+ unop
+ SUB $f27, $f28, $f28
+
+ MUL C, $f18, $f25
+ LD $f16, 6*SIZE(X)
+ unop
+ unop
+
+ ST $f26, 1*SIZE(X)
+ MUL S, $f19, $f26
+ unop
+ ADD $f21, $f22, $f22
+
+ MUL C, $f19, $f27
+ unop
+ unop
+ LD $f19, 7*SIZE(Y)
+
+ ST $f28, 1*SIZE(Y)
+ MUL S, $f18, $f28
+ unop
+ SUB $f23, $f24, $f24
+
+ MUL C, $f12, $f21
+ LD $f18, 7*SIZE(X)
+ unop
+ unop
+
+ ST $f22, 2*SIZE(X)
+ unop
+ MUL S, $f13, $f22
+ ADD $f25, $f26, $f26
+
+ MUL C, $f13, $f23
+ LD $f13, 8*SIZE(Y)
+ unop
+ unop
+
+ ST $f24, 2*SIZE(Y)
+ MUL S, $f12, $f24
+ unop
+ SUB $f27, $f28, $f28
+
+ MUL C, $f14, $f25
+ LD $f12, 8*SIZE(X)
+ unop
+ unop
+
+ ST $f26, 3*SIZE(X)
+ MUL S, $f15, $f26
+ unop
+ ADD $f21, $f22, $f22
+
+ MUL C, $f15, $f27
+ LD $f15, 9*SIZE(Y)
+ unop
+ unop
+
+ ST $f28, 3*SIZE(Y)
+ MUL S, $f14, $f28
+ unop
+ SUB $f23, $f24, $f24
+
+ MUL C, $f16, $f21
+ LD $f14, 9*SIZE(X)
+ unop
+ unop
+
+ ST $f22, 4*SIZE(X)
+ MUL S, $f17, $f22
+ unop
+ ADD $f25, $f26, $f26
+
+ MUL C, $f17, $f23
+ LD $f17, 10*SIZE(Y)
+ unop
+ unop
+
+ ST $f24, 4*SIZE(Y)
+ MUL S, $f16, $f24
+ unop
+ SUB $f27, $f28, $f28
+
+ MUL C, $f18, $f25
+ LD $f16, 10*SIZE(X)
+ unop
+ unop
+
+ ST $f26, 5*SIZE(X)
+ MUL S, $f19, $f26
+ unop
+ ADD $f21, $f22, $f22
+
+ MUL C, $f19, $f27
+ LD $f19, 11*SIZE(Y)
+ unop
+ unop
+
+ ST $f28, 5*SIZE(Y)
+ MUL S, $f18, $f28
+ ldi I, -1(I)
+ SUB $f23, $f24, $f24
+
+ MUL C, $f12, $f21
+ LD $f18, 11*SIZE(X)
+ unop
+ unop
+
+ ST $f22, 6*SIZE(X)
+ MUL S, $f13, $f22
+ unop
+ ADD $f25, $f26, $f26
+
+ MUL C, $f13, $f23
+ LD $f13, 12*SIZE(Y)
+ ldi X, 8*SIZE(X)
+ unop
+
+ ST $f24, 6*SIZE(Y)
+ MUL S, $f12, $f24
+ unop
+ SUB $f27, $f28, $f28
+
+ MUL C, $f14, $f25
+ LD $f12, 4*SIZE(X)
+ ldi Y, 8*SIZE(Y)
+ unop
+
+ ST $f26, -1*SIZE(X)
+ MUL S, $f15, $f26
+ unop
+ ADD $f21, $f22, $f22
+
+ MUL C, $f15, $f27
+ LD $f15, 5*SIZE(Y)
+ unop
+ unop
+
+ ST $f28, -1*SIZE(Y)
+ MUL S, $f14, $f28
+ SUB $f23, $f24, $f24
+ bgt I, $L12
+ .align 4
+
+$L13:
+ MUL C, $f16, $f21
+ LD $f14, 5*SIZE(X)
+ unop
+ unop
+
+ ST $f22, 0*SIZE(X)
+ MUL S, $f17, $f22
+ unop
+ ADD $f25, $f26, $f26
+
+ MUL C, $f17, $f23
+ unop
+ unop
+ LD $f17, 6*SIZE(Y)
+
+ ST $f24, 0*SIZE(Y)
+ MUL S, $f16, $f24
+ LD $f16, 6*SIZE(X)
+ SUB $f27, $f28, $f28
+
+ MUL C, $f18, $f25
+ unop
+ unop
+ unop
+
+ ST $f26, 1*SIZE(X)
+ MUL S, $f19, $f26
+ unop
+ ADD $f21, $f22, $f22
+
+ MUL C, $f19, $f27
+ unop
+ unop
+ LD $f19, 7*SIZE(Y)
+
+ ST $f28, 1*SIZE(Y)
+ MUL S, $f18, $f28
+ LD $f18, 7*SIZE(X)
+ SUB $f23, $f24, $f24
+
+ MUL C, $f12, $f21
+ unop
+ unop
+ unop
+
+ ST $f22, 2*SIZE(X)
+ unop
+ MUL S, $f13, $f22
+ ADD $f25, $f26, $f26
+
+ MUL C, $f13, $f23
+ unop
+ unop
+ unop
+
+ ST $f24, 2*SIZE(Y)
+ MUL S, $f12, $f24
+ unop
+ SUB $f27, $f28, $f28
+
+ MUL C, $f14, $f25
+ unop
+ unop
+ unop
+
+ ST $f26, 3*SIZE(X)
+ MUL S, $f15, $f26
+ unop
+ ADD $f21, $f22, $f22
+
+ MUL C, $f15, $f27
+ unop
+ unop
+ unop
+
+ ST $f28, 3*SIZE(Y)
+ MUL S, $f14, $f28
+ unop
+ SUB $f23, $f24, $f24
+
+ MUL C, $f16, $f21
+ unop
+ unop
+ unop
+
+ ST $f22, 4*SIZE(X)
+ MUL S, $f17, $f22
+ unop
+ ADD $f25, $f26, $f26
+
+ MUL C, $f17, $f23
+ unop
+ unop
+ unop
+
+ ST $f24, 4*SIZE(Y)
+ MUL S, $f16, $f24
+ unop
+ SUB $f27, $f28, $f28
+
+ MUL C, $f18, $f25
+ unop
+ unop
+ unop
+
+ ST $f26, 5*SIZE(X)
+ MUL S, $f19, $f26
+ unop
+ ADD $f21, $f22, $f22
+
+ MUL C, $f19, $f27
+ unop
+ unop
+ unop
+
+ ST $f28, 5*SIZE(Y)
+ MUL S, $f18, $f28
+ unop
+ SUB $f23, $f24, $f24
+
+ ST $f22, 6*SIZE(X)
+ ADD $f25, $f26, $f26
+ ST $f24, 6*SIZE(Y)
+ SUB $f27, $f28, $f28
+
+ ST $f26, 7*SIZE(X)
+ ldi X, 8*SIZE(X)
+ ST $f28, 7*SIZE(Y)
+ ldi Y, 8*SIZE(Y)
+ .align 4
+
+
+$L15:
+ and N, 3, I
+ ble I, $L998
+ .align 4
+
+$L16:
+ LD $f12, 0*SIZE(X)
+ LD $f13, 0*SIZE(Y)
+ LD $f14, 1*SIZE(X)
+ LD $f15, 1*SIZE(Y)
+
+ MUL C, $f12, $f21
+ MUL S, $f13, $f22
+ MUL C, $f13, $f23
+ MUL S, $f12, $f24
+
+ ADD $f21, $f22, $f22
+ SUB $f23, $f24, $f24
+
+ MUL C, $f14, $f25
+ MUL S, $f15, $f26
+ MUL C, $f15, $f27
+ MUL S, $f14, $f28
+
+ ADD $f25, $f26, $f26
+ SUB $f27, $f28, $f28
+
+ ST $f22, 0*SIZE(X)
+ ST $f24, 0*SIZE(Y)
+ ldi I, -1(I)
+
+ ST $f26, 1*SIZE(X)
+ ldi X, 2 * SIZE(X)
+ ST $f28, 1*SIZE(Y)
+ ldi Y, 2 * SIZE(Y)
+
+ bgt I, $L16
+ .align 4
+
+$L998:
+ clr $0
+ ret
+ .align 4
+
+$L50:
+ mov X, XX
+ mov Y, YY
+
+ sra N, 2, I
+ ble I, $L55
+ .align 4
+
+$L51:
+ LD $f12, 0*SIZE(X)
+ LD $f13, 0*SIZE(Y)
+ LD $f14, 1*SIZE(X)
+ SXADDQ INCX, X, X
+ LD $f15, 1*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ MUL C, $f12, $f21
+ MUL S, $f13, $f22
+ MUL C, $f13, $f23
+ MUL S, $f12, $f24
+
+ ADD $f21, $f22, $f22
+ SUB $f23, $f24, $f24
+
+ MUL C, $f14, $f25
+ MUL S, $f15, $f26
+ MUL C, $f15, $f27
+ MUL S, $f14, $f28
+
+ ADD $f25, $f26, $f26
+ SUB $f27, $f28, $f28
+
+ ST $f22, 0*SIZE(XX)
+ ST $f24, 0*SIZE(YY)
+ ST $f26, 1*SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST $f28, 1*SIZE(YY)
+ SXADDQ INCY, YY, YY
+
+
+ LD $f12, 0*SIZE(X)
+ LD $f13, 0*SIZE(Y)
+ LD $f14, 1*SIZE(X)
+ SXADDQ INCX, X, X
+ LD $f15, 1*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ MUL C, $f12, $f21
+ MUL S, $f13, $f22
+ MUL C, $f13, $f23
+ MUL S, $f12, $f24
+
+ ADD $f21, $f22, $f22
+ SUB $f23, $f24, $f24
+
+ MUL C, $f14, $f25
+ MUL S, $f15, $f26
+ MUL C, $f15, $f27
+ MUL S, $f14, $f28
+
+ ADD $f25, $f26, $f26
+ SUB $f27, $f28, $f28
+
+ ST $f22, 0*SIZE(XX)
+ ST $f24, 0*SIZE(YY)
+ ST $f26, 1*SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST $f28, 1*SIZE(YY)
+ SXADDQ INCY, YY, YY
+
+
+ LD $f12, 0*SIZE(X)
+ LD $f13, 0*SIZE(Y)
+ LD $f14, 1*SIZE(X)
+ SXADDQ INCX, X, X
+ LD $f15, 1*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ MUL C, $f12, $f21
+ MUL S, $f13, $f22
+ MUL C, $f13, $f23
+ MUL S, $f12, $f24
+
+ ADD $f21, $f22, $f22
+ SUB $f23, $f24, $f24
+
+ MUL C, $f14, $f25
+ MUL S, $f15, $f26
+ MUL C, $f15, $f27
+ MUL S, $f14, $f28
+
+ ADD $f25, $f26, $f26
+ SUB $f27, $f28, $f28
+
+ ST $f22, 0*SIZE(XX)
+ ST $f24, 0*SIZE(YY)
+ ST $f26, 1*SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST $f28, 1*SIZE(YY)
+ SXADDQ INCY, YY, YY
+
+
+ LD $f12, 0*SIZE(X)
+ LD $f13, 0*SIZE(Y)
+ LD $f14, 1*SIZE(X)
+ SXADDQ INCX, X, X
+ LD $f15, 1*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ MUL C, $f12, $f21
+ MUL S, $f13, $f22
+ MUL C, $f13, $f23
+ MUL S, $f12, $f24
+
+ ADD $f21, $f22, $f22
+ SUB $f23, $f24, $f24
+
+ MUL C, $f14, $f25
+ MUL S, $f15, $f26
+ MUL C, $f15, $f27
+ MUL S, $f14, $f28
+
+ ADD $f25, $f26, $f26
+ SUB $f27, $f28, $f28
+
+ ST $f22, 0*SIZE(XX)
+ ST $f24, 0*SIZE(YY)
+ ST $f26, 1*SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST $f28, 1*SIZE(YY)
+ SXADDQ INCY, YY, YY
+
+ ldi I, -1(I)
+ bgt I, $L51
+ .align 4
+
+$L55:
+ and N, 3, I
+ ble I, $L999
+ .align 4
+
+$L56:
+ LD $f12, 0*SIZE(X)
+ LD $f13, 0*SIZE(Y)
+ LD $f14, 1*SIZE(X)
+ LD $f15, 1*SIZE(Y)
+
+ MUL C, $f12, $f21
+ MUL S, $f13, $f22
+ MUL C, $f13, $f23
+ MUL S, $f12, $f24
+
+ ADD $f21, $f22, $f22
+ SUB $f23, $f24, $f24
+
+ MUL C, $f14, $f25
+ MUL S, $f15, $f26
+ MUL C, $f15, $f27
+ MUL S, $f14, $f28
+
+ ADD $f25, $f26, $f26
+ SUB $f27, $f28, $f28
+
+ ST $f22, 0*SIZE(X)
+ ST $f24, 0*SIZE(Y)
+ ldi I, -1(I)
+
+ ST $f26, 1*SIZE(X)
+ ST $f28, 1*SIZE(Y)
+ SXADDQ INCX, X, X
+ SXADDQ INCY, Y, Y
+
+ bgt I, $L56
+ .align 4
+
+$L999:
+ clr $0
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/zrot_simd.S b/kernel/sw_64/zrot_simd.S
new file mode 100644
index 0000000..9e00ebf
--- /dev/null
+++ b/kernel/sw_64/zrot_simd.S
@@ -0,0 +1,799 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define N $16
+#define X $17
+#define INCX $18
+#define Y $19
+#define INCY $20
+#define I $21
+#define XX $23
+#define YY $24
+
+#define C $f10
+#define S $f11
+
+#define x0 $f12
+#define x1 $f14
+#define x2 $f16
+#define x3 $f18
+
+#define y0 $f13
+#define y1 $f15
+#define y2 $f17
+#define y3 $f19
+
+#define t0 $f20
+#define t1 $f21
+#define t2 $f22
+#define t3 $f23
+#define t4 $f24
+#define t5 $f25
+#define t6 $f26
+#define t7 $f27
+
+#define PREFETCHSIZE 80
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, 0, $26, 0
+
+#ifndef PROFILE
+ .prologue 0
+#else
+ .prologue 1
+#endif
+
+ fmov $f21, C
+ LD S, 0($sp)
+
+ addl INCX, INCX, INCX
+ addl INCY, INCY, INCY
+
+ cmpeq INCX, 2, $23
+ cmpeq INCY, 2, $24
+ ble N, $L998
+
+ and $23, $24, $23
+ beq $23, $L50
+
+/* test the address of X */
+ and X, (VEC_LEN*SIZE-1), $3
+ and Y, (VEC_LEN*SIZE-1), $4
+ or $3, $4, $4
+ bne $4, $UnAlign_ACCESS
+
+/* Aligned access */
+ sra N, 3, I
+ ble I, $Remain
+
+ vcpyf C, C
+ vcpyf S, S
+
+ VLD x0, 0*VEC_LEN*SIZE(X)
+ VLD x1, 1*VEC_LEN*SIZE(X)
+ VLD x2, 2*VEC_LEN*SIZE(X)
+ VLD x3, 3*VEC_LEN*SIZE(X)
+
+ VLD y0, 0*VEC_LEN*SIZE(Y)
+ VLD y1, 1*VEC_LEN*SIZE(Y)
+ VLD y2, 2*VEC_LEN*SIZE(Y)
+ VLD y3, 3*VEC_LEN*SIZE(Y)
+
+ addl X, 16 * SIZE, X
+ addl Y, 16 * SIZE, Y
+ subl I, 1, I
+ ble I, $MainLoopEnd
+ .align 4
+
+$MainLoop:
+ VMUL C, x0, t0
+ fillcs (PREFETCHSIZE) * SIZE(X)
+ VMUL C, x1, t1
+ fillcs (PREFETCHSIZE) * SIZE(Y)
+
+ VMUL C, x2, t2
+ subl I, 1, I
+ VMUL C, x3, t3
+ nop
+
+ VMUL S, x0, t4
+ VLD x0, 0*VEC_LEN*SIZE(X)
+ VMUL S, x1, t5
+ VLD x1, 1*VEC_LEN*SIZE(X)
+
+ VMUL S, x2, t6
+ VLD x2, 2*VEC_LEN*SIZE(X)
+ VMUL S, x3, t7
+ VLD x3, 3*VEC_LEN*SIZE(X)
+
+ VMAD S, y0, t0, t0
+ VMAD S, y1, t1, t1
+ VMAD S, y2, t2, t2
+ VMAD S, y3, t3, t3
+
+ VMSUB C, y0, t4, t4
+ VLD y0, 0*VEC_LEN*SIZE(Y)
+ VMSUB C, y1, t5, t5
+ VLD y1, 1*VEC_LEN*SIZE(Y)
+
+ VMSUB C, y2, t6, t6
+ VLD y2, 2*VEC_LEN*SIZE(Y)
+ VMSUB C, y3, t7, t7
+ VLD y3, 3*VEC_LEN*SIZE(Y)
+
+ VST t0, -4*VEC_LEN*SIZE(X)
+ VST t1, -3*VEC_LEN*SIZE(X)
+ VST t2, -2*VEC_LEN*SIZE(X)
+ VST t3, -1*VEC_LEN*SIZE(X)
+
+ VST t4, -4*VEC_LEN*SIZE(Y)
+ VST t5, -3*VEC_LEN*SIZE(Y)
+ VST t6, -2*VEC_LEN*SIZE(Y)
+ VST t7, -1*VEC_LEN*SIZE(Y)
+
+ addl X, 16 * SIZE, X
+ addl Y, 16 * SIZE, Y
+ nop
+ bgt I, $MainLoop
+ .align 4
+$MainLoopEnd:
+ VMUL C, x0, t0
+ VMUL C, x1, t1
+ VMUL C, x2, t2
+ VMUL C, x3, t3
+
+ VMUL S, x0, t4
+ VMUL S, x1, t5
+ VMUL S, x2, t6
+ VMUL S, x3, t7
+
+ VMAD S, y0, t0, t0
+ VMAD S, y1, t1, t1
+ VMAD S, y2, t2, t2
+ VMAD S, y3, t3, t3
+
+ VMSUB C, y0, t4, t4
+ VMSUB C, y1, t5, t5
+ VMSUB C, y2, t6, t6
+ VMSUB C, y3, t7, t7
+
+ VST t0, -4*VEC_LEN*SIZE(X)
+ VST t1, -3*VEC_LEN*SIZE(X)
+ VST t2, -2*VEC_LEN*SIZE(X)
+ VST t3, -1*VEC_LEN*SIZE(X)
+
+ VST t4, -4*VEC_LEN*SIZE(Y)
+ VST t5, -3*VEC_LEN*SIZE(Y)
+ VST t6, -2*VEC_LEN*SIZE(Y)
+ VST t7, -1*VEC_LEN*SIZE(Y)
+
+ .align 4
+$Remain:
+ and N, 7, I
+ ble I, $End
+$RemainLoop:
+ LD x0, 0*SIZE(X)
+ LD y0, 0*SIZE(Y)
+ LD x1, 1*SIZE(X)
+ LD y1, 1*SIZE(Y)
+
+ MUL C, x0, t0
+ MUL S, x0, t4
+ MAD S, y0, t0, t0
+ MSUB C, y0, t4, t4
+
+ MUL C, x1, t1
+ ldi I, -1(I)
+ MUL S, x1, t5
+ ldi X, 2 * SIZE(X)
+
+ MAD S, y1, t1, t1
+ ldi Y, 2 * SIZE(Y)
+ MSUB C, y1, t5, t5
+ nop
+
+ ST t0, -2*SIZE(X)
+ ST t1, -1*SIZE(X)
+ ST t4, -2*SIZE(Y)
+ ST t5, -1*SIZE(Y)
+
+ bgt I, $RemainLoop
+ .align 4
+$End:
+ clr $0
+ ret
+ .align 4
+
+$UnAlign_ACCESS:
+ sra N, 2, I
+ ble I, $L15
+
+ LD $f12, 0*SIZE(X)
+ LD $f13, 0*SIZE(Y)
+ LD $f14, 1*SIZE(X)
+ LD $f15, 1*SIZE(Y)
+
+ LD $f16, 2*SIZE(X)
+ LD $f17, 2*SIZE(Y)
+ LD $f18, 3*SIZE(X)
+ LD $f19, 3*SIZE(Y)
+
+ MUL C, $f12, $f21
+ unop
+ MUL S, $f13, $f22
+ MUL C, $f13, $f23
+
+ LD $f13, 4*SIZE(Y)
+ MUL S, $f12, $f24
+ LD $f12, 4*SIZE(X)
+ MUL C, $f14, $f25
+
+ ldi I, -1(I)
+ MUL S, $f15, $f26
+ ADD $f21, $f22, $f22
+ MUL C, $f15, $f27
+
+ LD $f15, 5*SIZE(Y)
+ MUL S, $f14, $f28
+ SUB $f23, $f24, $f24
+ ble I, $L13
+ .align 4
+
+$L12:
+ MUL C, $f16, $f21
+ fillcs (PREFETCHSIZE) * SIZE(X)
+ unop
+ LD $f14, 5*SIZE(X)
+
+ ST $f22, 0*SIZE(X)
+ MUL S, $f17, $f22
+ unop
+ ADD $f25, $f26, $f26
+
+ MUL C, $f17, $f23
+ fillcs (PREFETCHSIZE) * SIZE(Y)
+ unop
+ LD $f17, 6*SIZE(Y)
+
+ ST $f24, 0*SIZE(Y)
+ MUL S, $f16, $f24
+ unop
+ SUB $f27, $f28, $f28
+
+ MUL C, $f18, $f25
+ LD $f16, 6*SIZE(X)
+ unop
+ unop
+
+ ST $f26, 1*SIZE(X)
+ MUL S, $f19, $f26
+ unop
+ ADD $f21, $f22, $f22
+
+ MUL C, $f19, $f27
+ unop
+ unop
+ LD $f19, 7*SIZE(Y)
+
+ ST $f28, 1*SIZE(Y)
+ MUL S, $f18, $f28
+ unop
+ SUB $f23, $f24, $f24
+
+ MUL C, $f12, $f21
+ LD $f18, 7*SIZE(X)
+ unop
+ unop
+
+ ST $f22, 2*SIZE(X)
+ unop
+ MUL S, $f13, $f22
+ ADD $f25, $f26, $f26
+
+ MUL C, $f13, $f23
+ LD $f13, 8*SIZE(Y)
+ unop
+ unop
+
+ ST $f24, 2*SIZE(Y)
+ MUL S, $f12, $f24
+ unop
+ SUB $f27, $f28, $f28
+
+ MUL C, $f14, $f25
+ LD $f12, 8*SIZE(X)
+ unop
+ unop
+
+ ST $f26, 3*SIZE(X)
+ MUL S, $f15, $f26
+ unop
+ ADD $f21, $f22, $f22
+
+ MUL C, $f15, $f27
+ LD $f15, 9*SIZE(Y)
+ unop
+ unop
+
+ ST $f28, 3*SIZE(Y)
+ MUL S, $f14, $f28
+ unop
+ SUB $f23, $f24, $f24
+
+ MUL C, $f16, $f21
+ LD $f14, 9*SIZE(X)
+ unop
+ unop
+
+ ST $f22, 4*SIZE(X)
+ MUL S, $f17, $f22
+ unop
+ ADD $f25, $f26, $f26
+
+ MUL C, $f17, $f23
+ LD $f17, 10*SIZE(Y)
+ unop
+ unop
+
+ ST $f24, 4*SIZE(Y)
+ MUL S, $f16, $f24
+ unop
+ SUB $f27, $f28, $f28
+
+ MUL C, $f18, $f25
+ LD $f16, 10*SIZE(X)
+ unop
+ unop
+
+ ST $f26, 5*SIZE(X)
+ MUL S, $f19, $f26
+ unop
+ ADD $f21, $f22, $f22
+
+ MUL C, $f19, $f27
+ LD $f19, 11*SIZE(Y)
+ unop
+ unop
+
+ ST $f28, 5*SIZE(Y)
+ MUL S, $f18, $f28
+ ldi I, -1(I)
+ SUB $f23, $f24, $f24
+
+ MUL C, $f12, $f21
+ LD $f18, 11*SIZE(X)
+ unop
+ unop
+
+ ST $f22, 6*SIZE(X)
+ MUL S, $f13, $f22
+ unop
+ ADD $f25, $f26, $f26
+
+ MUL C, $f13, $f23
+ LD $f13, 12*SIZE(Y)
+ ldi X, 8*SIZE(X)
+ unop
+
+ ST $f24, 6*SIZE(Y)
+ MUL S, $f12, $f24
+ unop
+ SUB $f27, $f28, $f28
+
+ MUL C, $f14, $f25
+ LD $f12, 4*SIZE(X)
+ ldi Y, 8*SIZE(Y)
+ unop
+
+ ST $f26, -1*SIZE(X)
+ MUL S, $f15, $f26
+ unop
+ ADD $f21, $f22, $f22
+
+ MUL C, $f15, $f27
+ LD $f15, 5*SIZE(Y)
+ unop
+ unop
+
+ ST $f28, -1*SIZE(Y)
+ MUL S, $f14, $f28
+ SUB $f23, $f24, $f24
+ bgt I, $L12
+ .align 4
+
+$L13:
+ MUL C, $f16, $f21
+ LD $f14, 5*SIZE(X)
+ unop
+ unop
+
+ ST $f22, 0*SIZE(X)
+ MUL S, $f17, $f22
+ unop
+ ADD $f25, $f26, $f26
+
+ MUL C, $f17, $f23
+ unop
+ unop
+ LD $f17, 6*SIZE(Y)
+
+ ST $f24, 0*SIZE(Y)
+ MUL S, $f16, $f24
+ LD $f16, 6*SIZE(X)
+ SUB $f27, $f28, $f28
+
+ MUL C, $f18, $f25
+ unop
+ unop
+ unop
+
+ ST $f26, 1*SIZE(X)
+ MUL S, $f19, $f26
+ unop
+ ADD $f21, $f22, $f22
+
+ MUL C, $f19, $f27
+ unop
+ unop
+ LD $f19, 7*SIZE(Y)
+
+ ST $f28, 1*SIZE(Y)
+ MUL S, $f18, $f28
+ LD $f18, 7*SIZE(X)
+ SUB $f23, $f24, $f24
+
+ MUL C, $f12, $f21
+ unop
+ unop
+ unop
+
+ ST $f22, 2*SIZE(X)
+ unop
+ MUL S, $f13, $f22
+ ADD $f25, $f26, $f26
+
+ MUL C, $f13, $f23
+ unop
+ unop
+ unop
+
+ ST $f24, 2*SIZE(Y)
+ MUL S, $f12, $f24
+ unop
+ SUB $f27, $f28, $f28
+
+ MUL C, $f14, $f25
+ unop
+ unop
+ unop
+
+ ST $f26, 3*SIZE(X)
+ MUL S, $f15, $f26
+ unop
+ ADD $f21, $f22, $f22
+
+ MUL C, $f15, $f27
+ unop
+ unop
+ unop
+
+ ST $f28, 3*SIZE(Y)
+ MUL S, $f14, $f28
+ unop
+ SUB $f23, $f24, $f24
+
+ MUL C, $f16, $f21
+ unop
+ unop
+ unop
+
+ ST $f22, 4*SIZE(X)
+ MUL S, $f17, $f22
+ unop
+ ADD $f25, $f26, $f26
+
+ MUL C, $f17, $f23
+ unop
+ unop
+ unop
+
+ ST $f24, 4*SIZE(Y)
+ MUL S, $f16, $f24
+ unop
+ SUB $f27, $f28, $f28
+
+ MUL C, $f18, $f25
+ unop
+ unop
+ unop
+
+ ST $f26, 5*SIZE(X)
+ MUL S, $f19, $f26
+ unop
+ ADD $f21, $f22, $f22
+
+ MUL C, $f19, $f27
+ unop
+ unop
+ unop
+
+ ST $f28, 5*SIZE(Y)
+ MUL S, $f18, $f28
+ unop
+ SUB $f23, $f24, $f24
+
+ ST $f22, 6*SIZE(X)
+ ADD $f25, $f26, $f26
+ ST $f24, 6*SIZE(Y)
+ SUB $f27, $f28, $f28
+
+ ST $f26, 7*SIZE(X)
+ ldi X, 8*SIZE(X)
+ ST $f28, 7*SIZE(Y)
+ ldi Y, 8*SIZE(Y)
+ .align 4
+
+
+$L15:
+ and N, 3, I
+ ble I, $L998
+ .align 4
+
+$L16:
+ LD $f12, 0*SIZE(X)
+ LD $f13, 0*SIZE(Y)
+ LD $f14, 1*SIZE(X)
+ LD $f15, 1*SIZE(Y)
+
+ MUL C, $f12, $f21
+ MUL S, $f13, $f22
+ MUL C, $f13, $f23
+ MUL S, $f12, $f24
+
+ ADD $f21, $f22, $f22
+ SUB $f23, $f24, $f24
+
+ MUL C, $f14, $f25
+ MUL S, $f15, $f26
+ MUL C, $f15, $f27
+ MUL S, $f14, $f28
+
+ ADD $f25, $f26, $f26
+ SUB $f27, $f28, $f28
+
+ ST $f22, 0*SIZE(X)
+ ST $f24, 0*SIZE(Y)
+ ldi I, -1(I)
+
+ ST $f26, 1*SIZE(X)
+ ldi X, 2 * SIZE(X)
+ ST $f28, 1*SIZE(Y)
+ ldi Y, 2 * SIZE(Y)
+
+ bgt I, $L16
+ .align 4
+
+$L998:
+ clr $0
+ ret
+ .align 4
+
+$L50:
+ mov X, XX
+ mov Y, YY
+
+ sra N, 2, I
+ ble I, $L55
+ .align 4
+
+$L51:
+ LD $f12, 0*SIZE(X)
+ LD $f13, 0*SIZE(Y)
+ LD $f14, 1*SIZE(X)
+ SXADDQ INCX, X, X
+ LD $f15, 1*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ MUL C, $f12, $f21
+ MUL S, $f13, $f22
+ MUL C, $f13, $f23
+ MUL S, $f12, $f24
+
+ ADD $f21, $f22, $f22
+ SUB $f23, $f24, $f24
+
+ MUL C, $f14, $f25
+ MUL S, $f15, $f26
+ MUL C, $f15, $f27
+ MUL S, $f14, $f28
+
+ ADD $f25, $f26, $f26
+ SUB $f27, $f28, $f28
+
+ ST $f22, 0*SIZE(XX)
+ ST $f24, 0*SIZE(YY)
+ ST $f26, 1*SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST $f28, 1*SIZE(YY)
+ SXADDQ INCY, YY, YY
+
+
+ LD $f12, 0*SIZE(X)
+ LD $f13, 0*SIZE(Y)
+ LD $f14, 1*SIZE(X)
+ SXADDQ INCX, X, X
+ LD $f15, 1*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ MUL C, $f12, $f21
+ MUL S, $f13, $f22
+ MUL C, $f13, $f23
+ MUL S, $f12, $f24
+
+ ADD $f21, $f22, $f22
+ SUB $f23, $f24, $f24
+
+ MUL C, $f14, $f25
+ MUL S, $f15, $f26
+ MUL C, $f15, $f27
+ MUL S, $f14, $f28
+
+ ADD $f25, $f26, $f26
+ SUB $f27, $f28, $f28
+
+ ST $f22, 0*SIZE(XX)
+ ST $f24, 0*SIZE(YY)
+ ST $f26, 1*SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST $f28, 1*SIZE(YY)
+ SXADDQ INCY, YY, YY
+
+
+ LD $f12, 0*SIZE(X)
+ LD $f13, 0*SIZE(Y)
+ LD $f14, 1*SIZE(X)
+ SXADDQ INCX, X, X
+ LD $f15, 1*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ MUL C, $f12, $f21
+ MUL S, $f13, $f22
+ MUL C, $f13, $f23
+ MUL S, $f12, $f24
+
+ ADD $f21, $f22, $f22
+ SUB $f23, $f24, $f24
+
+ MUL C, $f14, $f25
+ MUL S, $f15, $f26
+ MUL C, $f15, $f27
+ MUL S, $f14, $f28
+
+ ADD $f25, $f26, $f26
+ SUB $f27, $f28, $f28
+
+ ST $f22, 0*SIZE(XX)
+ ST $f24, 0*SIZE(YY)
+ ST $f26, 1*SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST $f28, 1*SIZE(YY)
+ SXADDQ INCY, YY, YY
+
+
+ LD $f12, 0*SIZE(X)
+ LD $f13, 0*SIZE(Y)
+ LD $f14, 1*SIZE(X)
+ SXADDQ INCX, X, X
+ LD $f15, 1*SIZE(Y)
+ SXADDQ INCY, Y, Y
+
+ MUL C, $f12, $f21
+ MUL S, $f13, $f22
+ MUL C, $f13, $f23
+ MUL S, $f12, $f24
+
+ ADD $f21, $f22, $f22
+ SUB $f23, $f24, $f24
+
+ MUL C, $f14, $f25
+ MUL S, $f15, $f26
+ MUL C, $f15, $f27
+ MUL S, $f14, $f28
+
+ ADD $f25, $f26, $f26
+ SUB $f27, $f28, $f28
+
+ ST $f22, 0*SIZE(XX)
+ ST $f24, 0*SIZE(YY)
+ ST $f26, 1*SIZE(XX)
+ SXADDQ INCX, XX, XX
+ ST $f28, 1*SIZE(YY)
+ SXADDQ INCY, YY, YY
+
+ ldi I, -1(I)
+ bgt I, $L51
+ .align 4
+
+$L55:
+ and N, 3, I
+ ble I, $L999
+ .align 4
+
+$L56:
+ LD $f12, 0*SIZE(X)
+ LD $f13, 0*SIZE(Y)
+ LD $f14, 1*SIZE(X)
+ LD $f15, 1*SIZE(Y)
+
+ MUL C, $f12, $f21
+ MUL S, $f13, $f22
+ MUL C, $f13, $f23
+ MUL S, $f12, $f24
+
+ ADD $f21, $f22, $f22
+ SUB $f23, $f24, $f24
+
+ MUL C, $f14, $f25
+ MUL S, $f15, $f26
+ MUL C, $f15, $f27
+ MUL S, $f14, $f28
+
+ ADD $f25, $f26, $f26
+ SUB $f27, $f28, $f28
+
+ ST $f22, 0*SIZE(X)
+ ST $f24, 0*SIZE(Y)
+ ldi I, -1(I)
+
+ ST $f26, 1*SIZE(X)
+ ST $f28, 1*SIZE(Y)
+ SXADDQ INCX, X, X
+ SXADDQ INCY, Y, Y
+
+ bgt I, $L56
+ .align 4
+
+$L999:
+ clr $0
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/zscal.S b/kernel/sw_64/zscal.S
new file mode 100644
index 0000000..9589624
--- /dev/null
+++ b/kernel/sw_64/zscal.S
@@ -0,0 +1,255 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define PREFETCHSIZE 88
+
+#define N $16
+#define X $21
+#define INCX $17
+
+#define XX $18
+#define I $19
+
+#define ALPHA_R $f19
+#define ALPHA_I $f20
+
+#define s0 $f0
+#define s1 $f1
+#define s2 $f10
+#define s3 $f11
+
+#define a0 $f12
+#define a1 $f13
+#define a2 $f14
+#define a3 $f15
+#define a4 $f16
+#define a5 $f17
+#define a6 $f18
+#define a7 $f21
+
+#define t0 $f22
+#define t1 $f23
+#define t2 $f24
+#define t3 $f25
+
+#define t4 $f26
+#define t5 $f27
+#define t6 $f28
+#define t7 $f29
+
+ PROLOGUE
+ PROFCODE
+
+ ldl INCX, 0($sp)
+ mov X, XX
+ ble N, $L999
+
+ addl INCX, INCX, INCX
+
+ sra N, 2, I
+ ble I, $L15
+
+ LD a0, 0 * SIZE(X)
+ LD a1, 1 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD a2, 0 * SIZE(X)
+ LD a3, 1 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD a4, 0 * SIZE(X)
+ LD a5, 1 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD a6, 0 * SIZE(X)
+ LD a7, 1 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ MUL a0, ALPHA_R, t0
+ MUL a1, ALPHA_I, t1
+ MUL a0, ALPHA_I, t2
+ MUL a1, ALPHA_R, t3
+
+ SUB t0, t1, t4
+ ADD t2, t3, t5
+
+ ldi I, -1(I)
+ ble I, $L13
+ .align 4
+
+$L12:
+ ST t4, 0 * SIZE(XX)
+ MUL a2, ALPHA_R, t0
+ ST t5, 1 * SIZE(XX)
+ MUL a3, ALPHA_I, t1
+
+ MUL a2, ALPHA_I, t2
+ LD a0, 0 * SIZE(X)
+ MUL a3, ALPHA_R, t3
+ LD a1, 1 * SIZE(X)
+
+ SUB t0, t1, t6
+ SXADDQ INCX, XX, XX
+ ADD t2, t3, t7
+ SXADDQ INCX, X, X
+
+ MUL a4, ALPHA_R, t0
+ ST t6, 0 * SIZE(XX)
+ MUL a5, ALPHA_I, t1
+ ST t7, 1 * SIZE(XX)
+
+ MUL a4, ALPHA_I, t2
+ LD a2, 0 * SIZE(X)
+ MUL a5, ALPHA_R, t3
+ LD a3, 1 * SIZE(X)
+
+ SUB t0, t1, t4
+ SXADDQ INCX, XX, XX
+ ADD t2, t3, t5
+ SXADDQ INCX, X, X
+
+ MUL a6, ALPHA_R, t0
+ ST t4, 0 * SIZE(XX)
+ MUL a7, ALPHA_I, t1
+ ST t5, 1 * SIZE(XX)
+
+ MUL a6, ALPHA_I, t2
+ LD a4, 0 * SIZE(X)
+ MUL a7, ALPHA_R, t3
+ LD a5, 1 * SIZE(X)
+
+ SUB t0, t1, t6
+ SXADDQ INCX, XX, XX
+ ADD t2, t3, t7
+ SXADDQ INCX, X, X
+
+ MUL a0, ALPHA_R, t0
+ ST t6, 0 * SIZE(XX)
+ MUL a1, ALPHA_I, t1
+ ST t7, 1 * SIZE(XX)
+
+ MUL a0, ALPHA_I, t2
+ LD a6, 0 * SIZE(X)
+ MUL a1, ALPHA_R, t3
+ LD a7, 1 * SIZE(X)
+
+ SUB t0, t1, t4
+ ldi I, -1(I)
+ ADD t2, t3, t5
+ SXADDQ INCX, XX, XX
+
+ fillcs PREFETCHSIZE * SIZE(X)
+ unop
+ SXADDQ INCX, X, X
+ bne I, $L12
+ .align 4
+
+$L13:
+ MUL a2, ALPHA_R, t0
+ MUL a3, ALPHA_I, t1
+ ST t4, 0 * SIZE(XX)
+ MUL a2, ALPHA_I, t2
+ ST t5, 1 * SIZE(XX)
+ MUL a3, ALPHA_R, t3
+
+ SUB t0, t1, t6
+ SXADDQ INCX, XX, XX
+ ADD t2, t3, t7
+ unop
+
+ ST t6, 0 * SIZE(XX)
+ MUL a4, ALPHA_R, t0
+ ST t7, 1 * SIZE(XX)
+ MUL a5, ALPHA_I, t1
+ MUL a4, ALPHA_I, t2
+ MUL a5, ALPHA_R, t3
+
+ SUB t0, t1, t4
+ SXADDQ INCX, XX, XX
+ ADD t2, t3, t5
+ unop
+
+ MUL a6, ALPHA_R, t0
+ ST t4, 0 * SIZE(XX)
+ MUL a7, ALPHA_I, t1
+ ST t5, 1 * SIZE(XX)
+
+ MUL a6, ALPHA_I, t2
+ MUL a7, ALPHA_R, t3
+
+ SUB t0, t1, t6
+ SXADDQ INCX, XX, XX
+ ADD t2, t3, t7
+
+ ST t6, 0 * SIZE(XX)
+ ST t7, 1 * SIZE(XX)
+ SXADDQ INCX, XX, XX
+ .align 4
+
+$L15:
+ and N, 3, I
+ unop
+ unop
+ ble I, $L999
+ .align 4
+
+$L17:
+ LD a0, 0 * SIZE(X)
+ LD a1, 1 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ MUL a0, ALPHA_R, t0
+ MUL a1, ALPHA_I, t1
+ MUL a0, ALPHA_I, t2
+ MUL a1, ALPHA_R, t3
+
+ SUB t0, t1, t4
+ ADD t2, t3, t5
+
+ ST t4, 0 * SIZE(XX)
+ ST t5, 1 * SIZE(XX)
+ SXADDQ INCX, XX, XX
+
+ ldi I, -1(I)
+ bne I, $L17
+ .align 4
+
+$L999:
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/zscal.S.bak b/kernel/sw_64/zscal.S.bak
new file mode 100644
index 0000000..4525b56
--- /dev/null
+++ b/kernel/sw_64/zscal.S.bak
@@ -0,0 +1,443 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define PREFETCHSIZE 88
+
+#define N $16
+#define X $21
+#define INCX $17
+
+#define XX $18
+#define I $19
+
+#define ALPHA_R $f19
+#define ALPHA_I $f20
+
+#define s0 $f0
+#define s1 $f1
+#define s2 $f10
+#define s3 $f11
+
+#define a0 $f12
+#define a1 $f13
+#define a2 $f14
+#define a3 $f15
+#define a4 $f16
+#define a5 $f17
+#define a6 $f18
+#define a7 $f21
+
+#define t0 $f22
+#define t1 $f23
+#define t2 $f24
+#define t3 $f25
+
+#define t4 $f26
+#define t5 $f27
+#define t6 $f28
+#define t7 $f29
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, 0, $26, 0
+
+ ldl INCX, 0($sp)
+ mov X, XX
+ cmpeq INCX, 1, $0
+ ble N, $L999
+
+ beq $0, $Sub
+ nop
+
+/*
+	unroll by 4 complex elements (4*2 = 8 values)
+*/
+ sra N, 2, I
+ ble I, $Remain
+
+ LD a0, 0 * SIZE(X)
+ LD a1, 1 * SIZE(X)
+
+ LD a2, 2 * SIZE(X)
+ LD a3, 3 * SIZE(X)
+
+ LD a4, 4 * SIZE(X)
+ LD a5, 5 * SIZE(X)
+
+ LD a6, 6 * SIZE(X)
+ LD a7, 7 * SIZE(X)
+
+
+ MUL a0, ALPHA_R, t0
+ MUL a0, ALPHA_I, t2
+
+ NMAD a1, ALPHA_I, t0, t4
+ MAD a1, ALPHA_R, t2, t5
+/*
+ MUL a1, ALPHA_I, t1
+ MUL a1, ALPHA_R, t3
+ SUB t0, t1, t4
+ ADD t2, t3, t5
+*/
+ ldi I, -1(I)
+ addl X, 8*SIZE, X
+
+ ble I, $MainLoopEnd
+ .align 4
+
+$MainLoop:
+ MUL a2, ALPHA_R, t0
+ ST t4, -8 * SIZE(X)
+ MUL a2, ALPHA_I, t2
+ ST t5, -7 * SIZE(X)
+
+
+ NMAD a3, ALPHA_I, t0, t6
+ LD a0, 0 * SIZE(X)
+ MAD a3, ALPHA_R, t2, t7
+ LD a1, 1 * SIZE(X)
+
+ ST t6, -6 * SIZE(X)
+ MUL a4, ALPHA_R, t0
+ ST t7, -5 * SIZE(X)
+ MUL a4, ALPHA_I, t2
+
+
+ NMAD a5, ALPHA_I, t0, t4
+ LD a2, 2 * SIZE(X)
+ MAD a5, ALPHA_R, t2, t5
+ LD a3, 3 * SIZE(X)
+/*
+ MUL a5, ALPHA_I, t1
+ MUL a5, ALPHA_R, t3
+
+ SUB t0, t1, t4
+ ADD t2, t3, t5
+*/
+
+ MUL a6, ALPHA_R, t0
+ ST t4, -4 * SIZE(X)
+ MUL a6, ALPHA_I, t2
+ ST t5, -3 * SIZE(X)
+
+ NMAD a7, ALPHA_I, t0, t6
+ LD a4, 4 * SIZE(X)
+ MAD a7, ALPHA_R, t2, t7
+ LD a5, 5 * SIZE(X)
+/*
+
+ MUL a7, ALPHA_I, t1
+ MUL a7, ALPHA_R, t3
+
+ SUB t0, t1, t6
+ ADD t2, t3, t7
+*/
+ MUL a0, ALPHA_R, t0
+ ST t6, -2 * SIZE(X)
+ MUL a0, ALPHA_I, t2
+ ST t7, -1 * SIZE(X)
+
+ NMAD a1, ALPHA_I, t0, t4
+ LD a6, 6 * SIZE(X)
+ MAD a1, ALPHA_R, t2, t5
+ LD a7, 7 * SIZE(X)
+
+
+
+ fillcs PREFETCHSIZE * SIZE(X)
+ subl I, 1, I
+ addl X, 8*SIZE, X
+ bgt I, $MainLoop
+ .align 4
+
+$MainLoopEnd:
+ MUL a2, ALPHA_R, t0
+ ST t4, -8 * SIZE(X)
+ MUL a2, ALPHA_I, t2
+ ST t5, -7 * SIZE(X)
+
+
+ NMAD a3, ALPHA_I, t0, t6
+ MAD a3, ALPHA_R, t2, t7
+
+
+ ST t6, -6 * SIZE(X)
+ MUL a4, ALPHA_R, t0
+ ST t7, -5 * SIZE(X)
+ MUL a4, ALPHA_I, t2
+
+
+ NMAD a5, ALPHA_I, t0, t4
+ MAD a5, ALPHA_R, t2, t5
+/*
+ MUL a5, ALPHA_I, t1
+ MUL a5, ALPHA_R, t3
+
+ SUB t0, t1, t4
+ ADD t2, t3, t5
+*/
+
+ MUL a6, ALPHA_R, t0
+ ST t4, -4 * SIZE(X)
+ MUL a6, ALPHA_I, t2
+ ST t5, -3 * SIZE(X)
+
+ NMAD a7, ALPHA_I, t0, t6
+ MAD a7, ALPHA_R, t2, t7
+/*
+
+ MUL a7, ALPHA_I, t1
+ MUL a7, ALPHA_R, t3
+
+ SUB t0, t1, t6
+ ADD t2, t3, t7
+*/
+ ST t6, -2 * SIZE(X)
+ ST t7, -1 * SIZE(X)
+
+ .align 4
+$Remain:
+ and N, 3, I
+ unop
+ unop
+ ble I, $L999
+ .align 4
+
+$RemainLoop:
+ LD a0, 0 * SIZE(X)
+ LD a1, 1 * SIZE(X)
+
+
+ MUL a0, ALPHA_R, t0
+ MUL a0, ALPHA_I, t2
+
+ NMAD a1, ALPHA_I, t0, t4
+ MAD a1, ALPHA_R, t2, t5
+
+/*
+ MUL a1, ALPHA_I, t1
+ MUL a1, ALPHA_R, t3
+ SUB t0, t1, t4
+ ADD t2, t3, t5
+*/
+ ST t4, 0 * SIZE(X)
+ ST t5, 1 * SIZE(X)
+
+ addl X, 2*SIZE, X
+ ldi I, -1(I)
+ bne I, $RemainLoop
+ nop
+
+ ret
+ .align 4
+
+$Sub:
+ addl INCX, INCX, INCX
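+	/* INCX is given in complex elements; doubling converts it to FLOAT units */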
+
+ sra N, 2, I
+ ble I, $L15
+
+ LD a0, 0 * SIZE(X)
+ LD a1, 1 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD a2, 0 * SIZE(X)
+ LD a3, 1 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD a4, 0 * SIZE(X)
+ LD a5, 1 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD a6, 0 * SIZE(X)
+ LD a7, 1 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ MUL a0, ALPHA_R, t0
+ MUL a1, ALPHA_I, t1
+ MUL a0, ALPHA_I, t2
+ MUL a1, ALPHA_R, t3
+
+ SUB t0, t1, t4
+ ADD t2, t3, t5
+
+ ldi I, -1(I)
+ ble I, $L13
+ .align 4
+
+$L12:
+ ST t4, 0 * SIZE(XX)
+ MUL a2, ALPHA_R, t0
+ ST t5, 1 * SIZE(XX)
+ MUL a3, ALPHA_I, t1
+
+ MUL a2, ALPHA_I, t2
+ LD a0, 0 * SIZE(X)
+ MUL a3, ALPHA_R, t3
+ LD a1, 1 * SIZE(X)
+
+ SUB t0, t1, t6
+ SXADDQ INCX, XX, XX
+ ADD t2, t3, t7
+ SXADDQ INCX, X, X
+
+ MUL a4, ALPHA_R, t0
+ ST t6, 0 * SIZE(XX)
+ MUL a5, ALPHA_I, t1
+ ST t7, 1 * SIZE(XX)
+
+ MUL a4, ALPHA_I, t2
+ LD a2, 0 * SIZE(X)
+ MUL a5, ALPHA_R, t3
+ LD a3, 1 * SIZE(X)
+
+ SUB t0, t1, t4
+ SXADDQ INCX, XX, XX
+ ADD t2, t3, t5
+ SXADDQ INCX, X, X
+
+ MUL a6, ALPHA_R, t0
+ ST t4, 0 * SIZE(XX)
+ MUL a7, ALPHA_I, t1
+ ST t5, 1 * SIZE(XX)
+
+ MUL a6, ALPHA_I, t2
+ LD a4, 0 * SIZE(X)
+ MUL a7, ALPHA_R, t3
+ LD a5, 1 * SIZE(X)
+
+ SUB t0, t1, t6
+ SXADDQ INCX, XX, XX
+ ADD t2, t3, t7
+ SXADDQ INCX, X, X
+
+ MUL a0, ALPHA_R, t0
+ ST t6, 0 * SIZE(XX)
+ MUL a1, ALPHA_I, t1
+ ST t7, 1 * SIZE(XX)
+
+ MUL a0, ALPHA_I, t2
+ LD a6, 0 * SIZE(X)
+ MUL a1, ALPHA_R, t3
+ LD a7, 1 * SIZE(X)
+
+ SUB t0, t1, t4
+ ldi I, -1(I)
+ ADD t2, t3, t5
+ SXADDQ INCX, XX, XX
+
+ fillcs PREFETCHSIZE * SIZE(X)
+ unop
+ SXADDQ INCX, X, X
+ bne I, $L12
+ .align 4
+
+$L13:
+ MUL a2, ALPHA_R, t0
+ MUL a3, ALPHA_I, t1
+ ST t4, 0 * SIZE(XX)
+ MUL a2, ALPHA_I, t2
+ ST t5, 1 * SIZE(XX)
+ MUL a3, ALPHA_R, t3
+
+ SUB t0, t1, t6
+ SXADDQ INCX, XX, XX
+ ADD t2, t3, t7
+ unop
+
+ ST t6, 0 * SIZE(XX)
+ MUL a4, ALPHA_R, t0
+ ST t7, 1 * SIZE(XX)
+ MUL a5, ALPHA_I, t1
+ MUL a4, ALPHA_I, t2
+ MUL a5, ALPHA_R, t3
+
+ SUB t0, t1, t4
+ SXADDQ INCX, XX, XX
+ ADD t2, t3, t5
+ unop
+
+ MUL a6, ALPHA_R, t0
+ ST t4, 0 * SIZE(XX)
+ MUL a7, ALPHA_I, t1
+ ST t5, 1 * SIZE(XX)
+
+ MUL a6, ALPHA_I, t2
+ MUL a7, ALPHA_R, t3
+
+ SUB t0, t1, t6
+ SXADDQ INCX, XX, XX
+ ADD t2, t3, t7
+
+ ST t6, 0 * SIZE(XX)
+ ST t7, 1 * SIZE(XX)
+ SXADDQ INCX, XX, XX
+ .align 4
+
+$L15:
+ and N, 3, I
+ unop
+ unop
+ ble I, $L999
+ .align 4
+
+$L17:
+ LD a0, 0 * SIZE(X)
+ LD a1, 1 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ MUL a0, ALPHA_R, t0
+ MUL a1, ALPHA_I, t1
+ MUL a0, ALPHA_I, t2
+ MUL a1, ALPHA_R, t3
+
+ SUB t0, t1, t4
+ ADD t2, t3, t5
+
+ ST t4, 0 * SIZE(XX)
+ ST t5, 1 * SIZE(XX)
+ SXADDQ INCX, XX, XX
+
+ ldi I, -1(I)
+ bne I, $L17
+ .align 4
+
+$L999:
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/zscal_simd.S b/kernel/sw_64/zscal_simd.S
new file mode 100644
index 0000000..09d2f38
--- /dev/null
+++ b/kernel/sw_64/zscal_simd.S
@@ -0,0 +1,579 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define PREFETCHSIZE 96
+
+#define N $16
+#define X $21
+#define INCX $17
+
+#define XX $18
+#define I $19
+
+#define ALPHA_R $f19
+#define ALPHA_I $f20
+
+
+#define s0 $f0
+#define s1 $f1
+#define s2 $f10
+#define s3 $f11
+
+
+#define a0 $f12
+#define a1 $f13
+#define a2 $f14
+#define a3 $f15
+#define a4 $f16
+#define a5 $f17
+#define a6 $f18
+#define a7 $f21
+
+#define t0 $f22
+#define t1 $f23
+#define t2 $f24
+#define t3 $f25
+
+#define t4 $f26
+#define t5 $f27
+#define t6 $f28
+#define t7 $f29
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, 0, $26, 0
+
+ ldl INCX, 0($sp)
+ mov X, XX
+ cmpeq INCX, 1, $0
+ ble N, $L999
+
+ beq $0, $Sub
+ .align 5
+
+ and X, (VEC_LEN*SIZE-1), $6
+ bgt $6, $UnAlign_X_ACCESS
+
+/*
+	unroll by 8 complex elements (8*2 = 16 reals)
+*/
+ sra N, 3, I
+ vcpyf ALPHA_R, ALPHA_R
+ vcpyf ALPHA_I, ALPHA_I
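+	/* replicate ALPHA_R/ALPHA_I into every vector lane for the SIMD multiplies */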
+ ble I, $Remain
+
+ VLD a0, 0*VEC_LEN*SIZE(X)
+ VLD a1, 1*VEC_LEN*SIZE(X)
+ VLD a2, 2*VEC_LEN*SIZE(X)
+ VLD a3, 3*VEC_LEN*SIZE(X)
+
+ subl I, 1, I
+ addl X, 16*SIZE, X
+ ble I, $MainLoopEnd
+ .align 4
+
+
+$MainLoop:
+
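+/* de-interleave: gather the real parts of a0..a3 into a0/a2 and the     */
+/* imaginary parts into a1/a3 (lane order is permuted, but the symmetric */
+/* shuffle below restores it when the results are re-interleaved)        */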
+ vextf a0, 1, a4
+ vextf a0, 3, a5
+ vextf a1, 0, a6
+ vextf a1, 2, a7
+
+ vextf a2, 1, t0
+ vextf a2, 3, t1
+ vextf a3, 0, t2
+ vextf a3, 2, t3
+
+ vinsf a4, a1, 0, a1
+ vinsf a5, a1, 2, a1
+ vinsf a6, a0, 1, a0
+ vinsf a7, a0, 3, a0
+
+ vinsf t0, a3, 0, a3
+ vinsf t1, a3, 2, a3
+ vinsf t2, a2, 1, a2
+ vinsf t3, a2, 3, a2
+
+ VMUL ALPHA_R, a0, t4
+ VMUL ALPHA_I, a0, t5
+ VMUL ALPHA_R, a2, t6
+ VMUL ALPHA_I, a2, t7
+
+ VNMAD ALPHA_I, a1, t4, t0
+ VLD a0, 0*VEC_LEN*SIZE(X)
+ VMAD ALPHA_R, a1, t5, t1
+ VLD a1, 1*VEC_LEN*SIZE(X)
+
+ VNMAD ALPHA_I, a3, t6, t2
+ VLD a2, 2*VEC_LEN*SIZE(X)
+ VMAD ALPHA_R, a3, t7, t3
+ VLD a3, 3*VEC_LEN*SIZE(X)
+
+/* re-interleave the real (t0,t2) and imaginary (t1,t3) vectors back into complex vectors */
+ vextf t0, 1, a4
+ vextf t0, 3, a5
+ vextf t1, 0, a6
+ vextf t1, 2, a7
+
+ vextf t2, 1, s0
+ vextf t2, 3, s1
+ vextf t3, 0, s2
+ vextf t3, 2, s3
+
+ vinsf a4, t1, 0, t1
+ vinsf a5, t1, 2, t1
+ vinsf a6, t0, 1, t0
+ vinsf a7, t0, 3, t0
+
+ vinsf s0, t3, 0, t3
+ vinsf s1, t3, 2, t3
+ vinsf s2, t2, 1, t2
+ vinsf s3, t2, 3, t2
+
+ VST t0, -4*VEC_LEN*SIZE(X)
+ VST t1, -3*VEC_LEN*SIZE(X)
+ VST t2, -2*VEC_LEN*SIZE(X)
+ VST t3, -1*VEC_LEN*SIZE(X)
+
+ fillcs PREFETCHSIZE * SIZE(X)
+ subl I, 1, I
+ addl X, 16*SIZE, X
+ bgt I, $MainLoop
+ .align 4
+
+$MainLoopEnd:
+/* split the complex vectors into real vectors (a0,a2) and imaginary vectors (a1,a3) */
+ vextf a0, 1, a4
+ vextf a0, 3, a5
+ vextf a1, 0, a6
+ vextf a1, 2, a7
+
+ vextf a2, 1, t0
+ vextf a2, 3, t1
+ vextf a3, 0, t2
+ vextf a3, 2, t3
+
+ vinsf a4, a1, 0, a1
+ vinsf a5, a1, 2, a1
+ vinsf a6, a0, 1, a0
+ vinsf a7, a0, 3, a0
+
+ vinsf t0, a3, 0, a3
+ vinsf t1, a3, 2, a3
+ vinsf t2, a2, 1, a2
+ vinsf t3, a2, 3, a2
+
+ VMUL ALPHA_R, a0, t4
+ VMUL ALPHA_I, a0, t5
+ VMUL ALPHA_R, a2, t6
+ VMUL ALPHA_I, a2, t7
+
+ VNMAD ALPHA_I, a1, t4, t0
+ VMAD ALPHA_R, a1, t5, t1
+ VNMAD ALPHA_I, a3, t6, t2
+ VMAD ALPHA_R, a3, t7, t3
+
+/* re-interleave the real (t0,t2) and imaginary (t1,t3) vectors back into complex vectors */
+ vextf t0, 1, a4
+ vextf t0, 3, a5
+ vextf t1, 0, a6
+ vextf t1, 2, a7
+
+ vextf t2, 1, s0
+ vextf t2, 3, s1
+ vextf t3, 0, s2
+ vextf t3, 2, s3
+
+ vinsf a4, t1, 0, t1
+ vinsf a5, t1, 2, t1
+ vinsf a6, t0, 1, t0
+ vinsf a7, t0, 3, t0
+
+ vinsf s0, t3, 0, t3
+ vinsf s1, t3, 2, t3
+ vinsf s2, t2, 1, t2
+ vinsf s3, t2, 3, t2
+
+ VST t0, -4*VEC_LEN*SIZE(X)
+ VST t1, -3*VEC_LEN*SIZE(X)
+ VST t2, -2*VEC_LEN*SIZE(X)
+ VST t3, -1*VEC_LEN*SIZE(X)
+
+$Remain:
+ and N, 7, I
+ unop
+ unop
+ ble I, $L999
+ .align 5
+
+$Remain_loop:
+ LD a0, 0 * SIZE(X)
+ LD a1, 1 * SIZE(X)
+
+ MUL a0, ALPHA_R, t0
+ MUL a1, ALPHA_I, t1
+ MUL a0, ALPHA_I, t2
+ MUL a1, ALPHA_R, t3
+
+ SUB t0, t1, t4
+ ADD t2, t3, t5
+ ST t4, 0 * SIZE(X)
+ ST t5, 1 * SIZE(X)
+
+ addl X, 2*SIZE, X
+ ldi I, -1(I)
+ bne I, $Remain_loop
+ ret
+ .align 5
+
+$UnAlign_X_ACCESS:
+/*
+	unroll by 4 complex elements (4*2 = 8 reals)
+*/
+ sra N, 2, I
+ ble I, $Unalign_Remain
+
+ LD a0, 0 * SIZE(X)
+ LD a1, 1 * SIZE(X)
+
+ LD a2, 2 * SIZE(X)
+ LD a3, 3 * SIZE(X)
+
+ LD a4, 4 * SIZE(X)
+ MUL a0, ALPHA_R, t0
+ LD a5, 5 * SIZE(X)
+ MUL a0, ALPHA_I, t2
+
+ LD a6, 6 * SIZE(X)
+ NMAD a1, ALPHA_I, t0, t4
+ LD a7, 7 * SIZE(X)
+ MAD a1, ALPHA_R, t2, t5
+
+
+ ldi I, -1(I)
+ addl X, 8*SIZE, X
+ ble I, $Unalign_MainLoopEnd
+ .align 4
+
+$Unalign_MainLoop:
+ MUL a2, ALPHA_R, t0
+ ST t4, -8 * SIZE(X)
+ MUL a2, ALPHA_I, t2
+ ST t5, -7 * SIZE(X)
+
+
+ NMAD a3, ALPHA_I, t0, t6
+ LD a0, 0 * SIZE(X)
+ MAD a3, ALPHA_R, t2, t7
+ LD a1, 1 * SIZE(X)
+
+ ST t6, -6 * SIZE(X)
+ MUL a4, ALPHA_R, t0
+ ST t7, -5 * SIZE(X)
+ MUL a4, ALPHA_I, t2
+
+
+ NMAD a5, ALPHA_I, t0, t4
+ LD a2, 2 * SIZE(X)
+ MAD a5, ALPHA_R, t2, t5
+ LD a3, 3 * SIZE(X)
+
+ MUL a6, ALPHA_R, t0
+ ST t4, -4 * SIZE(X)
+ MUL a6, ALPHA_I, t2
+ ST t5, -3 * SIZE(X)
+
+ NMAD a7, ALPHA_I, t0, t6
+ LD a4, 4 * SIZE(X)
+ MAD a7, ALPHA_R, t2, t7
+ LD a5, 5 * SIZE(X)
+
+ MUL a0, ALPHA_R, t0
+ ST t6, -2 * SIZE(X)
+ MUL a0, ALPHA_I, t2
+ ST t7, -1 * SIZE(X)
+
+ NMAD a1, ALPHA_I, t0, t4
+ LD a6, 6 * SIZE(X)
+ MAD a1, ALPHA_R, t2, t5
+ LD a7, 7 * SIZE(X)
+
+
+
+ fillcs PREFETCHSIZE * SIZE(X)
+ subl I, 1, I
+ addl X, 8*SIZE, X
+ bgt I, $Unalign_MainLoop
+ .align 4
+
+$Unalign_MainLoopEnd:
+ MUL a2, ALPHA_R, t0
+ ST t4, -8 * SIZE(X)
+ MUL a2, ALPHA_I, t2
+ ST t5, -7 * SIZE(X)
+
+
+ NMAD a3, ALPHA_I, t0, t6
+ MAD a3, ALPHA_R, t2, t7
+
+
+ ST t6, -6 * SIZE(X)
+ MUL a4, ALPHA_R, t0
+ ST t7, -5 * SIZE(X)
+ MUL a4, ALPHA_I, t2
+
+
+ NMAD a5, ALPHA_I, t0, t4
+ MAD a5, ALPHA_R, t2, t5
+
+ MUL a6, ALPHA_R, t0
+ ST t4, -4 * SIZE(X)
+ MUL a6, ALPHA_I, t2
+ ST t5, -3 * SIZE(X)
+
+ NMAD a7, ALPHA_I, t0, t6
+ MAD a7, ALPHA_R, t2, t7
+ ST t6, -2 * SIZE(X)
+ ST t7, -1 * SIZE(X)
+
+ .align 4
+$Unalign_Remain:
+ and N, 3, I
+ unop
+ unop
+ ble I, $L999
+ .align 4
+
+$Unalign_RemainLoop:
+ LD a0, 0 * SIZE(X)
+ LD a1, 1 * SIZE(X)
+
+
+ MUL a0, ALPHA_R, t0
+ MUL a0, ALPHA_I, t2
+
+ NMAD a1, ALPHA_I, t0, t4
+ MAD a1, ALPHA_R, t2, t5
+
+ ST t4, 0 * SIZE(X)
+ ST t5, 1 * SIZE(X)
+
+ addl X, 2*SIZE, X
+ ldi I, -1(I)
+ bne I, $Unalign_RemainLoop
+ nop
+
+ ret
+ .align 4
+
+$Sub:
+ addl INCX, INCX, INCX
+
+ sra N, 2, I
+ ble I, $L15
+
+ LD a0, 0 * SIZE(X)
+ LD a1, 1 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD a2, 0 * SIZE(X)
+ LD a3, 1 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD a4, 0 * SIZE(X)
+ LD a5, 1 * SIZE(X)
+ SXADDQ INCX, X, X
+ LD a6, 0 * SIZE(X)
+ LD a7, 1 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ MUL a0, ALPHA_R, t0
+ MUL a1, ALPHA_I, t1
+ MUL a0, ALPHA_I, t2
+ MUL a1, ALPHA_R, t3
+
+ SUB t0, t1, t4
+ ADD t2, t3, t5
+
+ ldi I, -1(I)
+ ble I, $L13
+ .align 4
+
+$L12:
+ ST t4, 0 * SIZE(XX)
+ MUL a2, ALPHA_R, t0
+ ST t5, 1 * SIZE(XX)
+ MUL a3, ALPHA_I, t1
+
+ MUL a2, ALPHA_I, t2
+ LD a0, 0 * SIZE(X)
+ MUL a3, ALPHA_R, t3
+ LD a1, 1 * SIZE(X)
+
+ SUB t0, t1, t6
+ SXADDQ INCX, XX, XX
+ ADD t2, t3, t7
+ SXADDQ INCX, X, X
+
+ MUL a4, ALPHA_R, t0
+ ST t6, 0 * SIZE(XX)
+ MUL a5, ALPHA_I, t1
+ ST t7, 1 * SIZE(XX)
+
+ MUL a4, ALPHA_I, t2
+ LD a2, 0 * SIZE(X)
+ MUL a5, ALPHA_R, t3
+ LD a3, 1 * SIZE(X)
+
+ SUB t0, t1, t4
+ SXADDQ INCX, XX, XX
+ ADD t2, t3, t5
+ SXADDQ INCX, X, X
+
+ MUL a6, ALPHA_R, t0
+ ST t4, 0 * SIZE(XX)
+ MUL a7, ALPHA_I, t1
+ ST t5, 1 * SIZE(XX)
+
+ MUL a6, ALPHA_I, t2
+ LD a4, 0 * SIZE(X)
+ MUL a7, ALPHA_R, t3
+ LD a5, 1 * SIZE(X)
+
+ SUB t0, t1, t6
+ SXADDQ INCX, XX, XX
+ ADD t2, t3, t7
+ SXADDQ INCX, X, X
+
+ MUL a0, ALPHA_R, t0
+ ST t6, 0 * SIZE(XX)
+ MUL a1, ALPHA_I, t1
+ ST t7, 1 * SIZE(XX)
+
+ MUL a0, ALPHA_I, t2
+ LD a6, 0 * SIZE(X)
+ MUL a1, ALPHA_R, t3
+ LD a7, 1 * SIZE(X)
+
+ SUB t0, t1, t4
+ ldi I, -1(I)
+ ADD t2, t3, t5
+ SXADDQ INCX, XX, XX
+
+ fillcs PREFETCHSIZE * SIZE(X)
+ unop
+ SXADDQ INCX, X, X
+ bne I, $L12
+ .align 4
+
+$L13:
+ MUL a2, ALPHA_R, t0
+ MUL a3, ALPHA_I, t1
+ ST t4, 0 * SIZE(XX)
+ MUL a2, ALPHA_I, t2
+ ST t5, 1 * SIZE(XX)
+ MUL a3, ALPHA_R, t3
+
+ SUB t0, t1, t6
+ SXADDQ INCX, XX, XX
+ ADD t2, t3, t7
+ unop
+
+ ST t6, 0 * SIZE(XX)
+ MUL a4, ALPHA_R, t0
+ ST t7, 1 * SIZE(XX)
+ MUL a5, ALPHA_I, t1
+ MUL a4, ALPHA_I, t2
+ MUL a5, ALPHA_R, t3
+
+ SUB t0, t1, t4
+ SXADDQ INCX, XX, XX
+ ADD t2, t3, t5
+ unop
+
+ MUL a6, ALPHA_R, t0
+ ST t4, 0 * SIZE(XX)
+ MUL a7, ALPHA_I, t1
+ ST t5, 1 * SIZE(XX)
+
+ MUL a6, ALPHA_I, t2
+ MUL a7, ALPHA_R, t3
+
+ SUB t0, t1, t6
+ SXADDQ INCX, XX, XX
+ ADD t2, t3, t7
+
+ ST t6, 0 * SIZE(XX)
+ ST t7, 1 * SIZE(XX)
+ SXADDQ INCX, XX, XX
+ .align 4
+
+$L15:
+ and N, 3, I
+ unop
+ unop
+ ble I, $L999
+ .align 4
+
+$L17:
+ LD a0, 0 * SIZE(X)
+ LD a1, 1 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ MUL a0, ALPHA_R, t0
+ MUL a1, ALPHA_I, t1
+ MUL a0, ALPHA_I, t2
+ MUL a1, ALPHA_R, t3
+
+ SUB t0, t1, t4
+ ADD t2, t3, t5
+
+ ST t4, 0 * SIZE(XX)
+ ST t5, 1 * SIZE(XX)
+ SXADDQ INCX, XX, XX
+
+ ldi I, -1(I)
+ bne I, $L17
+ .align 4
+
+$L999:
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/zsum.S b/kernel/sw_64/zsum.S
new file mode 100644
index 0000000..7b8570c
--- /dev/null
+++ b/kernel/sw_64/zsum.S
@@ -0,0 +1,234 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define PREFETCHSIZE 88
+
+#define N $16
+#define X $17
+#define INCX $18
+#define I $19
+
+#define s0 $f0
+#define s1 $f1
+#define s2 $f10
+#define s3 $f11
+
+#define a0 $f12
+#define a1 $f13
+#define a2 $f14
+#define a3 $f15
+#define a4 $f16
+#define a5 $f17
+#define a6 $f18
+#define a7 $f19
+
+#define t0 $f20
+#define t1 $f21
+#define t2 $f22
+#define t3 $f23
+
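+/*
+	Sums the real and imaginary parts of every element (no absolute values).
+	The main loop is unrolled by 4 complex elements and software pipelined:
+	loads go through a0..a7, sums accumulate into s0..s3 via t0..t3, and the
+	partial sums are folded together at the end.
+*/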
+ PROLOGUE
+ PROFCODE
+
+ fclr s0
+ unop
+ fclr t0
+ addw INCX, INCX, $20
+ mov $20,INCX
+
+ fclr s1
+ unop
+ fclr t1
+ ble N, $L999
+
+ fclr s2
+ sra N, 2, I
+ fclr s3
+ ble I, $L15
+
+ LD a0, 0 * SIZE(X)
+ fclr t2
+ LD a1, 1 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ LD a2, 0 * SIZE(X)
+ fclr t3
+ LD a3, 1 * SIZE(X)
+ SXADDQ INCX, X, X
+
+ LD a4, 0 * SIZE(X)
+ LD a5, 1 * SIZE(X)
+ SXADDQ INCX, X, X
+ ldi I, -1(I)
+
+ ble I, $L13
+ .align 4
+
+$L12:
+ ADD s0, t0, $f24
+ fmov $f24,s0
+ ldl $31, PREFETCHSIZE * SIZE(X)
+ fmov a0, t0
+ ldi I, -1(I)
+
+ ADD s1, t1, $f24
+ fmov $f24,s1
+ LD a6, 0 * SIZE(X)
+ fmov a1, t1
+ unop
+
+ ADD s2, t2, $f24
+ fmov $f24,s2
+ LD a7, 1 * SIZE(X)
+ fmov a2, t2
+ SXADDQ INCX, X, X
+
+ ADD s3, t3, $f24
+ fmov $f24,s3
+ LD a0, 0 * SIZE(X)
+ fmov a3, t3
+ unop
+
+ ADD s0, t0, $f24
+ fmov $f24,s0
+ LD a1, 1 * SIZE(X)
+ fmov a4, t0
+ SXADDQ INCX, X, X
+
+ ADD s1, t1, $f24
+ fmov $f24,s1
+ LD a2, 0 * SIZE(X)
+ fmov a5, t1
+ unop
+
+ ADD s2, t2, $f24
+ fmov $f24,s2
+ LD a3, 1 * SIZE(X)
+ fmov a6, t2
+ SXADDQ INCX, X, X
+
+ ADD s3, t3, $f24
+ fmov $f24,s3
+ LD a4, 0 * SIZE(X)
+ fmov a7, t3
+ unop
+
+ LD a5, 1 * SIZE(X)
+ unop
+ SXADDQ INCX, X, X
+ bne I, $L12
+ .align 4
+
+$L13:
+ ADD s0, t0, $f24
+ fmov $f24,s0
+ LD a6, 0 * SIZE(X)
+ fmov a0, t0
+
+ ADD s1, t1, $f24
+ fmov $f24,s1
+ LD a7, 1 * SIZE(X)
+ fmov a1, t1
+ SXADDQ INCX, X, X
+
+ ADD s2, t2, $f24
+ fmov $f24,s2
+ fmov a2, t2
+ ADD s3, t3, $f24
+ fmov $f24,s3
+ fmov a3, t3
+
+ ADD s0, t0, $f24
+ fmov $f24,s0
+ fmov a4, t0
+ ADD s1, t1, $f24
+ fmov $f24,s1
+ fmov a5, t1
+ ADD s2, t2, $f24
+ fmov $f24,s2
+ fmov a6, t2
+ ADD s3, t3, $f24
+ fmov $f24,s3
+ fmov a7, t3
+
+ ADD s2, t2, $f24
+ fmov $f24,s2
+ ADD s3, t3, $f24
+ fmov $f24,s3
+
+ .align 4
+
+$L15:
+ ADD s0, s2, $f24
+ fmov $f24,s0
+ and N, 3, I
+ ADD s1, s3, $f24
+ fmov $f24,s1
+ ble I, $L999
+ .align 4
+
+$L17:
+ ADD s0, t0, $f24
+ fmov $f24,s0
+ LD a0, 0 * SIZE(X)
+ fmov a0, t0
+ ldi I, -1(I)
+
+ ADD s1, t1, $f24
+ fmov $f24,s1
+ LD a1, 1 * SIZE(X)
+ fmov a1, t1
+ SXADDQ INCX, X, X
+
+ bne I, $L17
+ .align 4
+
+$L999:
+ ADD s0, t0, $f24
+ fmov $f24,s0
+ ADD s1, t1, $f24
+ fmov $f24,s1
+
+ ADD s0, s1, $f24
+ fmov $f24,s0
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/zswap.S.bak b/kernel/sw_64/zswap.S.bak
new file mode 100644
index 0000000..f0b19dd
--- /dev/null
+++ b/kernel/sw_64/zswap.S.bak
@@ -0,0 +1,244 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, 0, $26, 0
+
+ mov $21, $17
+ ldl $18, 0($sp)
+ ldl $19, 8($sp)
+ ldl $20, 16($sp)
+#ifndef PROFILE
+ .prologue 0
+#else
+ .prologue 1
+#endif
+
+	ble	$16, $SubEnd	# if n <= 0 goto $SubEnd
+
+ cmpeq $18, 1, $1
+ addl $18, $18, $18
+ cmpeq $20, 1, $2
+ addl $20, $20, $20
+
+ sra $16, 2, $21
+ and $1, $2, $1
+ and $16, 3, $22
+ beq $1, $Sub
+
+ ble $21, $MainRemain
+ .align 4
+
+$MainLoop:
+ LD $f10, 0*SIZE($19)
+ LD $f11, 1*SIZE($19)
+ LD $f12, 2*SIZE($19)
+ LD $f13, 3*SIZE($19)
+ LD $f14, 4*SIZE($19)
+ LD $f15, 5*SIZE($19)
+ LD $f16, 6*SIZE($19)
+ LD $f17, 7*SIZE($19)
+
+ LD $f20, 0*SIZE($17)
+ LD $f21, 1*SIZE($17)
+ LD $f22, 2*SIZE($17)
+ LD $f23, 3*SIZE($17)
+ LD $f24, 4*SIZE($17)
+ LD $f25, 5*SIZE($17)
+ LD $f26, 6*SIZE($17)
+ LD $f27, 7*SIZE($17)
+
+ fillcs 16*SIZE($17)
+ unop
+ fillcs 16*SIZE($19)
+ subl $21, 1, $21
+
+ ST $f10, 0*SIZE($17)
+ ST $f11, 1*SIZE($17)
+ ST $f12, 2*SIZE($17)
+ ST $f13, 3*SIZE($17)
+ ST $f14, 4*SIZE($17)
+ ST $f15, 5*SIZE($17)
+ ST $f16, 6*SIZE($17)
+ ST $f17, 7*SIZE($17)
+
+ ST $f20, 0*SIZE($19)
+ ST $f21, 1*SIZE($19)
+ ST $f22, 2*SIZE($19)
+ ST $f23, 3*SIZE($19)
+ ST $f24, 4*SIZE($19)
+ ST $f25, 5*SIZE($19)
+ ST $f26, 6*SIZE($19)
+ ST $f27, 7*SIZE($19)
+
+ ldi $17, 8*SIZE($17)
+ ldi $19, 8*SIZE($19)
+ bgt $21, $MainLoop
+ .align 4
+
+$MainRemain:
+ ble $22, $MainEnd
+ .align 4
+
+$MainRemainLoop:
+ LD $f10, 0*SIZE($19)
+ LD $f11, 1*SIZE($19)
+ LD $f20, 0*SIZE($17)
+ LD $f21, 1*SIZE($17)
+
+ ldi $17, 2*SIZE($17)
+ ldi $19, 2*SIZE($19)
+ subl $22, 1, $22
+ ST $f10, -2*SIZE($17)
+ ST $f11, -1*SIZE($17)
+ ST $f20, -2*SIZE($19)
+ ST $f21, -1*SIZE($19)
+ bgt $22, $MainRemainLoop
+ .align 4
+
+$MainEnd:
+ clr $0
+ ret
+ .align 4
+
+$Sub:
+ mov $17, $23
+ mov $19, $24
+ ble $21, $SubRemain
+ .align 4
+
+$SubLoop:
+ LD $f10, 0*SIZE($19)
+ LD $f11, 1*SIZE($19)
+ SXADDQ $20, $19, $19
+
+ LD $f12, 0*SIZE($19)
+ LD $f13, 1*SIZE($19)
+ SXADDQ $20, $19, $19
+
+ LD $f14, 0*SIZE($19)
+ LD $f15, 1*SIZE($19)
+ SXADDQ $20, $19, $19
+
+ LD $f16, 0*SIZE($19)
+ LD $f17, 1*SIZE($19)
+ SXADDQ $20, $19, $19
+
+ LD $f20, 0*SIZE($17)
+ LD $f21, 1*SIZE($17)
+ SXADDQ $18, $17, $17
+
+ LD $f22, 0*SIZE($17)
+ LD $f23, 1*SIZE($17)
+ SXADDQ $18, $17, $17
+
+ LD $f24, 0*SIZE($17)
+ LD $f25, 1*SIZE($17)
+ SXADDQ $18, $17, $17
+
+ LD $f26, 0*SIZE($17)
+ LD $f27, 1*SIZE($17)
+ SXADDQ $18, $17, $17
+
+ ST $f10, 0*SIZE($23)
+ ST $f11, 1*SIZE($23)
+ SXADDQ $18, $23, $23
+
+ ST $f12, 0*SIZE($23)
+ ST $f13, 1*SIZE($23)
+ SXADDQ $18, $23, $23
+
+ ST $f14, 0*SIZE($23)
+ ST $f15, 1*SIZE($23)
+ SXADDQ $18, $23, $23
+
+ ST $f16, 0*SIZE($23)
+ ST $f17, 1*SIZE($23)
+ SXADDQ $18, $23, $23
+
+ ST $f20, 0*SIZE($24)
+ ST $f21, 1*SIZE($24)
+ SXADDQ $20, $24, $24
+
+ ST $f22, 0*SIZE($24)
+ ST $f23, 1*SIZE($24)
+ SXADDQ $20, $24, $24
+
+ ST $f24, 0*SIZE($24)
+ ST $f25, 1*SIZE($24)
+ SXADDQ $20, $24, $24
+
+ ST $f26, 0*SIZE($24)
+ ST $f27, 1*SIZE($24)
+ SXADDQ $20, $24, $24
+
+ subl $21, 1, $21
+ bgt $21, $SubLoop
+ .align 4
+
+$SubRemain:
+ ble $22, $SubEnd
+ .align 4
+
+$SubRemainLoop:
+ LD $f10, 0*SIZE($19)
+ LD $f11, 1*SIZE($19)
+ LD $f20, 0*SIZE($17)
+ LD $f21, 1*SIZE($17)
+
+ subl $22, 1, $22
+
+ ST $f10, 0*SIZE($17)
+ ST $f11, 1*SIZE($17)
+ ST $f20, 0*SIZE($19)
+ ST $f21, 1*SIZE($19)
+
+ SXADDQ $18, $17, $17
+ SXADDQ $20, $19, $19
+ bgt $22, $SubRemainLoop
+ .align 4
+
+$SubEnd:
+ clr $0
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/zswap.c b/kernel/sw_64/zswap.c
new file mode 100644
index 0000000..ae4760a
--- /dev/null
+++ b/kernel/sw_64/zswap.c
@@ -0,0 +1,72 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+* BLASTEST float : OK
+* BLASTEST double : OK
+* CTEST : OK
+* TEST : OK
+*
+**************************************************************************************/
+
+#include "common.h"
+#include <stdio.h>
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+ BLASLONG i=0;
+ BLASLONG ix=0,iy=0;
+ FLOAT temp[2];
+ BLASLONG inc_x2;
+ BLASLONG inc_y2;
+
+ if ( n < 0 ) return(0);
+
+ inc_x2 = 2 * inc_x;
+ inc_y2 = 2 * inc_y;
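+	/* each complex element spans two FLOAT slots, hence the doubled strides */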
+
+ while(i < n)
+ {
+
+ temp[0] = x[ix] ;
+ temp[1] = x[ix+1] ;
+ x[ix] = y[iy] ;
+ x[ix+1] = y[iy+1] ;
+ y[iy] = temp[0] ;
+ y[iy+1] = temp[1] ;
+
+ ix += inc_x2 ;
+ iy += inc_y2 ;
+ i++ ;
+
+ }
+ return(0);
+
+}
+
+
diff --git a/kernel/sw_64/zswap_simd.S b/kernel/sw_64/zswap_simd.S
new file mode 100644
index 0000000..e49c95b
--- /dev/null
+++ b/kernel/sw_64/zswap_simd.S
@@ -0,0 +1,306 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define PREFETCHSIZE 64
+#define X $17
+#define Y $19
+
+ PROLOGUE
+ PROFCODE
+ .frame $sp, 0, $26, 0
+
+ mov $21, $17
+ ldl $18, 0($sp)
+ ldl $19, 8($sp)
+ ldl $20, 16($sp)
+#ifndef PROFILE
+ .prologue 0
+#else
+ .prologue 1
+#endif
+
+	ble	$16, $SubEnd	# if n <= 0 goto $SubEnd
+
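+/* $1/$2 flag unit stride (incx/incy == 1, tested before the increments  */
+/* are doubled from complex elements to FLOAT units)                     */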
+ cmpeq $18, 1, $1
+ addl $18, $18, $18
+ cmpeq $20, 1, $2
+ addl $20, $20, $20
+
+/*
+	unroll by 8 complex elements (16 reals)
+*/
+
+ sra $16, 3, $21
+ and $1, $2, $1
+ and $16, 7, $22
+ beq $1, $Sub
+
+/*
+	check that both X and Y are aligned to the vector width (VEC_LEN*SIZE)
+*/
+ and Y, (VEC_LEN*SIZE-1), $4
+ and X, (VEC_LEN*SIZE-1), $3
+ or $3, $4, $4
+ bne $4, $UnAlign_ACCESS
+
+/* aligned-access path */
+
+ ble $21, $MainRemain
+ .align 4
+
+$MainLoop:
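+/* swap 8 complex elements (four full vectors from each of X and Y) per  */
+/* iteration using aligned vector loads and stores                       */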
+ VLD $f10, 0*VEC_LEN*SIZE(Y)
+ VLD $f11, 1*VEC_LEN*SIZE(Y)
+ VLD $f12, 2*VEC_LEN*SIZE(Y)
+ VLD $f13, 3*VEC_LEN*SIZE(Y)
+
+ VLD $f20, 0*VEC_LEN*SIZE(X)
+ VLD $f21, 1*VEC_LEN*SIZE(X)
+ VLD $f22, 2*VEC_LEN*SIZE(X)
+ VLD $f23, 3*VEC_LEN*SIZE(X)
+
+ fillcs PREFETCHSIZE * SIZE(X)
+ unop
+ fillcs PREFETCHSIZE * SIZE(Y)
+ subl $21, 1, $21
+
+ VST $f10, 0*VEC_LEN*SIZE(X)
+ VST $f11, 1*VEC_LEN*SIZE(X)
+ VST $f12, 2*VEC_LEN*SIZE(X)
+ VST $f13, 3*VEC_LEN*SIZE(X)
+
+ VST $f20, 0*VEC_LEN*SIZE(Y)
+ VST $f21, 1*VEC_LEN*SIZE(Y)
+ VST $f22, 2*VEC_LEN*SIZE(Y)
+ VST $f23, 3*VEC_LEN*SIZE(Y)
+
+ ldi $17, 16*SIZE(X)
+ ldi $19, 16*SIZE(Y)
+ bgt $21, $MainLoop
+ .align 4
+
+ jmp $MainRemain
+ .align 4
+
+$UnAlign_ACCESS:
+ sra $16, 2, $21
+ and $16, 3, $22
+ nop
+ ble $21, $MainRemain
+ .align 4
+$UnAlign_ACCESS_MainLoop:
+
+ LD $f10, 0*SIZE(Y)
+ LD $f11, 1*SIZE(Y)
+ LD $f12, 2*SIZE(Y)
+ LD $f13, 3*SIZE(Y)
+ LD $f14, 4*SIZE(Y)
+ LD $f15, 5*SIZE(Y)
+ LD $f16, 6*SIZE(Y)
+ LD $f17, 7*SIZE(Y)
+
+ LD $f20, 0*SIZE(X)
+ LD $f21, 1*SIZE(X)
+ LD $f22, 2*SIZE(X)
+ LD $f23, 3*SIZE(X)
+ LD $f24, 4*SIZE(X)
+ LD $f25, 5*SIZE(X)
+ LD $f26, 6*SIZE(X)
+ LD $f27, 7*SIZE(X)
+
+ fillcs 16*SIZE(X)
+ unop
+ fillcs 16*SIZE(Y)
+ subl $21, 1, $21
+
+ ST $f10, 0*SIZE(X)
+ ST $f11, 1*SIZE(X)
+ ST $f12, 2*SIZE(X)
+ ST $f13, 3*SIZE(X)
+ ST $f14, 4*SIZE(X)
+ ST $f15, 5*SIZE(X)
+ ST $f16, 6*SIZE(X)
+ ST $f17, 7*SIZE(X)
+
+ ST $f20, 0*SIZE(Y)
+ ST $f21, 1*SIZE(Y)
+ ST $f22, 2*SIZE(Y)
+ ST $f23, 3*SIZE(Y)
+ ST $f24, 4*SIZE(Y)
+ ST $f25, 5*SIZE(Y)
+ ST $f26, 6*SIZE(Y)
+ ST $f27, 7*SIZE(Y)
+
+ ldi X, 8*SIZE(X)
+ ldi Y, 8*SIZE(Y)
+ bgt $21, $UnAlign_ACCESS_MainLoop
+ .align 4
+
+$MainRemain:
+ ble $22, $MainEnd
+ .align 4
+
+$MainRemainLoop:
+ LD $f10, 0*SIZE(Y)
+ LD $f11, 1*SIZE(Y)
+ LD $f20, 0*SIZE(X)
+ LD $f21, 1*SIZE(X)
+
+ ldi X, 2*SIZE(X)
+ ldi Y, 2*SIZE(Y)
+ subl $22, 1, $22
+ ST $f10, -2*SIZE(X)
+ ST $f11, -1*SIZE(X)
+ ST $f20, -2*SIZE(Y)
+ ST $f21, -1*SIZE(Y)
+ bgt $22, $MainRemainLoop
+ .align 4
+
+$MainEnd:
+ clr $0
+ ret
+ .align 4
+
+$Sub:
+ sra $16, 2, $21
+ and $16, 3, $22
+
+ mov $17, $23
+ mov $19, $24
+ ble $21, $SubRemain
+ .align 4
+
+$SubLoop:
+ LD $f10, 0*SIZE($19)
+ LD $f11, 1*SIZE($19)
+ SXADDQ $20, $19, $19
+
+ LD $f12, 0*SIZE($19)
+ LD $f13, 1*SIZE($19)
+ SXADDQ $20, $19, $19
+
+ LD $f14, 0*SIZE($19)
+ LD $f15, 1*SIZE($19)
+ SXADDQ $20, $19, $19
+
+ LD $f16, 0*SIZE($19)
+ LD $f17, 1*SIZE($19)
+ SXADDQ $20, $19, $19
+
+ LD $f20, 0*SIZE($17)
+ LD $f21, 1*SIZE($17)
+ SXADDQ $18, $17, $17
+
+ LD $f22, 0*SIZE($17)
+ LD $f23, 1*SIZE($17)
+ SXADDQ $18, $17, $17
+
+ LD $f24, 0*SIZE($17)
+ LD $f25, 1*SIZE($17)
+ SXADDQ $18, $17, $17
+
+ LD $f26, 0*SIZE($17)
+ LD $f27, 1*SIZE($17)
+ SXADDQ $18, $17, $17
+
+ ST $f10, 0*SIZE($23)
+ ST $f11, 1*SIZE($23)
+ SXADDQ $18, $23, $23
+
+ ST $f12, 0*SIZE($23)
+ ST $f13, 1*SIZE($23)
+ SXADDQ $18, $23, $23
+
+ ST $f14, 0*SIZE($23)
+ ST $f15, 1*SIZE($23)
+ SXADDQ $18, $23, $23
+
+ ST $f16, 0*SIZE($23)
+ ST $f17, 1*SIZE($23)
+ SXADDQ $18, $23, $23
+
+ ST $f20, 0*SIZE($24)
+ ST $f21, 1*SIZE($24)
+ SXADDQ $20, $24, $24
+
+ ST $f22, 0*SIZE($24)
+ ST $f23, 1*SIZE($24)
+ SXADDQ $20, $24, $24
+
+ ST $f24, 0*SIZE($24)
+ ST $f25, 1*SIZE($24)
+ SXADDQ $20, $24, $24
+
+ ST $f26, 0*SIZE($24)
+ ST $f27, 1*SIZE($24)
+ SXADDQ $20, $24, $24
+
+ subl $21, 1, $21
+ bgt $21, $SubLoop
+ .align 4
+
+$SubRemain:
+ ble $22, $SubEnd
+ .align 4
+
+$SubRemainLoop:
+ LD $f10, 0*SIZE($19)
+ LD $f11, 1*SIZE($19)
+ LD $f20, 0*SIZE($17)
+ LD $f21, 1*SIZE($17)
+
+ subl $22, 1, $22
+
+ ST $f10, 0*SIZE($17)
+ ST $f11, 1*SIZE($17)
+ ST $f20, 0*SIZE($19)
+ ST $f21, 1*SIZE($19)
+
+ SXADDQ $18, $17, $17
+ SXADDQ $20, $19, $19
+ bgt $22, $SubRemainLoop
+ .align 4
+
+$SubEnd:
+ clr $0
+ ret
+ EPILOGUE
diff --git a/kernel/sw_64/ztrsm_kernel_2x2_LN.S b/kernel/sw_64/ztrsm_kernel_2x2_LN.S
new file mode 100644
index 0000000..3a14e58
--- /dev/null
+++ b/kernel/sw_64/ztrsm_kernel_2x2_LN.S
@@ -0,0 +1,2593 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#if !defined(EV4) && !defined(EV5) && !defined(SW6)
+#error "Architecture is not specified."
+#endif
+
+#ifdef SW6
+#define PREFETCHSIZE 56
+#define UNOP unop
+#endif
+
+#ifdef EV5
+#define PREFETCHSIZE 48
+#define UNOP
+#endif
+
+#ifdef EV4
+#define UNOP
+#endif
+
+ .set noat
+ .set noreorder
+ .arch sw6a
+
+.text
+ .align 5
+ .globl CNAME
+ .ent CNAME
+
+#define STACKSIZE 88
+
+#define M $16
+#define N $17
+#define K $18
+#define A $21
+#define B $22
+#define C $20
+#define LDC $23
+
+#define C1 $19
+#define C2 $24
+
+#define AO $at
+#define BO $5
+#define I $6
+#define J $7
+#define L $8
+
+#define tmp $9
+
+#define a1 $f16
+#define a2 $f17
+#define a3 $f18
+#define a4 $f19
+
+#define b1 $f20
+#define b2 $f21
+#define b3 $f22
+#define b4 $f23
+
+#define t1 $f24
+#define t2 $f25
+#define t3 $f26
+#define t4 $f27
+
+#define a5 $f28
+#define a6 $f30
+#define b5 $f29
+
+#define alpha_i $f29
+#define alpha_r $f30
+
+#define c01 $f0
+#define c02 $f1
+#define c03 $f2
+#define c04 $f3
+
+#define c05 $f4
+#define c06 $f5
+#define c07 $f6
+#define c08 $f7
+
+#define c09 $f8
+#define c10 $f9
+#define c11 $f10
+#define c12 $f11
+
+#define c13 $f12
+#define c14 $f13
+#define c15 $f14
+#define c16 $f15
+
+#define TMP1 $0
+#define TMP2 $1
+#define KK $2
+#define AORIG $3
+#define OFFSET $4
+
+#if defined(LN) || defined(LT)
+#ifndef CONJ
+#define ADD1 ADD
+#define ADD2 SUB
+#define ADD3 ADD
+#define ADD4 ADD
+#define ADD5 SUB
+#define ADD6 ADD
+#else
+#define ADD1 ADD
+#define ADD2 ADD
+#define ADD3 SUB
+#define ADD4 ADD
+#define ADD5 ADD
+#define ADD6 SUB
+#endif
+#else
+#ifndef CONJ
+#define ADD1 ADD
+#define ADD2 SUB
+#define ADD3 ADD
+#define ADD4 ADD
+#define ADD5 SUB
+#define ADD6 ADD
+#else
+#define ADD1 ADD
+#define ADD2 ADD
+#define ADD3 ADD
+#define ADD4 SUB
+#define ADD5 ADD
+#define ADD6 SUB
+#endif
+#endif
+
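+/* ADD1..ADD4 choose ADD or SUB for the cross terms of the complex       */
+/* multiply-accumulate, and ADD5/ADD6 for the complex triangular solve,  */
+/* so one kernel body covers the plain and conjugated (CONJ) variants.   */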
+
+CNAME:
+ .frame $sp, STACKSIZE, $26, 0
+
+#ifdef PROFILE
+ ldgp $gp, 0($27)
+ ldi $at, _mcount
+ jsr $at, ($at), _mcount
+#endif
+
+#ifndef PROFILE
+ .prologue 0
+#else
+ .prologue 1
+#endif
+
+ ldi $sp, -STACKSIZE($sp)
+
+ ldl B, 0 + STACKSIZE($sp)
+ ldl C, 8 + STACKSIZE($sp)
+ ldl LDC, 16 + STACKSIZE($sp)
+ ldl OFFSET, 24 + STACKSIZE($sp)
+
+ sll LDC, ZBASE_SHIFT, LDC
+
+ fstd $f2, 0($sp)
+ fstd $f3, 8($sp)
+ fstd $f4, 16($sp)
+ fstd $f5, 24($sp)
+ fstd $f6, 32($sp)
+ fstd $f7, 40($sp)
+ fstd $f8, 48($sp)
+ fstd $f9, 56($sp)
+ stl tmp, 72($sp)
+
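+	/* skip to the end if any of M, N or K is non-positive */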
+ cmple M, 0, $0
+ cmple N, 0, $1
+ cmple K, 0, $2
+
+ or $0, $1, $0
+ or $0, $2, $0
+ bne $0, $L999
+
+#ifdef LN
+ addl M, M, TMP2
+ mull TMP2, K, TMP1
+ SXADDQ TMP1, A, A
+ SXADDQ TMP2, C, C
+#endif
+
+#ifdef RN
+ negl OFFSET, KK
+#endif
+
+#ifdef RT
+ mull N, K, TMP1
+ addl TMP1, TMP1, TMP1
+ SXADDQ TMP1, B, B
+
+ mull N, LDC, TMP1
+ addl TMP1, C, C
+
+ subl N, OFFSET, KK
+#endif
+
+ sra N, 1, J
+ ble J, $L30
+ .align 4
+
+$L01:
+#ifdef RT
+ sll K, ZBASE_SHIFT + 1, TMP1
+ subl B, TMP1, B
+
+ subl C, LDC, C2
+ subl C2, LDC, C1
+ subl C2, LDC, C
+#else
+ mov C, C1
+ addl C, LDC, C2
+ addl C2, LDC, C
+#endif
+
+#ifdef LN
+ addl M, OFFSET, KK
+#endif
+
+#ifdef LT
+ mov OFFSET, KK
+#endif
+
+#if defined(LN) || defined(RT)
+ mov A, AORIG
+#else
+ mov A, AO
+#endif
+
+ and M, 1, I
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+ fclr c01
+ fclr c05
+ ble I, $L20
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(B)
+ fclr c10
+ LD b2, 1 * SIZE(B)
+ fclr c14
+
+ LD b3, 2 * SIZE(B)
+ ldi AO, 2 * SIZE(AO)
+ LD b4, 3 * SIZE(B)
+ ldi BO, 4 * SIZE(B)
+
+ ldi L, -2(KK)
+
+ ble KK, $L28
+ ble L, $L25
+#else
+#ifdef LN
+ sll K, ZBASE_SHIFT + 0, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, ZBASE_SHIFT + 0, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, ZBASE_SHIFT + 1, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(BO)
+ fclr c10
+ LD b2, 1 * SIZE(BO)
+ fclr c14
+
+ LD b3, 2 * SIZE(BO)
+ ldi AO, 2 * SIZE(AO)
+ LD b4, 3 * SIZE(BO)
+ ldi BO, 4 * SIZE(BO)
+
+ ldi L, -2(TMP1)
+
+ ble TMP1, $L28
+ ble L, $L25
+#endif
+ .align 5
+
+$L22:
+ ADD1 c09, t1, b5
+ fmov b5, c09
+// unop
+ MUL a1, b1, t1
+ unop
+
+ ADD3 c10, t2, b5
+ fmov b5, c10
+// unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD4 c13, t3, b5
+ fmov b5, c13
+// unop
+ MUL a1, b2, t3
+ ldi BO, 8 * SIZE(BO)
+
+ ADD2 c14, t4, b5
+ fmov b5, c14
+// unop
+ MUL a2, b2, t4
+ LD b2, -7 * SIZE(BO)
+
+ ADD1 c01, t1, b5
+ fmov b5, c01
+// unop
+ MUL a1, b3, t1
+ unop
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+// unop
+ MUL a2, b3, t2
+ LD b3, -6 * SIZE(BO)
+
+ ADD4 c05, t3, b5
+ fmov b5, c05
+// unop
+ MUL a1, b4, t3
+ LD a1, 2 * SIZE(AO)
+
+ ADD2 c06, t4, b5
+ fmov b5, c06
+ MUL a2, b4, t4
+ LD b5, -5 * SIZE(BO)
+ FIMOVD b5, tmp
+
+ ADD1 c09, t1, b5
+ fmov b5, c09
+// unop
+ MUL a3, b1, t1
+ LD a2, 3 * SIZE(AO)
+
+ ADD3 c10, t2, b5
+ fmov b5, c10
+// unop
+ MUL a4, b1, t2
+ LD b1, -4 * SIZE(BO)
+
+ ADD4 c13, t3, b5
+ fmov b5, c13
+// unop
+ MUL a3, b2, t3
+ ldi AO, 4 * SIZE(AO)
+
+ ADD2 c14, t4, b5
+ fmov b5, c14
+ MUL a4, b2, t4
+ LD b2, -3 * SIZE(BO)
+
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ ldi L, -2(L)
+ MUL a3, b3, t1
+ LD b4, -1 * SIZE(BO)
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+// unop
+ MUL a4, b3, t2
+ LD b3, -2 * SIZE(BO)
+
+ ADD4 c05, t3, b5
+ fmov b5, c05
+// unop
+ IFMOVD tmp, b5
+ MUL a3, b5, t3
+ LD a3, 0 * SIZE(AO)
+
+ ADD2 c06, t4, b5
+ fmov b5, c06
+ IFMOVD tmp, b5
+ MUL a4, b5, t4
+ LD a4, 1 * SIZE(AO)
+ bgt L, $L22
+ .align 4
+
+$L25:
+ ADD1 c09, t1, b5
+ fmov b5, c09
+ MUL a1, b1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L27
+#else
+ blbs TMP1, $L27
+#endif
+ .align 4
+
+ ADD3 c10, t2, b5
+ fmov b5, c10
+// unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD4 c13, t3, b5
+ fmov b5, c13
+// unop
+ MUL a1, b2, t3
+ unop
+
+ ADD2 c14, t4, b5
+ fmov b5, c14
+// unop
+ MUL a2, b2, t4
+ LD b2, 1 * SIZE(BO)
+
+ ADD1 c01, t1, b5
+ fmov b5, c01
+// unop
+ MUL a1, b3, t1
+ ldi AO, 2 * SIZE(AO)
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+// unop
+ MUL a2, b3, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD4 c05, t3, b5
+ fmov b5, c05
+// unop
+ MUL a1, b4, t3
+ LD a1, -2 * SIZE(AO)
+
+ ADD2 c06, t4, b5
+ fmov b5, c06
+// unop
+ MUL a2, b4, t4
+ LD a2, -1 * SIZE(AO)
+
+ ADD1 c09, t1, b5
+ fmov b5, c09
+ LD b4, 3 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L27:
+ ADD3 c10, t2, b5
+ fmov b5, c10
+ MUL a2, b1, t2
+ ADD4 c13, t3, b5
+ fmov b5, c13
+ MUL a1, b2, t3
+
+ ADD2 c14, t4, b5
+ fmov b5, c14
+ MUL a2, b2, t4
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ MUL a1, b3, t1
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ MUL a2, b3, t2
+ ADD4 c05, t3, b5
+ fmov b5, c05
+ MUL a1, b4, t3
+
+ ADD2 c06, t4, b5
+ fmov b5, c06
+ ldi AO, 2 * SIZE(AO)
+ MUL a2, b4, t4
+ ldi BO, 4 * SIZE(BO)
+
+ ADD1 c09, t1, b5
+ fmov b5, c09
+ ADD3 c10, t2, b5
+ fmov b5, c10
+ ADD4 c13, t3, b5
+ fmov b5, c13
+ ADD2 c14, t4, b5
+ fmov b5, c14
+
+ ADD c01, c06, b5
+ fmov b5, c01
+ ADD c02, c05, b5
+ fmov b5, c02
+ ADD c09, c14, b5
+ fmov b5, c09
+ ADD c10, c13, b5
+ fmov b5, c10
+ .align 4
+
+$L28:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 1, TMP1
+#else
+ subl KK, 2, TMP1
+#endif
+ sll TMP1, ZBASE_SHIFT + 0, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -2 * SIZE(AO)
+ ldi BO, -4 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+ SUB a3, c09, b5
+ fmov b5, c09
+ SUB a4, c10, b5
+ fmov b5, c10
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+ SUB a3, c09, b5
+ fmov b5, c09
+ SUB a4, c10, b5
+ fmov b5, c10
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a2, c10, t3
+ MUL a2, c09, t4
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a1, c09, b5
+ fmov b5, c09
+ MUL a1, c10, b5
+ fmov b5, c10
+
+ ADD5 c01, t1, b5
+ fmov b5, c01
+ ADD6 c02, t2, b5
+ fmov b5, c02
+ ADD5 c09, t3, b5
+ fmov b5, c09
+ ADD6 c10, t4, b5
+ fmov b5, c10
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+
+ ADD5 c01, t1, b5
+ fmov b5, c01
+ ADD6 c02, t2, b5
+ fmov b5, c02
+
+ MUL a3, c01, t1
+ MUL a3, c02, t2
+ SUB c09, t1, b5
+ fmov b5, c09
+ SUB c10, t2, b5
+ fmov b5, c10
+
+ MUL a4, c02, t1
+ MUL a4, c01, t2
+ ADD6 c09, t1, b5
+ fmov b5, c09
+ ADD5 c10, t2, b5
+ fmov b5, c10
+
+ LD a1, 6 * SIZE(BO)
+ LD a2, 7 * SIZE(BO)
+
+ MUL a2, c10, t1
+ MUL a2, c09, t2
+ MUL a1, c09, c09
+ MUL a1, c10, c10
+
+ ADD5 c09, t1, b5
+ fmov b5, c09
+ ADD6 c10, t2, b5
+ fmov b5, c10
+#endif
+
+#ifdef RT
+ LD a1, 6 * SIZE(BO)
+ LD a2, 7 * SIZE(BO)
+ LD a3, 4 * SIZE(BO)
+ LD a4, 5 * SIZE(BO)
+
+ MUL a2, c10, t1
+ MUL a2, c09, t2
+ MUL a1, c09, c09
+ MUL a1, c10, c10
+
+ ADD5 c09, t1, b5
+ fmov b5, c09
+ ADD6 c10, t2, b5
+ fmov b5, c10
+
+ MUL a3, c09, t1
+ MUL a3, c10, t2
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c02, t2, b5
+ fmov b5, c02
+
+ MUL a4, c10, t1
+ MUL a4, c09, t2
+ ADD6 c01, t1, b5
+ fmov b5, c01
+ ADD5 c02, t2, b5
+ fmov b5, c02
+
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+
+ ADD5 c01, t1, b5
+ fmov b5, c01
+ ADD6 c02, t2, b5
+ fmov b5, c02
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c02, 1 * SIZE(BO)
+ ST c09, 2 * SIZE(BO)
+ ST c10, 3 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c09, 2 * SIZE(AO)
+ ST c10, 3 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -2 * SIZE(C1)
+ ldi C2, -2 * SIZE(C2)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c09, 0 * SIZE(C2)
+ ST c10, 1 * SIZE(C2)
+
+#ifndef LN
+ ldi C1, 2 * SIZE(C1)
+ ldi C2, 2 * SIZE(C2)
+#endif
+
+#ifdef RT
+ sll K, ZBASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, ZBASE_SHIFT + 0, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 1, KK
+#endif
+
+#ifdef LN
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L20:
+ sra M, 1, I
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+ fclr c01
+ fclr c05
+
+ ble I, $L29
+ .align 4
+
+$L11:
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(B)
+ fclr c10
+ LD b2, 1 * SIZE(B)
+ fclr c14
+
+ LD b3, 2 * SIZE(B)
+ fclr c03
+ LD b4, 3 * SIZE(B)
+ fclr c07
+
+ ldi BO, 4 * SIZE(B)
+ fclr c11
+ ldi AO, 4 * SIZE(AO)
+ fclr c15
+
+ fillcs 4 * SIZE(C1)
+ fclr c04
+ ldi L, -2(KK)
+ fclr c08
+
+ fillcs 4 * SIZE(C2)
+ fclr c12
+ fclr c16
+ ble KK, $L18
+ ble L, $L15
+#else
+#ifdef LN
+ sll K, ZBASE_SHIFT + 1, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, ZBASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AO
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(BO)
+ fclr c10
+ LD b2, 1 * SIZE(BO)
+ fclr c14
+
+ LD b3, 2 * SIZE(BO)
+ fclr c03
+ LD b4, 3 * SIZE(BO)
+ fclr c07
+
+ ldi BO, 4 * SIZE(BO)
+ fclr c11
+ ldi AO, 4 * SIZE(AO)
+ fclr c15
+
+ fillcs 4 * SIZE(C1)
+ fclr c04
+ ldi L, -2(TMP1)
+ fclr c08
+
+ fillcs 4 * SIZE(C2)
+ fclr c12
+ fclr c16
+ ble TMP1, $L18
+ ble L, $L15
+#endif
+ .align 5
+
+$L12:
+/* 1 */
+ ADD1 c11, t1, b5
+ fmov b5, c11
+#ifndef EV4
+ fillcs PREFETCHSIZE * SIZE(AO)
+#else
+ unop
+#endif
+ MUL b1, a1, t1
+#ifndef EV4
+ fillcs PREFETCHSIZE * SIZE(BO)
+#else
+ unop
+#endif
+
+ ADD3 c12, t2, b5
+ fmov b5, c12
+// unop
+ MUL b1, a2, t2
+ unop
+
+ ADD2 c16, t3, b5
+ fmov b5, c16
+// unop
+ MUL b2, a2, t3
+ LD a5, 0 * SIZE(AO)
+
+ ADD4 c15, t4, b5
+ fmov b5, c15
+// unop
+ MUL b2, a1, t4
+ LD b5, 0 * SIZE(BO)
+ FIMOVD b5, tmp
+
+/* 2 */
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ UNOP
+ MUL b1, a3, t1
+ UNOP
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ UNOP
+ MUL b1, a4, t2
+ UNOP
+
+ ADD2 c06, t3, b5
+ fmov b5, c06
+// unop
+ MUL b2, a4, t3
+ unop
+
+ ADD4 c05, t4, b5
+ fmov b5, c05
+// unop
+ MUL b4, a1, t4
+ unop
+
+/* 3 */
+ ADD1 c03, t1, b5
+ fmov b5, c03
+// unop
+ MUL b3, a1, t1
+ unop
+
+ ADD3 c04, t2, b5
+ fmov b5, c04
+// unop
+ MUL b3, a2, t2
+ unop
+
+ ADD2 c08, t3, b5
+ fmov b5, c08
+// unop
+ MUL b4, a2, t3
+ LD a2, 1 * SIZE(AO)
+
+ ADD4 c13, t4, b5
+ fmov b5, c13
+// unop
+ MUL b2, a3, t4
+ LD b2, 1 * SIZE(BO)
+
+/* 4 */
+ ADD1 c09, t1, b5
+ fmov b5, c09
+// unop
+ MUL b3, a3, t1
+ LD a6, 2 * SIZE(AO)
+
+ ADD3 c10, t2, b5
+ fmov b5, c10
+// unop
+ MUL b3, a4, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD2 c14, t3, b5
+ fmov b5, c14
+// unop
+ MUL b4, a4, t3
+ LD a4, 3 * SIZE(AO)
+
+ ADD4 c07, t4, b5
+ fmov b5, c07
+// unop
+ MUL b4, a3, t4
+ LD b4, 3 * SIZE(BO)
+
+/* 5 */
+ ADD1 c11, t1, b5
+ fmov b5, c11
+// unop
+ IFMOVD tmp, b5
+ MUL b5, a5, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD3 c12, t2, b5
+ fmov b5, c12
+ ldi L, -2(L)
+ IFMOVD tmp, b5
+ MUL b5, a2, t2
+ LD b1, 4 * SIZE(BO)
+
+ ADD2 c16, t3, b5
+ fmov b5, c16
+// unop
+ MUL b2, a2, t3
+ unop
+
+ ADD4 c15, t4, b5
+ fmov b5, c15
+// unop
+ MUL b2, a5, t4
+ unop
+
+/* 6 */
+ ADD1 c01, t1, b5
+ fmov b5, c01
+// unop
+ IFMOVD tmp, b5
+ MUL b5, a6, t1
+ unop
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+// unop
+ IFMOVD tmp, b5
+ MUL b5, a4, t2
+ unop
+
+ ADD2 c06, t3, b5
+ fmov b5, c06
+// unop
+ MUL b2, a4, t3
+ unop
+
+ ADD4 c05, t4, b5
+ fmov b5, c05
+// unop
+ MUL b4, a5, t4
+ unop
+
+/* 7 */
+ ADD1 c03, t1, b5
+ fmov b5, c03
+ ldi AO, 8 * SIZE(AO)
+ MUL b3, a5, t1
+// unop
+
+ ADD3 c04, t2, b5
+ fmov b5, c04
+ ldi BO, 8 * SIZE(BO)
+ MUL b3, a2, t2
+// unop
+
+ ADD2 c08, t3, b5
+ fmov b5, c08
+// unop
+ MUL b4, a2, t3
+ LD a2, -3 * SIZE(AO)
+
+ ADD4 c13, t4, b5
+ fmov b5, c13
+// unop
+ MUL b2, a6, t4
+ LD b2, -3 * SIZE(BO)
+
+/* 8 */
+ ADD1 c09, t1, b5
+ fmov b5, c09
+// unop
+ MUL b3, a6, t1
+ LD a3, -2 * SIZE(AO)
+
+ ADD3 c10, t2, b5
+ fmov b5, c10
+// unop
+ MUL b3, a4, t2
+ LD b3, -2 * SIZE(BO)
+
+ ADD2 c14, t3, b5
+ fmov b5, c14
+// unop
+ MUL b4, a4, t3
+ LD a4, -1 * SIZE(AO)
+
+ ADD4 c07, t4, b5
+ fmov b5, c07
+ MUL b4, a6, t4
+ LD b4, -1 * SIZE(BO)
+ bgt L, $L12
+ .align 4
+
+$L15:
+ ADD1 c11, t1, b5
+ fmov b5, c11
+// unop
+ MUL b1, a1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L17
+#else
+ blbs TMP1, $L17
+#endif
+ .align 4
+
+ ADD3 c12, t2, b5
+ fmov b5, c12
+ MUL b1, a2, t2
+ ADD2 c16, t3, b5
+ fmov b5, c16
+ MUL b2, a2, t3
+
+ ADD4 c15, t4, b5
+ fmov b5, c15
+ MUL b2, a1, t4
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ MUL b1, a3, t1
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+// unop
+ MUL b1, a4, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD2 c06, t3, b5
+ fmov b5, c06
+ MUL b2, a4, t3
+ ADD4 c05, t4, b5
+ fmov b5, c05
+ MUL b4, a1, t4
+
+ ADD1 c03, t1, b5
+ fmov b5, c03
+// unop
+ MUL b3, a1, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD3 c04, t2, b5
+ fmov b5, c04
+// unop
+ MUL b3, a2, t2
+ unop
+
+ ADD2 c08, t3, b5
+ fmov b5, c08
+// unop
+ MUL b4, a2, t3
+ LD a2, 1 * SIZE(AO)
+
+ ADD4 c13, t4, b5
+ fmov b5, c13
+// unop
+ MUL b2, a3, t4
+ LD b2, 1 * SIZE(BO)
+
+ ADD1 c09, t1, b5
+ fmov b5, c09
+// unop
+ MUL b3, a3, t1
+ ldi AO, 4 * SIZE(AO)
+
+ ADD3 c10, t2, b5
+ fmov b5, c10
+// unop
+ MUL b3, a4, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD2 c14, t3, b5
+ fmov b5, c14
+// unop
+ MUL b4, a4, t3
+ LD a4, -1 * SIZE(AO)
+
+ ADD4 c07, t4, b5
+ fmov b5, c07
+// unop
+ MUL b4, a3, t4
+ LD a3, -2 * SIZE(AO)
+
+ ADD1 c11, t1, b5
+ fmov b5, c11
+ LD b4, 3 * SIZE(BO)
+ MUL b1, a1, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L17:
+ ADD3 c12, t2, b5
+ fmov b5, c12
+ MUL b1, a2, t2
+ ADD2 c16, t3, b5
+ fmov b5, c16
+ MUL b2, a2, t3
+
+ ADD4 c15, t4, b5
+ fmov b5, c15
+ MUL b2, a1, t4
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ MUL b1, a3, t1
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ MUL b1, a4, t2
+ ADD2 c06, t3, b5
+ fmov b5, c06
+ MUL b2, a4, t3
+
+ ADD4 c05, t4, b5
+ fmov b5, c05
+ MUL b4, a1, t4
+ ADD1 c03, t1, b5
+ fmov b5, c03
+ MUL b3, a1, t1
+
+ ADD3 c04, t2, b5
+ fmov b5, c04
+ MUL b3, a2, t2
+ ADD2 c08, t3, b5
+ fmov b5, c08
+ MUL b4, a2, t3
+
+ ADD4 c13, t4, b5
+ fmov b5, c13
+ MUL b2, a3, t4
+ ADD1 c09, t1,b5
+ fmov b5, c09
+ MUL b3, a3, t1
+
+ ADD3 c10, t2, b5
+ fmov b5, c10
+ MUL b3, a4, t2
+ ADD2 c14, t3, b5
+ fmov b5, c14
+ MUL b4, a4, t3
+
+ ADD4 c07, t4, b5
+ fmov b5, c07
+ ldi AO, 4 * SIZE(AO)
+ MUL b4, a3, t4
+ ldi BO, 4 * SIZE(BO)
+
+ ADD1 c11, t1, b5
+ fmov b5, c11
+ ADD3 c12, t2, b5
+ fmov b5, c12
+ ADD2 c16, t3, b5
+ fmov b5, c16
+ ADD4 c15, t4, b5
+ fmov b5, c15
+
+ ADD c01, c06, b5
+ fmov b5, c01
+ ADD c02, c05, b5
+ fmov b5, c02
+ ADD c03, c08, b5
+ fmov b5, c03
+ ADD c04, c07, b5
+ fmov b5, c04
+
+ ADD c09, c14, b5
+ fmov b5, c09
+ ADD c10, c13, b5
+ fmov b5, c10
+ ADD c11, c16, b5
+ fmov b5, c11
+ ADD c12, c15, b5
+ fmov b5, c12
+ .align 4
+
+$L18:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 2, TMP1
+#else
+ subl KK, 2, TMP1
+#endif
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -4 * SIZE(AO)
+ ldi BO, -4 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ LD b1, 4 * SIZE(BO)
+ LD b2, 5 * SIZE(BO)
+ LD b3, 6 * SIZE(BO)
+ LD b4, 7 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+ SUB a3, c09, b5
+ fmov b5, c09
+ SUB a4, c10, b5
+ fmov b5, c10
+
+ SUB b1, c03, b5
+ fmov b5, c03
+ SUB b2, c04, b5
+ fmov b5, c04
+ SUB b3, c11, b5
+ fmov b5, c11
+ SUB b4, c12, b5
+ fmov b5, c12
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 4 * SIZE(AO)
+ LD b2, 5 * SIZE(AO)
+ LD b3, 6 * SIZE(AO)
+ LD b4, 7 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+ SUB a3, c03, b5
+ fmov b5, c03
+ SUB a4, c04, b5
+ fmov b5, c04
+
+ SUB b1, c09, b5
+ fmov b5, c09
+ SUB b2, c10, b5
+ fmov b5, c10
+ SUB b3, c11, b5
+ fmov b5, c11
+ SUB b4, c12, b5
+ fmov b5, c12
+#endif
+
+#ifdef LN
+ LD a1, 6 * SIZE(AO)
+ LD a2, 7 * SIZE(AO)
+ LD a3, 4 * SIZE(AO)
+ LD a4, 5 * SIZE(AO)
+
+ MUL a2, c04, t1
+ MUL a2, c03, t2
+ MUL a2, c12, t3
+ MUL a2, c11, t4
+
+ MUL a1, c03, b5
+ fmov b5, c03
+ MUL a1, c04, b5
+ fmov b5, c04
+ MUL a1, c11, b5
+ fmov b5, c11
+ MUL a1, c12, b5
+ fmov b5, c12
+
+ ADD5 c03, t1, b5
+ fmov b5, c03
+ ADD6 c04, t2, b5
+ fmov b5, c04
+ ADD5 c11, t3, b5
+ fmov b5, c11
+ ADD6 c12, t4, b5
+ fmov b5, c12
+
+ MUL a3, c03, t1
+ MUL a3, c04, t2
+ MUL a3, c11, t3
+ MUL a3, c12, t4
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c02, t2, b5
+ fmov b5, c02
+ SUB c09, t3, b5
+ fmov b5, c09
+ SUB c10, t4, b5
+ fmov b5, c10
+
+ MUL a4, c04, t1
+ MUL a4, c03, t2
+ MUL a4, c12, t3
+ MUL a4, c11, t4
+
+ ADD6 c01, t1, b5
+ fmov b5, c01
+ ADD5 c02, t2, b5
+ fmov b5, c02
+ ADD6 c09, t3, b5
+ fmov b5, c09
+ ADD5 c10, t4, b5
+ fmov b5, c10
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a2, c10, t3
+ MUL a2, c09, t4
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a1, c09, b5
+ fmov b5, c09
+ MUL a1, c10, b5
+ fmov b5, c10
+
+ ADD5 c01, t1, b5
+ fmov b5, c01
+ ADD6 c02, t2, b5
+ fmov b5, c02
+ ADD5 c09, t3, b5
+ fmov b5, c09
+ ADD6 c10, t4, b5
+ fmov b5, c10
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a2, c10, t3
+ MUL a2, c09, t4
+
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+ MUL a1, c09, c09
+ MUL a1, c10, c10
+
+ ADD5 c01, t1, b5
+ fmov b5, c01
+ ADD6 c02, t2, b5
+ fmov b5, c02
+ ADD5 c09, t3, b5
+ fmov b5, c09
+ ADD6 c10, t4, b5
+ fmov b5, c10
+
+ MUL a3, c01, t1
+ MUL a3, c02, t2
+ MUL a3, c09, t3
+ MUL a3, c10, t4
+
+ SUB c03, t1, b5
+ fmov b5, c03
+ SUB c04, t2, b5
+ fmov b5, c04
+ SUB c11, t3, b5
+ fmov b5, c11
+ SUB c12, t4, b5
+ fmov b5, c12
+
+ MUL a4, c02, t1
+ MUL a4, c01, t2
+ MUL a4, c10, t3
+ MUL a4, c09, t4
+
+ ADD6 c03, t1, b5
+ fmov b5, c03
+ ADD5 c04, t2, b5
+ fmov b5, c04
+ ADD6 c11, t3, b5
+ fmov b5, c11
+ ADD5 c12, t4, b5
+ fmov b5, c12
+
+ LD a1, 6 * SIZE(AO)
+ LD a2, 7 * SIZE(AO)
+
+ MUL a2, c04, t1
+ MUL a2, c03, t2
+ MUL a2, c12, t3
+ MUL a2, c11, t4
+
+ MUL a1, c03, c03
+ MUL a1, c04, c04
+ MUL a1, c11, c11
+ MUL a1, c12, c12
+
+ ADD5 c03, t1, b5
+ fmov b5, c03
+ ADD6 c04, t2, b5
+ fmov b5, c04
+ ADD5 c11, t3, b5
+ fmov b5, c11
+ ADD6 c12, t4, b5
+ fmov b5, c12
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a2, c04, t3
+ MUL a2, c03, t4
+
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+ MUL a1, c03, c03
+ MUL a1, c04, c04
+
+ ADD5 c01, t1, b5
+ fmov b5, c01
+ ADD6 c02, t2, b5
+ fmov b5, c02
+ ADD5 c03, t3, b5
+ fmov b5, c03
+ ADD6 c04, t4, b5
+ fmov b5, c04
+
+ MUL a3, c01, t1
+ MUL a3, c02, t2
+ MUL a3, c03, t3
+ MUL a3, c04, t4
+
+ SUB c09, t1, b5
+ fmov b5, c09
+ SUB c10, t2, b5
+ fmov b5, c10
+ SUB c11, t3, b5
+ fmov b5, c11
+ SUB c12, t4, b5
+ fmov b5, c12
+
+ MUL a4, c02, t1
+ MUL a4, c01, t2
+ MUL a4, c04, t3
+ MUL a4, c03, t4
+
+ ADD6 c09, t1, b5
+ fmov b5, c09
+ ADD5 c10, t2, b5
+ fmov b5, c10
+ ADD6 c11, t3, b5
+ fmov b5, c11
+ ADD5 c12, t4, b5
+ fmov b5, c12
+
+ LD a1, 6 * SIZE(BO)
+ LD a2, 7 * SIZE(BO)
+
+ MUL a2, c10, t1
+ MUL a2, c09, t2
+ MUL a2, c12, t3
+ MUL a2, c11, t4
+
+ MUL a1, c09, c09
+ MUL a1, c10, c10
+ MUL a1, c11, c11
+ MUL a1, c12, c12
+
+ ADD5 c09, t1, b5
+ fmov b5, c09
+ ADD6 c10, t2, b5
+ fmov b5, c10
+ ADD5 c11, t3, b5
+ fmov b5, c11
+ ADD6 c12, t4, b5
+ fmov b5, c12
+#endif
+
+#ifdef RT
+ LD a1, 6 * SIZE(BO)
+ LD a2, 7 * SIZE(BO)
+ LD a3, 4 * SIZE(BO)
+ LD a4, 5 * SIZE(BO)
+
+ MUL a2, c10, t1
+ MUL a2, c09, t2
+ MUL a2, c12, t3
+ MUL a2, c11, t4
+
+ MUL a1, c09, c09
+ MUL a1, c10, c10
+ MUL a1, c11, c11
+ MUL a1, c12, c12
+
+ ADD5 c09, t1, b5
+ fmov b5, c09
+ ADD6 c10, t2, b5
+ fmov b5, c10
+ ADD5 c11, t3, b5
+ fmov b5, c11
+ ADD6 c12, t4, b5
+ fmov b5, c12
+
+ MUL a3, c09, t1
+ MUL a3, c10, t2
+ MUL a3, c11, t3
+ MUL a3, c12, t4
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c02, t2, b5
+ fmov b5, c02
+ SUB c03, t3, b5
+ fmov b5, c03
+ SUB c04, t4, b5
+ fmov b5, c04
+
+ MUL a4, c10, t1
+ MUL a4, c09, t2
+ MUL a4, c12, t3
+ MUL a4, c11, t4
+
+ ADD6 c01, t1, b5
+ fmov b5, c01
+ ADD5 c02, t2, b5
+ fmov b5, c02
+ ADD6 c03, t3, b5
+ fmov b5, c03
+ ADD5 c04, t4, b5
+ fmov b5, c04
+
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a2, c04, t3
+ MUL a2, c03, t4
+
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+ MUL a1, c03, c03
+ MUL a1, c04, c04
+
+ ADD5 c01, t1, b5
+ fmov b5, c01
+ ADD6 c02, t2, b5
+ fmov b5, c02
+ ADD5 c03, t3, b5
+ fmov b5, c03
+ ADD6 c04, t4, b5
+ fmov b5, c04
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c02, 1 * SIZE(BO)
+ ST c09, 2 * SIZE(BO)
+ ST c10, 3 * SIZE(BO)
+
+ ST c03, 4 * SIZE(BO)
+ ST c04, 5 * SIZE(BO)
+ ST c11, 6 * SIZE(BO)
+ ST c12, 7 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c03, 2 * SIZE(AO)
+ ST c04, 3 * SIZE(AO)
+
+ ST c09, 4 * SIZE(AO)
+ ST c10, 5 * SIZE(AO)
+ ST c11, 6 * SIZE(AO)
+ ST c12, 7 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -4 * SIZE(C1)
+ ldi C2, -4 * SIZE(C2)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c03, 2 * SIZE(C1)
+ ST c04, 3 * SIZE(C1)
+
+ ST c09, 0 * SIZE(C2)
+ ST c10, 1 * SIZE(C2)
+ ST c11, 2 * SIZE(C2)
+ ST c12, 3 * SIZE(C2)
+
+#ifndef LN
+ ldi C1, 4 * SIZE(C1)
+ ldi C2, 4 * SIZE(C2)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, ZBASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, ZBASE_SHIFT + 1, TMP1
+ addl AO, TMP1, AO
+ addl BO, TMP1, BO
+#endif
+
+#ifdef LT
+ addl KK, 2, KK
+#endif
+
+#ifdef LN
+ subl KK, 2, KK
+#endif
+ fclr c01
+ fclr c05
+
+ ldi I, -1(I)
+ bgt I, $L11
+ .align 4
+
+$L29:
+#ifdef LN
+ sll K, ZBASE_SHIFT + 1, TMP1
+ addl B, TMP1, B
+#endif
+
+#if defined(LT) || defined(RN)
+ mov BO, B
+#endif
+
+#ifdef RN
+ addl KK, 2, KK
+#endif
+
+#ifdef RT
+ subl KK, 2, KK
+#endif
+
+ ldi J, -1(J)
+ bgt J, $L01
+ .align 4
+
+$L30:
+ and N, 1, J
+ ble J, $L999
+
+#ifdef RT
+ sll K, ZBASE_SHIFT, TMP1
+ subl B, TMP1, B
+
+ subl C, LDC, C1
+ subl C, LDC, C
+#else
+ mov C, C1
+ addl C, LDC, C
+#endif
+
+#ifdef LN
+ addl M, OFFSET, KK
+#endif
+
+#ifdef LT
+ mov OFFSET, KK
+#endif
+
+#if defined(LN) || defined(RT)
+ mov A, AORIG
+#else
+ mov A, AO
+#endif
+
+ and M, 1, I
+ ble I, $L50
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c05
+
+ LD b3, 2 * SIZE(B)
+ fclr c02
+ LD b4, 3 * SIZE(B)
+ fclr c06
+
+ ldi AO, 2 * SIZE(AO)
+ ldi BO, 2 * SIZE(B)
+
+ ldi L, -2(KK)
+
+ ble KK, $L58
+ ble L, $L55
+#else
+#ifdef LN
+ sll K, ZBASE_SHIFT, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, ZBASE_SHIFT, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, ZBASE_SHIFT, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c05
+
+ LD b3, 2 * SIZE(BO)
+ fclr c02
+ LD b4, 3 * SIZE(BO)
+ fclr c06
+
+ ldi AO, 2 * SIZE(AO)
+ ldi BO, 2 * SIZE(BO)
+
+ ldi L, -2(TMP1)
+
+ ble TMP1, $L58
+ ble L, $L55
+#endif
+ .align 5
+
+$L52:
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b1, t1
+ unop
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ ldi AO, 4 * SIZE(AO)
+ MUL a2, b1, t2
+ LD b1, 2 * SIZE(BO)
+
+ ADD4 c05, t3, b5
+ fmov b5, c05
+ ldi L, -2(L)
+ MUL a1, b2, t3
+ LD a1, -2 * SIZE(AO)
+
+ ADD2 c06, t4, b5
+ fmov b5, c06
+// unop
+ MUL a2, b2, t4
+ LD a2, -1 * SIZE(AO)
+
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ LD b2, 3 * SIZE(BO)
+ MUL a3, b3, t1
+ ldi BO, 4 * SIZE(BO)
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+// unop
+ MUL a4, b3, t2
+ LD b3, 0 * SIZE(BO)
+
+ ADD4 c05, t3, b5
+ fmov b5, c05
+// unop
+ MUL a3, b4, t3
+ LD a3, 0 * SIZE(AO)
+
+ ADD2 c06, t4, b5
+ fmov b5, c06
+ MUL a4, b4, t4
+ LD b4, 1 * SIZE(BO)
+// unop
+
+ LD a4, 1 * SIZE(AO)
+ unop
+ unop
+ bgt L, $L52
+ .align 4
+
+$L55:
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ MUL a1, b1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L57
+#else
+ blbs TMP1, $L57
+#endif
+ .align 4
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+// unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD4 c05, t3, b5
+ fmov b5, c05
+ ldi BO, 2 * SIZE(BO)
+ MUL a1, b2, t3
+ LD a1, 0 * SIZE(AO)
+
+ ADD2 c06, t4, b5
+ fmov b5, c06
+// unop
+ MUL a2, b2, t4
+ LD a2, 1 * SIZE(AO)
+
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ LD b2, -1 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi AO, 2 * SIZE(AO)
+ .align 4
+
+$L57:
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ MUL a2, b1, t2
+ ADD4 c05, t3, b5
+ fmov b5, c05
+ MUL a1, b2, t3
+
+ ADD2 c06, t4, b5
+ fmov b5, c06
+ ldi AO, 2 * SIZE(AO)
+ MUL a2, b2, t4
+ ldi BO, 2 * SIZE(BO)
+
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ ADD4 c05, t3, b5
+ fmov b5, c05
+ ADD2 c06, t4, b5
+ fmov b5, c06
+
+ ADD c01, c06, b5
+ fmov b5, c01
+ ADD c02, c05, b5
+ fmov b5, c02
+
+$L58:
+#if defined(LN) || defined(RT)
+ subl KK, 1, TMP1
+
+ sll TMP1, ZBASE_SHIFT, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, ZBASE_SHIFT, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -2 * SIZE(AO)
+ ldi BO, -2 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+
+ ADD5 c01, t1, b5
+ fmov b5, c01
+ ADD6 c02, t2, b5
+ fmov b5, c02
+#endif
+
+#if defined(RN) || defined(RT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+
+ ADD5 c01, t1, b5
+ fmov b5, c01
+ ADD6 c02, t2, b5
+ fmov b5, c02
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c02, 1 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -2 * SIZE(C1)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+
+#ifndef LN
+ ldi C1, 2 * SIZE(C1)
+#endif
+
+#ifdef RT
+ sll K, ZBASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, ZBASE_SHIFT, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, ZBASE_SHIFT, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 1, KK
+#endif
+
+#ifdef LN
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L50:
+ sra M, 1, I
+ ble I, $L59
+ .align 4
+
+$L41:
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c05
+ LD b3, 2 * SIZE(B)
+ fclr c02
+ LD b4, 3 * SIZE(B)
+ fclr c06
+
+ ldi BO, 2 * SIZE(B)
+ fclr c03
+ ldi AO, 4 * SIZE(AO)
+ fclr c07
+
+ ldi L, -2(KK)
+ fclr c04
+ fclr c08
+
+ ble KK, $L48
+ ble L, $L45
+#else
+#ifdef LN
+ sll K, ZBASE_SHIFT + 1, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, ZBASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, ZBASE_SHIFT, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c05
+ LD b3, 2 * SIZE(BO)
+ fclr c02
+ LD b4, 3 * SIZE(BO)
+ fclr c06
+
+ ldi BO, 2 * SIZE(BO)
+ fclr c03
+ ldi AO, 4 * SIZE(AO)
+ fclr c07
+
+ ldi L, -2(TMP1)
+ fclr c04
+ fclr c08
+
+ ble TMP1, $L48
+ ble L, $L45
+#endif
+ .align 5
+
+$L42:
+ ADD4 c05, t1, b5
+ fmov b5, c05
+// unop
+ MUL a1, b1, t1
+ unop
+
+ ADD2 c06, t2, b5
+ fmov b5, c06
+ ldi L, -2(L)
+ MUL a2, b1, t2
+// unop
+
+ ADD4 c07, t3, b5
+ fmov b5, c07
+// unop
+ MUL a3, b1, t3
+ unop
+
+ ADD2 c08, t4, b5
+ fmov b5, c08
+// unop
+ MUL a4, b1, t4
+ LD b1, 2 * SIZE(BO)
+
+ ADD1 c01, t1, b5
+ fmov b5, c01
+// unop
+ MUL a1, b2, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ ldi BO, 4 * SIZE(BO)
+ MUL a2, b2, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD1 c03, t3, b5
+ fmov b5, c03
+// unop
+ MUL a3, b2, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD3 c04, t4, b5
+ fmov b5, c04
+// unop
+ MUL a4, b2, t4
+ LD a5, 3 * SIZE(AO)
+
+ ADD4 c05, t1, b5
+ fmov b5, c05
+// unop
+ MUL a1, b3, t1
+ LD b2, -1 * SIZE(BO)
+
+ ADD2 c06, t2, b5
+ fmov b5, c06
+// unop
+ MUL a2, b3, t2
+ unop
+
+ ADD4 c07, t3, b5
+ fmov b5, c07
+// unop
+ MUL a3, b3, t3
+ ldi AO, 8 * SIZE(AO)
+
+ ADD2 c08, t4, b5
+ fmov b5, c08
+// unop
+ MUL a5, b3, t4
+ LD b3, 0 * SIZE(BO)
+
+ ADD1 c01, t1, b5
+ fmov b5, c01
+// unop
+ MUL a1, b4, t1
+ LD a1, -4 * SIZE(AO)
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+// unop
+ MUL a2, b4, t2
+ LD a2, -3 * SIZE(AO)
+
+ ADD1 c03, t3, b5
+ fmov b5, c03
+ LD a4, -1 * SIZE(AO)
+ MUL a3, b4, t3
+ LD a3, -2 * SIZE(AO)
+
+ ADD3 c04, t4, b5
+ fmov b5, c04
+ MUL a5, b4, t4
+ LD b4, 1 * SIZE(BO)
+ bgt L, $L42
+ .align 4
+
+$L45:
+ ADD4 c05, t1, b5
+ fmov b5, c05
+ MUL b1, a1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L47
+#else
+ blbs TMP1, $L47
+#endif
+ .align 4
+
+ ADD2 c06, t2, b5
+ fmov b5, c06
+ MUL a2, b1, t2
+ ADD4 c07, t3, b5
+ fmov b5, c07
+ MUL a3, b1, t3
+
+ ADD2 c08, t4, b5
+ fmov b5, c08
+// unop
+ MUL a4, b1, t4
+ LD b1, 0 * SIZE(BO)
+
+ ADD1 c01, t1, b5
+ fmov b5, c01
+// unop
+ MUL a1, b2, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+// unop
+ MUL a2, b2, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD1 c03, t3, b5
+ fmov b5, c03
+// unop
+ MUL a3, b2, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD3 c04, t4, b5
+ fmov b5, c04
+ MUL a4, b2, t4
+ LD a4, 3 * SIZE(AO)
+ ldi AO, 4 * SIZE(AO)
+
+ ADD4 c05, t1, b5
+ fmov b5, c05
+ LD b2, 1 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi BO, 2 * SIZE(BO)
+ .align 4
+
+$L47:
+ ADD2 c06, t2, b5
+ fmov b5, c06
+ MUL a2, b1, t2
+ ADD4 c07, t3, b5
+ fmov b5, c07
+ MUL a3, b1, t3
+
+ ADD2 c08, t4, b5
+ fmov b5, c08
+ MUL a4, b1, t4
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ MUL a1, b2, t1
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ MUL a2, b2, t2
+ ADD1 c03, t3, b5
+ fmov b5, c03
+ MUL a3, b2, t3
+
+ ADD3 c04, t4, b5
+ fmov b5, c04
+ ldi AO, 4 * SIZE(AO)
+ MUL a4, b2, t4
+ ldi BO, 2 * SIZE(BO)
+
+ ADD4 c05, t1, b5
+ fmov b5, c05
+ ADD2 c06, t2, b5
+ fmov b5, c06
+ ADD4 c07, t3, b5
+ fmov b5, c07
+ ADD2 c08, t4, b5
+ fmov b5, c08
+
+ ADD c01, c06, b5
+ fmov b5, c01
+ ADD c02, c05, b5
+ fmov b5, c02
+ ADD c03, c08, b5
+ fmov b5, c03
+ ADD c04, c07, b5
+ fmov b5, c04
+
+$L48:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 2, TMP1
+#else
+ subl KK, 1, TMP1
+#endif
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, ZBASE_SHIFT, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -4 * SIZE(AO)
+ ldi BO, -2 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+ SUB a3, c03, b5
+ fmov b5, c03
+ SUB a4, c04, b5
+ fmov b5, c04
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+ SUB a3, c03, b5
+ fmov b5, c03
+ SUB a4, c04, b5
+ fmov b5, c04
+#endif
+
+#ifdef LN
+ LD a1, 6 * SIZE(AO)
+ LD a2, 7 * SIZE(AO)
+ LD a3, 4 * SIZE(AO)
+ LD a4, 5 * SIZE(AO)
+
+ MUL a2, c04, t1
+ MUL a2, c03, t2
+ MUL a1, c03, b5
+ fmov b5, c03
+ MUL a1, c04, b5
+ fmov b5, c04
+
+ ADD5 c03, t1, b5
+ fmov b5, c03
+ ADD6 c04, t2, b5
+ fmov b5, c04
+ MUL a3, c03, t1
+ MUL a3, c04, t2
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c02, t2, b5
+ fmov b5, c02
+ MUL a4, c04, t1
+ MUL a4, c03, t2
+
+ ADD6 c01, t1, b5
+ fmov b5, c01
+ ADD5 c02, t2, b5
+ fmov b5, c02
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+
+ ADD5 c01, t1, b5
+ fmov b5, c01
+ ADD6 c02, t2, b5
+ fmov b5, c02
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+
+ ADD5 c01, t1, b5
+ fmov b5, c01
+ ADD6 c02, t2, b5
+ fmov b5, c02
+ MUL a3, c01, t1
+ MUL a3, c02, t2
+
+ SUB c03, t1, b5
+ fmov b5, c03
+ SUB c04, t2, b5
+ fmov b5, c04
+
+ MUL a4, c02, t1
+ MUL a4, c01, t2
+ ADD6 c03, t1, b5
+ fmov b5, c03
+ ADD5 c04, t2, b5
+ fmov b5, c04
+
+ LD a1, 6 * SIZE(AO)
+ LD a2, 7 * SIZE(AO)
+
+ MUL a2, c04, t1
+ MUL a2, c03, t2
+ MUL a1, c03, c03
+ MUL a1, c04, c04
+
+ ADD5 c03, t1, b5
+ fmov b5, c03
+ ADD6 c04, t2, b5
+ fmov b5, c04
+#endif
+
+#if defined(RN) || defined(RT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a2, c04, t3
+ MUL a2, c03, t4
+
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+ MUL a1, c03, c03
+ MUL a1, c04, c04
+
+ ADD5 c01, t1, b5
+ fmov b5, c01
+ ADD6 c02, t2, b5
+ fmov b5, c02
+ ADD5 c03, t3, b5
+ fmov b5, c03
+ ADD6 c04, t4, b5
+ fmov b5, c04
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c02, 1 * SIZE(BO)
+ ST c03, 2 * SIZE(BO)
+ ST c04, 3 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c03, 2 * SIZE(AO)
+ ST c04, 3 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -4 * SIZE(C1)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c03, 2 * SIZE(C1)
+ ST c04, 3 * SIZE(C1)
+
+#ifndef LN
+ ldi C1, 4 * SIZE(C1)
+#endif
+
+#ifdef RT
+ sll K, ZBASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, ZBASE_SHIFT, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 2, KK
+#endif
+
+#ifdef LN
+ subl KK, 2, KK
+#endif
+
+ ldi I, -1(I)
+ bgt I, $L41
+ .align 4
+
+$L59:
+#ifdef LN
+ sll K, ZBASE_SHIFT, TMP1
+ addl B, TMP1, B
+#endif
+
+#if defined(LT) || defined(RN)
+ mov BO, B
+#endif
+
+#ifdef RN
+ addl KK, 1, KK
+#endif
+
+#ifdef RT
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L999:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+ ldl tmp, 72($sp)
+ clr $0
+ ldi $sp, STACKSIZE($sp)
+ ret
+ .ident VERSION
+ .end CNAME
diff --git a/kernel/sw_64/ztrsm_kernel_2x2_LT.S b/kernel/sw_64/ztrsm_kernel_2x2_LT.S
new file mode 100644
index 0000000..bb38b56
--- /dev/null
+++ b/kernel/sw_64/ztrsm_kernel_2x2_LT.S
@@ -0,0 +1,2624 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#if !defined(EV4) && !defined(EV5) && !defined(SW6)
+#error "Architecture is not specified."
+#endif
+
+#ifdef SW6
+#define PREFETCHSIZE 56
+#define UNOP unop
+#endif
+
+#ifdef EV5
+#define PREFETCHSIZE 48
+#define UNOP
+#endif
+
+#ifdef EV4
+#define UNOP
+#endif
+
+ .set noat
+ .set noreorder
+ .arch sw6a
+
+.text
+ .align 5
+ .globl CNAME
+ .ent CNAME
+
+#define STACKSIZE 88
+
+#define M $16
+#define N $17
+#define K $18
+#define A $21
+#define B $22
+#define C $20
+#define LDC $23
+
+#define C1 $19
+#define C2 $24
+
+#define AO $at
+#define BO $5
+#define I $6
+#define J $7
+#define L $8
+
+#define tmp $9
+
+#define a1 $f16
+#define a2 $f17
+#define a3 $f18
+#define a4 $f19
+
+#define b1 $f20
+#define b2 $f21
+#define b3 $f22
+#define b4 $f23
+
+#define t1 $f24
+#define t2 $f25
+#define t3 $f26
+#define t4 $f27
+
+#define a5 $f28
+#define a6 $f30
+#define b5 $f29
+
+#define alpha_i $f29
+#define alpha_r $f30
+
+#define c01 $f0
+#define c02 $f1
+#define c03 $f2
+#define c04 $f3
+
+#define c05 $f4
+#define c06 $f5
+#define c07 $f6
+#define c08 $f7
+
+#define c09 $f8
+#define c10 $f9
+#define c11 $f10
+#define c12 $f11
+
+#define c13 $f12
+#define c14 $f13
+#define c15 $f14
+#define c16 $f15
+
+#define TMP1 $0
+#define TMP2 $1
+#define KK $2
+#define AORIG $3
+#define OFFSET $4
+
+#if defined(LN) || defined(LT)
+#ifndef CONJ
+#define ADD1 ADD
+#define ADD2 SUB
+#define ADD3 ADD
+#define ADD4 ADD
+#define ADD5 SUB
+#define ADD6 ADD
+#else
+#define ADD1 ADD
+#define ADD2 ADD
+#define ADD3 SUB
+#define ADD4 ADD
+#define ADD5 ADD
+#define ADD6 SUB
+#endif
+#else
+#ifndef CONJ
+#define ADD1 ADD
+#define ADD2 SUB
+#define ADD3 ADD
+#define ADD4 ADD
+#define ADD5 SUB
+#define ADD6 ADD
+#else
+#define ADD1 ADD
+#define ADD2 ADD
+#define ADD3 ADD
+#define ADD4 SUB
+#define ADD5 ADD
+#define ADD6 SUB
+#endif
+#endif
+
+
+CNAME:
+ .frame $sp, STACKSIZE, $26, 0
+
+#ifdef PROFILE
+ ldgp $gp, 0($27)
+ ldi $at, _mcount
+ jsr $at, ($at), _mcount
+#endif
+
+#ifndef PROFILE
+ .prologue 0
+#else
+ .prologue 1
+#endif
+
+ ldi $sp, -STACKSIZE($sp)
+
+ ldl B, 0 + STACKSIZE($sp)
+ ldl C, 8 + STACKSIZE($sp)
+ ldl LDC, 16 + STACKSIZE($sp)
+ ldl OFFSET, 24 + STACKSIZE($sp)
+
+ sll LDC, ZBASE_SHIFT, LDC
+
+ fstd $f2, 0($sp)
+ fstd $f3, 8($sp)
+ fstd $f4, 16($sp)
+ fstd $f5, 24($sp)
+ fstd $f6, 32($sp)
+ fstd $f7, 40($sp)
+ fstd $f8, 48($sp)
+ fstd $f9, 56($sp)
+ stl tmp, 72($sp)
+
+ cmple M, 0, $0
+ cmple N, 0, $1
+ cmple K, 0, $2
+
+ or $0, $1, $0
+ or $0, $2, $0
+ bne $0, $L999
+
+#ifdef LN
+ addl M, M, TMP2
+ mull TMP2, K, TMP1
+ SXADDQ TMP1, A, A
+ SXADDQ TMP2, C, C
+#endif
+
+#ifdef RN
+ negl OFFSET, KK
+#endif
+
+#ifdef RT
+ mull N, K, TMP1
+ addl TMP1, TMP1, TMP1
+ SXADDQ TMP1, B, B
+
+ mull N, LDC, TMP1
+ addl TMP1, C, C
+
+ subl N, OFFSET, KK
+#endif
+
+ sra N, 1, J
+ ble J, $L30
+ .align 4
+
+$L01:
+#ifdef RT
+ sll K, ZBASE_SHIFT + 1, TMP1
+ subl B, TMP1, B
+
+ subl C, LDC, C2
+ subl C2, LDC, C1
+ subl C2, LDC, C
+#else
+ mov C, C1
+ addl C, LDC, C2
+ addl C2, LDC, C
+#endif
+
+#ifdef LN
+ addl M, OFFSET, KK
+#endif
+
+#ifdef LT
+ mov OFFSET, KK
+#endif
+
+#if defined(LN) || defined(RT)
+ mov A, AORIG
+#else
+ mov A, AO
+#endif
+
+ sra M, 1, I
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+ fclr c01
+ fclr c05
+
+ ble I, $L20
+ .align 4
+
+$L11:
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(B)
+ fclr c10
+ LD b2, 1 * SIZE(B)
+ fclr c14
+
+ LD b3, 2 * SIZE(B)
+ fclr c03
+ LD b4, 3 * SIZE(B)
+ fclr c07
+
+ ldi BO, 4 * SIZE(B)
+ fclr c11
+ ldi AO, 4 * SIZE(AO)
+ fclr c15
+
+ fillcs 4 * SIZE(C1)
+ fclr c04
+ ldi L, -2(KK)
+ fclr c08
+
+ fillcs 4 * SIZE(C2)
+ fclr c12
+ fclr c16
+ ble KK, $L18
+ ble L, $L15
+#else
+#ifdef LN
+ sll K, ZBASE_SHIFT + 1, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, ZBASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AO
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(BO)
+ fclr c10
+ LD b2, 1 * SIZE(BO)
+ fclr c14
+
+ LD b3, 2 * SIZE(BO)
+ fclr c03
+ LD b4, 3 * SIZE(BO)
+ fclr c07
+
+ ldi BO, 4 * SIZE(BO)
+ fclr c11
+ ldi AO, 4 * SIZE(AO)
+ fclr c15
+
+ fillcs 4 * SIZE(C1)
+ fclr c04
+ ldi L, -2(TMP1)
+ fclr c08
+
+ fillcs 4 * SIZE(C2)
+ fclr c12
+ fclr c16
+ ble TMP1, $L18
+ ble L, $L15
+#endif
+ .align 5
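+
+/* $L12: main K loop, unrolled by two complex iterations (groups 1..8).  Every
+   accumulate is staged through b5 and then copied into its c register with
+   fmov, seemingly so that the ADD never targets the live accumulator
+   directly, and the B value loaded for the second half of the unrolled step
+   is parked in the integer register tmp via FIMOVD/IFMOVD because b5 is
+   busy as the staging register. */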
+
+$L12:
+/* 1 */
+ ADD1 c11, t1, b5
+ fmov b5, c11
+#ifndef EV4
+ fillcs PREFETCHSIZE * SIZE(AO)
+#else
+ unop
+#endif
+ MUL b1, a1, t1
+#ifndef EV4
+ fillcs PREFETCHSIZE * SIZE(BO)
+#else
+ unop
+#endif
+
+ ADD3 c12, t2, b5
+ fmov b5, c12
+ unop
+ MUL b1, a2, t2
+ unop
+
+ ADD2 c16, t3, b5
+ fmov b5, c16
+ unop
+ MUL b2, a2, t3
+ LD a5, 0 * SIZE(AO)
+
+ ADD4 c15, t4, b5
+ fmov b5, c15
+ unop
+ MUL b2, a1, t4
+ LD b5, 0 * SIZE(BO)
+ FIMOVD b5, tmp
+/* 2 */
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ UNOP
+ MUL b1, a3, t1
+ UNOP
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ UNOP
+ MUL b1, a4, t2
+ UNOP
+
+ ADD2 c06, t3, b5
+ fmov b5, c06
+ unop
+ MUL b2, a4, t3
+ unop
+
+ ADD4 c05, t4, b5
+ fmov b5, c05
+ unop
+ MUL b4, a1, t4
+ unop
+
+/* 3 */
+ ADD1 c03, t1, b5
+ fmov b5, c03
+ unop
+ MUL b3, a1, t1
+ unop
+
+ ADD3 c04, t2, b5
+ fmov b5, c04
+ unop
+ MUL b3, a2, t2
+ unop
+
+ ADD2 c08, t3, b5
+ fmov b5, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, 1 * SIZE(AO)
+
+ ADD4 c13, t4, b5
+ fmov b5, c13
+ unop
+ MUL b2, a3, t4
+ LD b2, 1 * SIZE(BO)
+
+/* 4 */
+ ADD1 c09, t1, b5
+ fmov b5, c09
+ unop
+ MUL b3, a3, t1
+ LD a6, 2 * SIZE(AO)
+
+ ADD3 c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD2 c14, t3, b5
+ fmov b5, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, 3 * SIZE(AO)
+
+ ADD4 c07, t4, b5
+ fmov b5, c07
+ unop
+ MUL b4, a3, t4
+ LD b4, 3 * SIZE(BO)
+
+/* 5 */
+ ADD1 c11, t1, b5
+ fmov b5, c11
+ unop
+ IFMOVD tmp, b5
+ MUL b5, a5, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD3 c12, t2, b5
+ fmov b5, c12
+ ldi L, -2(L)
+ IFMOVD tmp, b5
+ MUL b5, a2, t2
+ LD b1, 4 * SIZE(BO)
+
+ ADD2 c16, t3, b5
+ fmov b5, c16
+ unop
+ MUL b2, a2, t3
+ unop
+
+ ADD4 c15, t4, b5
+ fmov b5, c15
+ unop
+ MUL b2, a5, t4
+ unop
+
+/* 6 */
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ unop
+ IFMOVD tmp, b5
+ MUL b5, a6, t1
+ unop
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ unop
+ IFMOVD tmp, b5
+ MUL b5, a4, t2
+ unop
+
+ ADD2 c06, t3, b5
+ fmov b5, c06
+ unop
+ MUL b2, a4, t3
+ unop
+
+ ADD4 c05, t4, b5
+ fmov b5, c05
+ unop
+ MUL b4, a5, t4
+ unop
+
+/* 7 */
+ ADD1 c03, t1, b5
+ fmov b5, c03
+ ldi AO, 8 * SIZE(AO)
+ MUL b3, a5, t1
+ unop
+
+ ADD3 c04, t2, b5
+ fmov b5, c04
+ ldi BO, 8 * SIZE(BO)
+ MUL b3, a2, t2
+ unop
+
+ ADD2 c08, t3, b5
+ fmov b5, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, -3 * SIZE(AO)
+
+ ADD4 c13, t4, b5
+ fmov b5, c13
+ unop
+ MUL b2, a6, t4
+ LD b2, -3 * SIZE(BO)
+
+/* 8 */
+ ADD1 c09, t1, b5
+ fmov b5, c09
+ unop
+ MUL b3, a6, t1
+ LD a3, -2 * SIZE(AO)
+
+ ADD3 c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, -2 * SIZE(BO)
+
+ ADD2 c14, t3, b5
+ fmov b5, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, -1 * SIZE(AO)
+
+ ADD4 c07, t4, b5
+ fmov b5, c07
+ MUL b4, a6, t4
+ LD b4, -1 * SIZE(BO)
+ bgt L, $L12
+ .align 4
+
+$L15:
+ ADD1 c11, t1, b5
+ fmov b5, c11
+ unop
+ MUL b1, a1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L17
+#else
+ blbs TMP1, $L17
+#endif
+ .align 4
+
+ ADD3 c12, t2, b5
+ fmov b5, c12
+ MUL b1, a2, t2
+ ADD2 c16, t3, b5
+ fmov b5, c16
+ MUL b2, a2, t3
+
+ ADD4 c15, t4, b5
+ fmov b5, c15
+ MUL b2, a1, t4
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ MUL b1, a3, t1
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL b1, a4, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD2 c06, t3, b5
+ fmov b5, c06
+ MUL b2, a4, t3
+ ADD4 c05, t4, b5
+ fmov b5, c05
+ MUL b4, a1, t4
+
+ ADD1 c03, t1, b5
+ fmov b5, c03
+ unop
+ MUL b3, a1, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD3 c04, t2, b5
+ fmov b5, c04
+ unop
+ MUL b3, a2, t2
+ unop
+
+ ADD2 c08, t3, b5
+ fmov b5, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, 1 * SIZE(AO)
+
+ ADD4 c13, t4, b5
+ fmov b5, c13
+ unop
+ MUL b2, a3, t4
+ LD b2, 1 * SIZE(BO)
+
+ ADD1 c09, t1, b5
+ fmov b5, c09
+ unop
+ MUL b3, a3, t1
+ ldi AO, 4 * SIZE(AO)
+
+ ADD3 c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD2 c14, t3, b5
+ fmov b5, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, -1 * SIZE(AO)
+
+ ADD4 c07, t4, b5
+ fmov b5, c07
+ unop
+ MUL b4, a3, t4
+ LD a3, -2 * SIZE(AO)
+
+ ADD1 c11, t1, b5
+ fmov b5, c11
+ LD b4, 3 * SIZE(BO)
+ MUL b1, a1, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L17:
+ ADD3 c12, t2, b5
+ fmov b5, c12
+ MUL b1, a2, t2
+ ADD2 c16, t3, b5
+ fmov b5, c16
+ MUL b2, a2, t3
+
+ ADD4 c15, t4, b5
+ fmov b5, c15
+ MUL b2, a1, t4
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ MUL b1, a3, t1
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ MUL b1, a4, t2
+ ADD2 c06, t3, b5
+ fmov b5, c06
+ MUL b2, a4, t3
+
+ ADD4 c05, t4, b5
+ fmov b5, c05
+ MUL b4, a1, t4
+ ADD1 c03, t1, b5
+ fmov b5, c03
+ MUL b3, a1, t1
+
+ ADD3 c04, t2, b5
+ fmov b5, c04
+ MUL b3, a2, t2
+ ADD2 c08, t3, b5
+ fmov b5, c08
+ MUL b4, a2, t3
+
+ ADD4 c13, t4, b5
+ fmov b5, c13
+ MUL b2, a3, t4
+ ADD1 c09, t1, b5
+ fmov b5, c09
+ MUL b3, a3, t1
+
+ ADD3 c10, t2, b5
+ fmov b5, c10
+ MUL b3, a4, t2
+ ADD2 c14, t3, b5
+ fmov b5, c14
+ MUL b4, a4, t3
+
+ ADD4 c07, t4, b5
+ fmov b5, c07
+ ldi AO, 4 * SIZE(AO)
+ MUL b4, a3, t4
+ ldi BO, 4 * SIZE(BO)
+
+ ADD1 c11, t1, b5
+ fmov b5, c11
+ ADD3 c12, t2, b5
+ fmov b5, c12
+ ADD2 c16, t3, b5
+ fmov b5, c16
+ ADD4 c15, t4, b5
+ fmov b5, c15
+
+ ADD c01, c06, b5
+ fmov b5, c01
+ ADD c02, c05, b5
+ fmov b5, c02
+ ADD c03, c08, b5
+ fmov b5, c03
+ ADD c04, c07, b5
+ fmov b5, c04
+
+ ADD c09, c14, b5
+ fmov b5, c09
+ ADD c10, c13, b5
+ fmov b5, c10
+ ADD c11, c16, b5
+ fmov b5, c11
+ ADD c12, c15, b5
+ fmov b5, c12
+ .align 4
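+
+/* $L18: epilogue for the 2x2 tile.  AO/BO are rewound to the start of the
+   tile, the packed values are reloaded and the accumulated products
+   subtracted, the tile is solved against the diagonal block (from A for
+   LN/LT, from B for RN/RT, with ADD5/ADD6 providing the complex sign
+   pattern), and the result is stored both to the packed buffer and to
+   C1/C2. */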
+
+$L18:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 2, TMP1
+#else
+ subl KK, 2, TMP1
+#endif
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -4 * SIZE(AO)
+ ldi BO, -4 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ LD b1, 4 * SIZE(BO)
+ LD b2, 5 * SIZE(BO)
+ LD b3, 6 * SIZE(BO)
+ LD b4, 7 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+ SUB a3, c09, b5
+ fmov b5, c09
+ SUB a4, c10, b5
+ fmov b5, c10
+
+ SUB b1, c03, b5
+ fmov b5, c03
+ SUB b2, c04, b5
+ fmov b5, c04
+ SUB b3, c11, b5
+ fmov b5, c11
+ SUB b4, c12, b5
+ fmov b5, c12
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 4 * SIZE(AO)
+ LD b2, 5 * SIZE(AO)
+ LD b3, 6 * SIZE(AO)
+ LD b4, 7 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+ SUB a3, c03, b5
+ fmov b5, c03
+ SUB a4, c04, b5
+ fmov b5, c04
+
+ SUB b1, c09, b5
+ fmov b5, c09
+ SUB b2, c10, b5
+ fmov b5, c10
+ SUB b3, c11, b5
+ fmov b5, c11
+ SUB b4, c12, b5
+ fmov b5, c12
+#endif
+
+#ifdef LN
+ LD a1, 6 * SIZE(AO)
+ LD a2, 7 * SIZE(AO)
+ LD a3, 4 * SIZE(AO)
+ LD a4, 5 * SIZE(AO)
+
+ MUL a2, c04, t1
+ MUL a2, c03, t2
+ MUL a2, c12, t3
+ MUL a2, c11, t4
+
+ MUL a1, c03, b5
+ fmov b5, c03
+ MUL a1, c04, b5
+ fmov b5, c04
+ MUL a1, c11, b5
+ fmov b5, c11
+ MUL a1, c12, b5
+ fmov b5, c12
+
+ ADD5 c03, t1, b5
+ fmov b5, c03
+ ADD6 c04, t2, b5
+ fmov b5, c04
+ ADD5 c11, t3, b5
+ fmov b5, c11
+ ADD6 c12, t4, b5
+ fmov b5, c12
+
+ MUL a3, c03, t1
+ MUL a3, c04, t2
+ MUL a3, c11, t3
+ MUL a3, c12, t4
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c02, t2, b5
+ fmov b5, c02
+ SUB c09, t3, b5
+ fmov b5, c09
+ SUB c10, t4, b5
+ fmov b5, c10
+
+ MUL a4, c04, t1
+ MUL a4, c03, t2
+ MUL a4, c12, t3
+ MUL a4, c11, t4
+
+ ADD6 c01, t1, b5
+ fmov b5, c01
+ ADD5 c02, t2, b5
+ fmov b5, c02
+ ADD6 c09, t3, b5
+ fmov b5, c09
+ ADD5 c10, t4, b5
+ fmov b5, c10
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a2, c10, t3
+ MUL a2, c09, t4
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a1, c09, b5
+ fmov b5, c09
+ MUL a1, c10, b5
+ fmov b5, c10
+
+ ADD5 c01, t1, b5
+ fmov b5, c01
+ ADD6 c02, t2, b5
+ fmov b5, c02
+ ADD5 c09, t3, b5
+ fmov b5, c09
+ ADD6 c10, t4, b5
+ fmov b5, c10
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a2, c10, t3
+ MUL a2, c09, t4
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a1, c09, b5
+ fmov b5, c09
+ MUL a1, c10, b5
+ fmov b5, c10
+
+ ADD5 c01, t1, b5
+ fmov b5, c01
+ ADD6 c02, t2, b5
+ fmov b5, c02
+ ADD5 c09, t3, b5
+ fmov b5, c09
+ ADD6 c10, t4, b5
+ fmov b5, c10
+
+ MUL a3, c01, t1
+ MUL a3, c02, t2
+ MUL a3, c09, t3
+ MUL a3, c10, t4
+
+ SUB c03, t1, b5
+ fmov b5, c03
+ SUB c04, t2, b5
+ fmov b5, c04
+ SUB c11, t3, b5
+ fmov b5, c11
+ SUB c12, t4, b5
+ fmov b5, c12
+
+ MUL a4, c02, t1
+ MUL a4, c01, t2
+ MUL a4, c10, t3
+ MUL a4, c09, t4
+
+ ADD6 c03, t1, b5
+ fmov b5, c03
+ ADD5 c04, t2, b5
+ fmov b5, c04
+ ADD6 c11, t3, b5
+ fmov b5, c11
+ ADD5 c12, t4, b5
+ fmov b5, c12
+
+ LD a1, 6 * SIZE(AO)
+ LD a2, 7 * SIZE(AO)
+
+ MUL a2, c04, t1
+ MUL a2, c03, t2
+ MUL a2, c12, t3
+ MUL a2, c11, t4
+
+ MUL a1, c03, b5
+ fmov b5, c03
+ MUL a1, c04, b5
+ fmov b5, c04
+ MUL a1, c11, b5
+ fmov b5, c11
+ MUL a1, c12, b5
+ fmov b5, c12
+
+ ADD5 c03, t1, b5
+ fmov b5, c03
+ ADD6 c04, t2, b5
+ fmov b5, c04
+ ADD5 c11, t3, b5
+ fmov b5, c11
+ ADD6 c12, t4, b5
+ fmov b5, c12
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a2, c04, t3
+ MUL a2, c03, t4
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a1, c03, b5
+ fmov b5, c03
+ MUL a1, c04, b5
+ fmov b5, c04
+
+ ADD5 c01, t1, b5
+ fmov b5, c01
+ ADD6 c02, t2, b5
+ fmov b5, c02
+ ADD5 c03, t3, b5
+ fmov b5, c03
+ ADD6 c04, t4, b5
+ fmov b5, c04
+
+ MUL a3, c01, t1
+ MUL a3, c02, t2
+ MUL a3, c03, t3
+ MUL a3, c04, t4
+
+ SUB c09, t1, b5
+ fmov b5, c09
+ SUB c10, t2, b5
+ fmov b5, c10
+ SUB c11, t3, b5
+ fmov b5, c11
+ SUB c12, t4, b5
+ fmov b5, c12
+
+ MUL a4, c02, t1
+ MUL a4, c01, t2
+ MUL a4, c04, t3
+ MUL a4, c03, t4
+
+ ADD6 c09, t1, b5
+ fmov b5, c09
+ ADD5 c10, t2, b5
+ fmov b5, c10
+ ADD6 c11, t3, b5
+ fmov b5, c11
+ ADD5 c12, t4, b5
+ fmov b5, c12
+
+ LD a1, 6 * SIZE(BO)
+ LD a2, 7 * SIZE(BO)
+
+ MUL a2, c10, t1
+ MUL a2, c09, t2
+ MUL a2, c12, t3
+ MUL a2, c11, t4
+
+ MUL a1, c09, b5
+ fmov b5, c09
+ MUL a1, c10, b5
+ fmov b5, c10
+ MUL a1, c11, b5
+ fmov b5, c11
+ MUL a1, c12, b5
+ fmov b5, c12
+
+ ADD5 c09, t1, b5
+ fmov b5, c09
+ ADD6 c10, t2, b5
+ fmov b5, c10
+ ADD5 c11, t3, b5
+ fmov b5, c11
+ ADD6 c12, t4, b5
+ fmov b5, c12
+#endif
+
+#ifdef RT
+ LD a1, 6 * SIZE(BO)
+ LD a2, 7 * SIZE(BO)
+ LD a3, 4 * SIZE(BO)
+ LD a4, 5 * SIZE(BO)
+
+ MUL a2, c10, t1
+ MUL a2, c09, t2
+ MUL a2, c12, t3
+ MUL a2, c11, t4
+
+ MUL a1, c09, b5
+ fmov b5, c09
+ MUL a1, c10, b5
+ fmov b5, c10
+ MUL a1, c11, b5
+ fmov b5, c11
+ MUL a1, c12, b5
+ fmov b5, c12
+
+ ADD5 c09, t1, b5
+ fmov b5, c09
+ ADD6 c10, t2, b5
+ fmov b5, c10
+ ADD5 c11, t3, b5
+ fmov b5, c11
+ ADD6 c12, t4, b5
+ fmov b5, c12
+
+ MUL a3, c09, t1
+ MUL a3, c10, t2
+ MUL a3, c11, t3
+ MUL a3, c12, t4
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c02, t2, b5
+ fmov b5, c02
+ SUB c03, t3, b5
+ fmov b5, c03
+ SUB c04, t4, b5
+ fmov b5, c04
+
+ MUL a4, c10, t1
+ MUL a4, c09, t2
+ MUL a4, c12, t3
+ MUL a4, c11, t4
+
+ ADD6 c01, t1, b5
+ fmov b5, c01
+ ADD5 c02, t2, b5
+ fmov b5, c02
+ ADD6 c03, t3, b5
+ fmov b5, c03
+ ADD5 c04, t4, b5
+ fmov b5, c04
+
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a2, c04, t3
+ MUL a2, c03, t4
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a1, c03, b5
+ fmov b5, c03
+ MUL a1, c04, b5
+ fmov b5, c04
+
+ ADD5 c01, t1, b5
+ fmov b5, c01
+ ADD6 c02, t2, b5
+ fmov b5, c02
+ ADD5 c03, t3, b5
+ fmov b5, c03
+ ADD6 c04, t4, b5
+ fmov b5, c04
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c02, 1 * SIZE(BO)
+ ST c09, 2 * SIZE(BO)
+ ST c10, 3 * SIZE(BO)
+
+ ST c03, 4 * SIZE(BO)
+ ST c04, 5 * SIZE(BO)
+ ST c11, 6 * SIZE(BO)
+ ST c12, 7 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c03, 2 * SIZE(AO)
+ ST c04, 3 * SIZE(AO)
+
+ ST c09, 4 * SIZE(AO)
+ ST c10, 5 * SIZE(AO)
+ ST c11, 6 * SIZE(AO)
+ ST c12, 7 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -4 * SIZE(C1)
+ ldi C2, -4 * SIZE(C2)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c03, 2 * SIZE(C1)
+ ST c04, 3 * SIZE(C1)
+
+ ST c09, 0 * SIZE(C2)
+ ST c10, 1 * SIZE(C2)
+ ST c11, 2 * SIZE(C2)
+ ST c12, 3 * SIZE(C2)
+
+#ifndef LN
+ ldi C1, 4 * SIZE(C1)
+ ldi C2, 4 * SIZE(C2)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, ZBASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, ZBASE_SHIFT + 1, TMP1
+ addl AO, TMP1, AO
+ addl BO, TMP1, BO
+#endif
+
+#ifdef LT
+ addl KK, 2, KK
+#endif
+
+#ifdef LN
+ subl KK, 2, KK
+#endif
+ fclr c01
+ fclr c05
+
+ ldi I, -1(I)
+ bgt I, $L11
+ .align 4
+
+$L20:
+ and M, 1, I
+ ble I, $L29
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(B)
+ fclr c10
+ LD b2, 1 * SIZE(B)
+ fclr c14
+
+ LD b3, 2 * SIZE(B)
+ ldi AO, 2 * SIZE(AO)
+ LD b4, 3 * SIZE(B)
+ ldi BO, 4 * SIZE(B)
+
+ ldi L, -2(KK)
+
+ ble KK, $L28
+ ble L, $L25
+#else
+#ifdef LN
+ sll K, ZBASE_SHIFT + 0, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, ZBASE_SHIFT + 0, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, ZBASE_SHIFT + 1, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(BO)
+ fclr c10
+ LD b2, 1 * SIZE(BO)
+ fclr c14
+
+ LD b3, 2 * SIZE(BO)
+ ldi AO, 2 * SIZE(AO)
+ LD b4, 3 * SIZE(BO)
+ ldi BO, 4 * SIZE(BO)
+
+ ldi L, -2(TMP1)
+
+ ble TMP1, $L28
+ ble L, $L25
+#endif
+ .align 5
+
+$L22:
+ ADD1 c09, t1, b5
+ fmov b5, c09
+ unop
+ MUL a1, b1, t1
+ unop
+
+ ADD3 c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD4 c13, t3, b5
+ fmov b5, c13
+ unop
+ MUL a1, b2, t3
+ ldi BO, 8 * SIZE(BO)
+
+ ADD2 c14, t4, b5
+ fmov b5, c14
+ unop
+ MUL a2, b2, t4
+ LD b2, -7 * SIZE(BO)
+
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b3, t1
+ unop
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b3, t2
+ LD b3, -6 * SIZE(BO)
+
+ ADD4 c05, t3, b5
+ fmov b5, c05
+ unop
+ MUL a1, b4, t3
+ LD a1, 2 * SIZE(AO)
+
+ ADD2 c06, t4, b5
+ fmov b5, c06
+ MUL a2, b4, t4
+ LD b5, -5 * SIZE(BO)
+ FIMOVD b5, tmp
+
+ ADD1 c09, t1, b5
+ fmov b5, c09
+ unop
+ MUL a3, b1, t1
+ LD a2, 3 * SIZE(AO)
+
+ ADD3 c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL a4, b1, t2
+ LD b1, -4 * SIZE(BO)
+
+ ADD4 c13, t3, b5
+ fmov b5, c13
+ unop
+ MUL a3, b2, t3
+ ldi AO, 4 * SIZE(AO)
+
+ ADD2 c14, t4, b5
+ fmov b5, c14
+ MUL a4, b2, t4
+ LD b2, -3 * SIZE(BO)
+
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ ldi L, -2(L)
+ MUL a3, b3, t1
+ LD b4, -1 * SIZE(BO)
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a4, b3, t2
+ LD b3, -2 * SIZE(BO)
+
+ ADD4 c05, t3, b5
+ fmov b5, c05
+ unop
+ IFMOVD tmp, b5
+ MUL a3, b5, t3
+ LD a3, 0 * SIZE(AO)
+
+ ADD2 c06, t4, b5
+ fmov b5, c06
+ IFMOVD tmp, b5
+ MUL a4, b5, t4
+ LD a4, 1 * SIZE(AO)
+ bgt L, $L22
+ .align 4
+
+$L25:
+ ADD1 c09, t1, b5
+ fmov b5, c09
+ MUL a1, b1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L27
+#else
+ blbs TMP1, $L27
+#endif
+ .align 4
+
+ ADD3 c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD4 c13, t3, b5
+ fmov b5, c13
+ unop
+ MUL a1, b2, t3
+ unop
+
+ ADD2 c14, t4, b5
+ fmov b5, c14
+ unop
+ MUL a2, b2, t4
+ LD b2, 1 * SIZE(BO)
+
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b3, t1
+ ldi AO, 2 * SIZE(AO)
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b3, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD4 c05, t3, b5
+ fmov b5, c05
+ unop
+ MUL a1, b4, t3
+ LD a1, -2 * SIZE(AO)
+
+ ADD2 c06, t4, b5
+ fmov b5, c06
+ unop
+ MUL a2, b4, t4
+ LD a2, -1 * SIZE(AO)
+
+ ADD1 c09, t1, b5
+ fmov b5, c09
+ LD b4, 3 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L27:
+ ADD3 c10, t2, b5
+ fmov b5, c10
+ MUL a2, b1, t2
+ ADD4 c13, t3, b5
+ fmov b5, c13
+ MUL a1, b2, t3
+
+ ADD2 c14, t4, b5
+ fmov b5, c14
+ MUL a2, b2, t4
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ MUL a1, b3, t1
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ MUL a2, b3, t2
+ ADD4 c05, t3, b5
+ fmov b5, c05
+ MUL a1, b4, t3
+
+ ADD2 c06, t4, b5
+ fmov b5, c06
+ ldi AO, 2 * SIZE(AO)
+ MUL a2, b4, t4
+ ldi BO, 4 * SIZE(BO)
+
+ ADD1 c09, t1, b5
+ fmov b5, c09
+ ADD3 c10, t2, b5
+ fmov b5, c10
+ ADD4 c13, t3, b5
+ fmov b5, c13
+ ADD2 c14, t4, b5
+ fmov b5, c14
+
+ ADD c01, c06, b5
+ fmov b5, c01
+ ADD c02, c05, b5
+ fmov b5, c02
+ ADD c09, c14, b5
+ fmov b5, c09
+ ADD c10, c13, b5
+ fmov b5, c10
+ .align 4
+
+$L28:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 1, TMP1
+#else
+ subl KK, 2, TMP1
+#endif
+ sll TMP1, ZBASE_SHIFT + 0, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -2 * SIZE(AO)
+ ldi BO, -4 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+ SUB a3, c09, b5
+ fmov b5, c09
+ SUB a4, c10, b5
+ fmov b5, c10
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+ SUB a3, c09, b5
+ fmov b5, c09
+ SUB a4, c10, b5
+ fmov b5, c10
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a2, c10, t3
+ MUL a2, c09, t4
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a1, c09, b5
+ fmov b5, c09
+ MUL a1, c10, b5
+ fmov b5, c10
+
+ ADD5 c01, t1, b5
+ fmov b5, c01
+ ADD6 c02, t2, b5
+ fmov b5, c02
+ ADD5 c09, t3, b5
+ fmov b5, c09
+ ADD6 c10, t4, b5
+ fmov b5, c10
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+
+ ADD5 c01, t1, b5
+ fmov b5, c01
+ ADD6 c02, t2, b5
+ fmov b5, c02
+
+ MUL a3, c01, t1
+ MUL a3, c02, t2
+ SUB c09, t1, b5
+ fmov b5, c09
+ SUB c10, t2, b5
+ fmov b5, c10
+
+ MUL a4, c02, t1
+ MUL a4, c01, t2
+ ADD6 c09, t1, b5
+ fmov b5, c09
+ ADD5 c10, t2, b5
+ fmov b5, c10
+
+ LD a1, 6 * SIZE(BO)
+ LD a2, 7 * SIZE(BO)
+
+ MUL a2, c10, t1
+ MUL a2, c09, t2
+ MUL a1, c09, b5
+ fmov b5, c09
+ MUL a1, c10, b5
+ fmov b5, c10
+
+ ADD5 c09, t1, b5
+ fmov b5, c09
+ ADD6 c10, t2, b5
+ fmov b5, c10
+#endif
+
+#ifdef RT
+ LD a1, 6 * SIZE(BO)
+ LD a2, 7 * SIZE(BO)
+ LD a3, 4 * SIZE(BO)
+ LD a4, 5 * SIZE(BO)
+
+ MUL a2, c10, t1
+ MUL a2, c09, t2
+ MUL a1, c09, b5
+ fmov b5, c09
+ MUL a1, c10, b5
+ fmov b5, c10
+
+ ADD5 c09, t1, b5
+ fmov b5, c09
+ ADD6 c10, t2, b5
+ fmov b5, c10
+
+ MUL a3, c09, t1
+ MUL a3, c10, t2
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c02, t2, b5
+ fmov b5, c02
+
+ MUL a4, c10, t1
+ MUL a4, c09, t2
+ ADD6 c01, t1, b5
+ fmov b5, c01
+ ADD5 c02, t2, b5
+ fmov b5, c02
+
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+
+ ADD5 c01, t1, b5
+ fmov b5, c01
+ ADD6 c02, t2, b5
+ fmov b5, c02
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c02, 1 * SIZE(BO)
+ ST c09, 2 * SIZE(BO)
+ ST c10, 3 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c09, 2 * SIZE(AO)
+ ST c10, 3 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -2 * SIZE(C1)
+ ldi C2, -2 * SIZE(C2)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c09, 0 * SIZE(C2)
+ ST c10, 1 * SIZE(C2)
+
+#ifndef LN
+ ldi C1, 2 * SIZE(C1)
+ ldi C2, 2 * SIZE(C2)
+#endif
+
+#ifdef RT
+ sll K, ZBASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, ZBASE_SHIFT + 0, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 1, KK
+#endif
+
+#ifdef LN
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L29:
+#ifdef LN
+ sll K, ZBASE_SHIFT + 1, TMP1
+ addl B, TMP1, B
+#endif
+
+#if defined(LT) || defined(RN)
+ mov BO, B
+#endif
+
+#ifdef RN
+ addl KK, 2, KK
+#endif
+
+#ifdef RT
+ subl KK, 2, KK
+#endif
+
+ ldi J, -1(J)
+ bgt J, $L01
+ .align 4
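+
+/* $L30: tail for odd N.  The same structure is repeated for a single column
+   of B/C: $L41 handles two rows of A per iteration, $L50 and below the last
+   odd row. */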
+
+$L30:
+ and N, 1, J
+ ble J, $L999
+
+#ifdef RT
+ sll K, ZBASE_SHIFT, TMP1
+ subl B, TMP1, B
+
+ subl C, LDC, C1
+ subl C, LDC, C
+#else
+ mov C, C1
+ addl C, LDC, C
+#endif
+
+#ifdef LN
+ addl M, OFFSET, KK
+#endif
+
+#ifdef LT
+ mov OFFSET, KK
+#endif
+
+#if defined(LN) || defined(RT)
+ mov A, AORIG
+#else
+ mov A, AO
+#endif
+
+ sra M, 1, I
+ ble I, $L50
+ .align 4
+
+$L41:
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c05
+ LD b3, 2 * SIZE(B)
+ fclr c02
+ LD b4, 3 * SIZE(B)
+ fclr c06
+
+ ldi BO, 2 * SIZE(B)
+ fclr c03
+ ldi AO, 4 * SIZE(AO)
+ fclr c07
+
+ ldi L, -2(KK)
+ fclr c04
+ fclr c08
+
+ ble KK, $L48
+ ble L, $L45
+#else
+#ifdef LN
+ sll K, ZBASE_SHIFT + 1, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, ZBASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, ZBASE_SHIFT, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c05
+ LD b3, 2 * SIZE(BO)
+ fclr c02
+ LD b4, 3 * SIZE(BO)
+ fclr c06
+
+ ldi BO, 2 * SIZE(BO)
+ fclr c03
+ ldi AO, 4 * SIZE(AO)
+ fclr c07
+
+ ldi L, -2(TMP1)
+ fclr c04
+ fclr c08
+
+ ble TMP1, $L48
+ ble L, $L45
+#endif
+ .align 5
+
+$L42:
+ ADD4 c05, t1, b5
+ fmov b5, c05
+ unop
+ MUL a1, b1, t1
+ unop
+
+ ADD2 c06, t2, b5
+ fmov b5, c06
+ ldi L, -2(L)
+ MUL a2, b1, t2
+ unop
+
+ ADD4 c07, t3, b5
+ fmov b5, c07
+ unop
+ MUL a3, b1, t3
+ unop
+
+ ADD2 c08, t4, b5
+ fmov b5, c08
+ unop
+ MUL a4, b1, t4
+ LD b1, 2 * SIZE(BO)
+
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b2, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ ldi BO, 4 * SIZE(BO)
+ MUL a2, b2, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD1 c03, t3, b5
+ fmov b5, c03
+ unop
+ MUL a3, b2, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD3 c04, t4, b5
+ fmov b5, c04
+ unop
+ MUL a4, b2, t4
+ LD a5, 3 * SIZE(AO)
+
+ ADD4 c05, t1, b5
+ fmov b5, c05
+ unop
+ MUL a1, b3, t1
+ LD b2, -1 * SIZE(BO)
+
+ ADD2 c06, t2, b5
+ fmov b5, c06
+ unop
+ MUL a2, b3, t2
+ unop
+
+ ADD4 c07, t3, b5
+ fmov b5, c07
+ unop
+ MUL a3, b3, t3
+ ldi AO, 8 * SIZE(AO)
+
+ ADD2 c08, t4, b5
+ fmov b5, c08
+ unop
+ MUL a5, b3, t4
+ LD b3, 0 * SIZE(BO)
+
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b4, t1
+ LD a1, -4 * SIZE(AO)
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b4, t2
+ LD a2, -3 * SIZE(AO)
+
+ ADD1 c03, t3, b5
+ fmov b5, c03
+ LD a4, -1 * SIZE(AO)
+ MUL a3, b4, t3
+ LD a3, -2 * SIZE(AO)
+
+ ADD3 c04, t4, b5
+ fmov b5, c04
+ MUL a5, b4, t4
+ LD b4, 1 * SIZE(BO)
+ bgt L, $L42
+ .align 4
+
+$L45:
+ ADD4 c05, t1, b5
+ fmov b5, c05
+ MUL b1, a1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L47
+#else
+ blbs TMP1, $L47
+#endif
+ .align 4
+
+ ADD2 c06, t2, b5
+ fmov b5, c06
+ MUL a2, b1, t2
+ ADD4 c07, t3, b5
+ fmov b5, c07
+ MUL a3, b1, t3
+
+ ADD2 c08, t4, b5
+ fmov b5, c08
+ unop
+ MUL a4, b1, t4
+ LD b1, 0 * SIZE(BO)
+
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b2, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b2, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD1 c03, t3, b5
+ fmov b5, c03
+ unop
+ MUL a3, b2, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD3 c04, t4, b5
+ fmov b5, c04
+ MUL a4, b2, t4
+ LD a4, 3 * SIZE(AO)
+ ldi AO, 4 * SIZE(AO)
+
+ ADD4 c05, t1, b5
+ fmov b5, c05
+ LD b2, 1 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi BO, 2 * SIZE(BO)
+ .align 4
+
+$L47:
+ ADD2 c06, t2, b5
+ fmov b5, c06
+ MUL a2, b1, t2
+ ADD4 c07, t3, b5
+ fmov b5, c07
+ MUL a3, b1, t3
+
+ ADD2 c08, t4, b5
+ fmov b5, c08
+ MUL a4, b1, t4
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ MUL a1, b2, t1
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ MUL a2, b2, t2
+ ADD1 c03, t3, b5
+ fmov b5, c03
+ MUL a3, b2, t3
+
+ ADD3 c04, t4, b5
+ fmov b5, c04
+ ldi AO, 4 * SIZE(AO)
+ MUL a4, b2, t4
+ ldi BO, 2 * SIZE(BO)
+
+ ADD4 c05, t1, b5
+ fmov b5, c05
+ ADD2 c06, t2, b5
+ fmov b5, c06
+ ADD4 c07, t3, b5
+ fmov b5, c07
+ ADD2 c08, t4, b5
+ fmov b5, c08
+
+ ADD c01, c06, b5
+ fmov b5, c01
+ ADD c02, c05, b5
+ fmov b5, c02
+ ADD c03, c08, b5
+ fmov b5, c03
+ ADD c04, c07, b5
+ fmov b5, c04
+
+$L48:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 2, TMP1
+#else
+ subl KK, 1, TMP1
+#endif
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, ZBASE_SHIFT, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -4 * SIZE(AO)
+ ldi BO, -2 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+ SUB a3, c03, b5
+ fmov b5, c03
+ SUB a4, c04, b5
+ fmov b5, c04
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+ SUB a3, c03, b5
+ fmov b5, c03
+ SUB a4, c04, b5
+ fmov b5, c04
+#endif
+
+#ifdef LN
+ LD a1, 6 * SIZE(AO)
+ LD a2, 7 * SIZE(AO)
+ LD a3, 4 * SIZE(AO)
+ LD a4, 5 * SIZE(AO)
+
+ MUL a2, c04, t1
+ MUL a2, c03, t2
+ MUL a1, c03, b5
+ fmov b5, c03
+ MUL a1, c04, b5
+ fmov b5, c04
+
+ ADD5 c03, t1, b5
+ fmov b5, c03
+ ADD6 c04, t2, b5
+ fmov b5, c04
+ MUL a3, c03, t1
+ MUL a3, c04, t2
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c02, t2, b5
+ fmov b5, c02
+ MUL a4, c04, t1
+ MUL a4, c03, t2
+
+ ADD6 c01, t1, b5
+ fmov b5, c01
+ ADD5 c02, t2, b5
+ fmov b5, c02
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+
+ ADD5 c01, t1, b5
+ fmov b5, c01
+ ADD6 c02, t2, b5
+ fmov b5, c02
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+
+ ADD5 c01, t1, b5
+ fmov b5, c01
+ ADD6 c02, t2, b5
+ fmov b5, c02
+ MUL a3, c01, t1
+ MUL a3, c02, t2
+
+ SUB c03, t1, b5
+ fmov b5, c03
+ SUB c04, t2, b5
+ fmov b5, c04
+
+ MUL a4, c02, t1
+ MUL a4, c01, t2
+ ADD6 c03, t1, b5
+ fmov b5, c03
+ ADD5 c04, t2, b5
+ fmov b5, c04
+
+ LD a1, 6 * SIZE(AO)
+ LD a2, 7 * SIZE(AO)
+
+ MUL a2, c04, t1
+ MUL a2, c03, t2
+ MUL a1, c03, b5
+ fmov b5, c03
+ MUL a1, c04, b5
+ fmov b5, c04
+
+ ADD5 c03, t1, b5
+ fmov b5, c03
+ ADD6 c04, t2, b5
+ fmov b5, c04
+#endif
+
+#if defined(RN) || defined(RT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a2, c04, t3
+ MUL a2, c03, t4
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a1, c03, b5
+ fmov b5, c03
+ MUL a1, c04, b5
+ fmov b5, c04
+
+ ADD5 c01, t1, b5
+ fmov b5, c01
+ ADD6 c02, t2, b5
+ fmov b5, c02
+ ADD5 c03, t3, b5
+ fmov b5, c03
+ ADD6 c04, t4, b5
+ fmov b5, c04
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c02, 1 * SIZE(BO)
+ ST c03, 2 * SIZE(BO)
+ ST c04, 3 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c03, 2 * SIZE(AO)
+ ST c04, 3 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -4 * SIZE(C1)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c03, 2 * SIZE(C1)
+ ST c04, 3 * SIZE(C1)
+
+#ifndef LN
+ ldi C1, 4 * SIZE(C1)
+#endif
+
+#ifdef RT
+ sll K, ZBASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, ZBASE_SHIFT, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 2, KK
+#endif
+
+#ifdef LN
+ subl KK, 2, KK
+#endif
+
+ ldi I, -1(I)
+ bgt I, $L41
+ .align 4
+
+$L50:
+ and M, 1, I
+ ble I, $L59
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c05
+
+ LD b3, 2 * SIZE(B)
+ fclr c02
+ LD b4, 3 * SIZE(B)
+ fclr c06
+
+ ldi AO, 2 * SIZE(AO)
+ ldi BO, 2 * SIZE(B)
+
+ ldi L, -2(KK)
+
+ ble KK, $L58
+ ble L, $L55
+#else
+#ifdef LN
+ sll K, ZBASE_SHIFT, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, ZBASE_SHIFT, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, ZBASE_SHIFT, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c05
+
+ LD b3, 2 * SIZE(BO)
+ fclr c02
+ LD b4, 3 * SIZE(BO)
+ fclr c06
+
+ ldi AO, 2 * SIZE(AO)
+ ldi BO, 2 * SIZE(BO)
+
+ ldi L, -2(TMP1)
+
+ ble TMP1, $L58
+ ble L, $L55
+#endif
+ .align 5
+
+$L52:
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b1, t1
+ unop
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ ldi AO, 4 * SIZE(AO)
+ MUL a2, b1, t2
+ LD b1, 2 * SIZE(BO)
+
+ ADD4 c05, t3, b5
+ fmov b5, c05
+ ldi L, -2(L)
+ MUL a1, b2, t3
+ LD a1, -2 * SIZE(AO)
+
+ ADD2 c06, t4, b5
+ fmov b5, c06
+ unop
+ MUL a2, b2, t4
+ LD a2, -1 * SIZE(AO)
+
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ LD b2, 3 * SIZE(BO)
+ MUL a3, b3, t1
+ ldi BO, 4 * SIZE(BO)
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a4, b3, t2
+ LD b3, 0 * SIZE(BO)
+
+ ADD4 c05, t3, b5
+ fmov b5, c05
+ unop
+ MUL a3, b4, t3
+ LD a3, 0 * SIZE(AO)
+
+ ADD2 c06, t4, b5
+ fmov b5, c06
+ MUL a4, b4, t4
+ LD b4, 1 * SIZE(BO)
+ unop
+
+ LD a4, 1 * SIZE(AO)
+ unop
+ unop
+ bgt L, $L52
+ .align 4
+
+$L55:
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ MUL a1, b1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L57
+#else
+ blbs TMP1, $L57
+#endif
+ .align 4
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD4 c05, t3, b5
+ fmov b5, c05
+ ldi BO, 2 * SIZE(BO)
+ MUL a1, b2, t3
+ LD a1, 0 * SIZE(AO)
+
+ ADD2 c06, t4, b5
+ fmov b5, c06
+ unop
+ MUL a2, b2, t4
+ LD a2, 1 * SIZE(AO)
+
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ LD b2, -1 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi AO, 2 * SIZE(AO)
+ .align 4
+
+$L57:
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ MUL a2, b1, t2
+ ADD4 c05, t3, b5
+ fmov b5, c05
+ MUL a1, b2, t3
+
+ ADD2 c06, t4, b5
+ fmov b5, c06
+ ldi AO, 2 * SIZE(AO)
+ MUL a2, b2, t4
+ ldi BO, 2 * SIZE(BO)
+
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ ADD4 c05, t3, b5
+ fmov b5, c05
+ ADD2 c06, t4, b5
+ fmov b5, c06
+
+ ADD c01, c06, b5
+ fmov b5, c01
+ ADD c02, c05, b5
+ fmov b5, c02
+
+$L58:
+#if defined(LN) || defined(RT)
+ subl KK, 1, TMP1
+
+ sll TMP1, ZBASE_SHIFT, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, ZBASE_SHIFT, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -2 * SIZE(AO)
+ ldi BO, -2 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+
+ ADD5 c01, t1, b5
+ fmov b5, c01
+ ADD6 c02, t2, b5
+ fmov b5, c02
+#endif
+
+#if defined(RN) || defined(RT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+
+ ADD5 c01, t1, b5
+ fmov b5, c01
+ ADD6 c02, t2, b5
+ fmov b5, c02
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c02, 1 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -2 * SIZE(C1)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+
+#ifndef LN
+ ldi C1, 2 * SIZE(C1)
+#endif
+
+#ifdef RT
+ sll K, ZBASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, ZBASE_SHIFT, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, ZBASE_SHIFT, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 1, KK
+#endif
+
+#ifdef LN
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L59:
+#ifdef LN
+ sll K, ZBASE_SHIFT, TMP1
+ addl B, TMP1, B
+#endif
+
+#if defined(LT) || defined(RN)
+ mov BO, B
+#endif
+
+#ifdef RN
+ addl KK, 1, KK
+#endif
+
+#ifdef RT
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L999:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+ ldl tmp, 72($sp)
+
+ clr $0
+ ldi $sp, STACKSIZE($sp)
+ ret
+ .ident VERSION
+ .end CNAME
diff --git a/kernel/sw_64/ztrsm_kernel_2x2_LT.S.bak b/kernel/sw_64/ztrsm_kernel_2x2_LT.S.bak
new file mode 100644
index 0000000..f4a2c13
--- /dev/null
+++ b/kernel/sw_64/ztrsm_kernel_2x2_LT.S.bak
@@ -0,0 +1,2222 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#if !defined(SW2B)
+#error "Architecture is not specified."
+#endif
+
+#ifdef SW2B
+#define PREFETCHSIZE 56
+#define UNOP unop
+#endif
+
+
+ .set noat
+ .set noreorder
+ .arch ev6
+
+.text
+ .align 5
+ .globl CNAME
+ .ent CNAME
+
+#define STACKSIZE 80
+
+#define M $16
+#define N $17
+#define K $18
+#define A $21
+#define B $22
+#define C $20
+#define LDC $23
+
+#define C1 $19
+#define C2 $24
+
+#define AO $at
+#define BO $5
+#define I $6
+#define J $7
+#define L $8
+
+#define a1 $f16
+#define a2 $f17
+#define a3 $f18
+#define a4 $f19
+
+#define b1 $f20
+#define b2 $f21
+#define b3 $f22
+#define b4 $f23
+
+#define t1 $f24
+#define t2 $f25
+#define t3 $f26
+#define t4 $f27
+
+#define a5 $f28
+#define a6 $f30
+#define b5 $f29
+
+#define alpha_i $f29
+#define alpha_r $f30
+
+#define c01 $f0
+#define c02 $f1
+#define c03 $f2
+#define c04 $f3
+
+#define c05 $f4
+#define c06 $f5
+#define c07 $f6
+#define c08 $f7
+
+#define c09 $f8
+#define c10 $f9
+#define c11 $f10
+#define c12 $f11
+
+#define c13 $f12
+#define c14 $f13
+#define c15 $f14
+#define c16 $f15
+
+#define TMP1 $0
+#define TMP2 $1
+#define KK $2
+#define AORIG $3
+#define OFFSET $4
+
+#if defined(LN) || defined(LT)
+#ifndef CONJ
+#define ADD1 ADD
+#define ADD2 SUB
+#define ADD3 ADD
+#define ADD4 ADD
+#define ADD5 SUB
+#define ADD6 ADD
+#else
+#define ADD1 ADD
+#define ADD2 ADD
+#define ADD3 SUB
+#define ADD4 ADD
+#define ADD5 ADD
+#define ADD6 SUB
+#endif
+#else
+#ifndef CONJ
+#define ADD1 ADD
+#define ADD2 SUB
+#define ADD3 ADD
+#define ADD4 ADD
+#define ADD5 SUB
+#define ADD6 ADD
+#else
+#define ADD1 ADD
+#define ADD2 ADD
+#define ADD3 ADD
+#define ADD4 SUB
+#define ADD5 ADD
+#define ADD6 SUB
+#endif
+#endif
+
+
+CNAME:
+ .frame $sp, STACKSIZE, $26, 0
+
+#ifdef PROFILE
+ ldgp $gp, 0($27)
+ ldi $at, _mcount
+ jsr $at, ($at), _mcount
+#endif
+
+#ifndef PROFILE
+ .prologue 0
+#else
+ .prologue 1
+#endif
+
+ ldi $sp, -STACKSIZE($sp)
+
+ ldl B, 0 + STACKSIZE($sp)
+ ldl C, 8 + STACKSIZE($sp)
+ ldl LDC, 16 + STACKSIZE($sp)
+ ldl OFFSET, 24 + STACKSIZE($sp)
+
+ sll LDC, ZBASE_SHIFT, LDC
+
+ fstd $f2, 0($sp)
+ fstd $f3, 8($sp)
+ fstd $f4, 16($sp)
+ fstd $f5, 24($sp)
+ fstd $f6, 32($sp)
+ fstd $f7, 40($sp)
+ fstd $f8, 48($sp)
+ fstd $f9, 56($sp)
+
+ cmple M, 0, $0
+ cmple N, 0, $1
+ cmple K, 0, $2
+
+ or $0, $1, $0
+ or $0, $2, $0
+ bne $0, $L999
+
+#ifdef LN
+ addl M, M, TMP2
+ mull TMP2, K, TMP1
+ SXADDQ TMP1, A, A
+ SXADDQ TMP2, C, C
+#endif
+
+#ifdef RN
+ negl OFFSET, KK
+#endif
+
+#ifdef RT
+ mull N, K, TMP1
+ addl TMP1, TMP1, TMP1
+ SXADDQ TMP1, B, B
+
+ mull N, LDC, TMP1
+ addl TMP1, C, C
+
+ subl N, OFFSET, KK
+#endif
+
+ sra N, 1, J
+ ble J, $L30
+ .align 4
+
+$L01:
+#ifdef RT
+ sll K, ZBASE_SHIFT + 1, TMP1
+ subl B, TMP1, B
+
+ subl C, LDC, C2
+ subl C2, LDC, C1
+ subl C2, LDC, C
+#else
+ mov C, C1
+ addl C, LDC, C2
+ addl C2, LDC, C
+#endif
+
+#ifdef LN
+ addl M, OFFSET, KK
+#endif
+
+#ifdef LT
+ mov OFFSET, KK
+#endif
+
+#if defined(LN) || defined(RT)
+ mov A, AORIG
+#else
+ mov A, AO
+#endif
+
+ sra M, 1, I
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+ fclr c01
+ fclr c05
+
+ ble I, $L20
+ .align 4
+
+$L11:
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(B)
+ fclr c10
+ LD b2, 1 * SIZE(B)
+ fclr c14
+
+ LD b3, 2 * SIZE(B)
+ fclr c03
+ LD b4, 3 * SIZE(B)
+ fclr c07
+
+ ldi BO, 4 * SIZE(B)
+ fclr c11
+ ldi AO, 4 * SIZE(AO)
+ fclr c15
+
+ fillcs 4 * SIZE(C1)
+ fclr c04
+ ldi L, -2(KK)
+ fclr c08
+
+ fillcs 4 * SIZE(C2)
+ fclr c12
+ fclr c16
+ ble KK, $L18
+ ble L, $L15
+#else
+#ifdef LN
+ sll K, ZBASE_SHIFT + 1, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, ZBASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AO
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(BO)
+ fclr c10
+ LD b2, 1 * SIZE(BO)
+ fclr c14
+
+ LD b3, 2 * SIZE(BO)
+ fclr c03
+ LD b4, 3 * SIZE(BO)
+ fclr c07
+
+ ldi BO, 4 * SIZE(BO)
+ fclr c11
+ ldi AO, 4 * SIZE(AO)
+ fclr c15
+
+ fillcs 4 * SIZE(C1)
+ fclr c04
+ ldi L, -2(TMP1)
+ fclr c08
+
+ fillcs 4 * SIZE(C2)
+ fclr c12
+ fclr c16
+ ble TMP1, $L18
+ ble L, $L15
+#endif
+ .align 5
+
+$L12:
+/* 1 */
+ ADD1 c11, t1, c11
+#ifndef EV4
+ fillcs PREFETCHSIZE * SIZE(AO)
+#else
+ unop
+#endif
+ MUL b1, a1, t1
+#ifndef EV4
+ fillcs PREFETCHSIZE * SIZE(BO)
+#else
+ unop
+#endif
+
+ ADD3 c12, t2, c12
+ unop
+ MUL b1, a2, t2
+ unop
+
+ ADD2 c16, t3, c16
+ unop
+ MUL b2, a2, t3
+ LD a5, 0 * SIZE(AO)
+
+ ADD4 c15, t4, c15
+ unop
+ MUL b2, a1, t4
+ LD b5, 0 * SIZE(BO)
+
+/* 2 */
+ ADD1 c01, t1, c01
+ UNOP
+ MUL b1, a3, t1
+ UNOP
+
+ ADD3 c02, t2, c02
+ UNOP
+ MUL b1, a4, t2
+ UNOP
+
+ ADD2 c06, t3, c06
+ unop
+ MUL b2, a4, t3
+ unop
+
+ ADD4 c05, t4, c05
+ unop
+ MUL b4, a1, t4
+ unop
+
+/* 3 */
+ ADD1 c03, t1, c03
+ unop
+ MUL b3, a1, t1
+ unop
+
+ ADD3 c04, t2, c04
+ unop
+ MUL b3, a2, t2
+ unop
+
+ ADD2 c08, t3, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, 1 * SIZE(AO)
+
+ ADD4 c13, t4, c13
+ unop
+ MUL b2, a3, t4
+ LD b2, 1 * SIZE(BO)
+
+/* 4 */
+ ADD1 c09, t1, c09
+ unop
+ MUL b3, a3, t1
+ LD a6, 2 * SIZE(AO)
+
+ ADD3 c10, t2, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD2 c14, t3, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, 3 * SIZE(AO)
+
+ ADD4 c07, t4, c07
+ unop
+ MUL b4, a3, t4
+ LD b4, 3 * SIZE(BO)
+
+/* 5 */
+ ADD1 c11, t1, c11
+ unop
+ MUL b5, a5, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD3 c12, t2, c12
+ ldi L, -2(L)
+ MUL b5, a2, t2
+ LD b1, 4 * SIZE(BO)
+
+ ADD2 c16, t3, c16
+ unop
+ MUL b2, a2, t3
+ unop
+
+ ADD4 c15, t4, c15
+ unop
+ MUL b2, a5, t4
+ unop
+
+/* 6 */
+ ADD1 c01, t1, c01
+ unop
+ MUL b5, a6, t1
+ unop
+
+ ADD3 c02, t2, c02
+ unop
+ MUL b5, a4, t2
+ unop
+
+ ADD2 c06, t3, c06
+ unop
+ MUL b2, a4, t3
+ unop
+
+ ADD4 c05, t4, c05
+ unop
+ MUL b4, a5, t4
+ unop
+
+/* 7 */
+ ADD1 c03, t1, c03
+ ldi AO, 8 * SIZE(AO)
+ MUL b3, a5, t1
+ unop
+
+ ADD3 c04, t2, c04
+ ldi BO, 8 * SIZE(BO)
+ MUL b3, a2, t2
+ unop
+
+ ADD2 c08, t3, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, -3 * SIZE(AO)
+
+ ADD4 c13, t4, c13
+ unop
+ MUL b2, a6, t4
+ LD b2, -3 * SIZE(BO)
+
+/* 8 */
+ ADD1 c09, t1, c09
+ unop
+ MUL b3, a6, t1
+ LD a3, -2 * SIZE(AO)
+
+ ADD3 c10, t2, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, -2 * SIZE(BO)
+
+ ADD2 c14, t3, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, -1 * SIZE(AO)
+
+ ADD4 c07, t4, c07
+ MUL b4, a6, t4
+ LD b4, -1 * SIZE(BO)
+ bgt L, $L12
+ .align 4
+
+$L15:
+ ADD1 c11, t1, c11
+ unop
+ MUL b1, a1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L17
+#else
+ blbs TMP1, $L17
+#endif
+ .align 4
+
+ ADD3 c12, t2, c12
+ MUL b1, a2, t2
+ ADD2 c16, t3, c16
+ MUL b2, a2, t3
+
+ ADD4 c15, t4, c15
+ MUL b2, a1, t4
+ ADD1 c01, t1, c01
+ MUL b1, a3, t1
+
+ ADD3 c02, t2, c02
+ unop
+ MUL b1, a4, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD2 c06, t3, c06
+ MUL b2, a4, t3
+ ADD4 c05, t4, c05
+ MUL b4, a1, t4
+
+ ADD1 c03, t1, c03
+ unop
+ MUL b3, a1, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD3 c04, t2, c04
+ unop
+ MUL b3, a2, t2
+ unop
+
+ ADD2 c08, t3, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, 1 * SIZE(AO)
+
+ ADD4 c13, t4, c13
+ unop
+ MUL b2, a3, t4
+ LD b2, 1 * SIZE(BO)
+
+ ADD1 c09, t1, c09
+ unop
+ MUL b3, a3, t1
+ ldi AO, 4 * SIZE(AO)
+
+ ADD3 c10, t2, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD2 c14, t3, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, -1 * SIZE(AO)
+
+ ADD4 c07, t4, c07
+ unop
+ MUL b4, a3, t4
+ LD a3, -2 * SIZE(AO)
+
+ ADD1 c11, t1, c11
+ LD b4, 3 * SIZE(BO)
+ MUL b1, a1, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L17:
+ ADD3 c12, t2, c12
+ MUL b1, a2, t2
+ ADD2 c16, t3, c16
+ MUL b2, a2, t3
+
+ ADD4 c15, t4, c15
+ MUL b2, a1, t4
+ ADD1 c01, t1, c01
+ MUL b1, a3, t1
+
+ ADD3 c02, t2, c02
+ MUL b1, a4, t2
+ ADD2 c06, t3, c06
+ MUL b2, a4, t3
+
+ ADD4 c05, t4, c05
+ MUL b4, a1, t4
+ ADD1 c03, t1, c03
+ MUL b3, a1, t1
+
+ ADD3 c04, t2, c04
+ MUL b3, a2, t2
+ ADD2 c08, t3, c08
+ MUL b4, a2, t3
+
+ ADD4 c13, t4, c13
+ MUL b2, a3, t4
+ ADD1 c09, t1, c09
+ MUL b3, a3, t1
+
+ ADD3 c10, t2, c10
+ MUL b3, a4, t2
+ ADD2 c14, t3, c14
+ MUL b4, a4, t3
+
+ ADD4 c07, t4, c07
+ ldi AO, 4 * SIZE(AO)
+ MUL b4, a3, t4
+ ldi BO, 4 * SIZE(BO)
+
+ ADD1 c11, t1, c11
+ ADD3 c12, t2, c12
+ ADD2 c16, t3, c16
+ ADD4 c15, t4, c15
+
+ ADD c01, c06, c01
+ ADD c02, c05, c02
+ ADD c03, c08, c03
+ ADD c04, c07, c04
+
+ ADD c09, c14, c09
+ ADD c10, c13, c10
+ ADD c11, c16, c11
+ ADD c12, c15, c12
+ .align 4
+
+$L18:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 2, TMP1
+#else
+ subl KK, 2, TMP1
+#endif
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -4 * SIZE(AO)
+ ldi BO, -4 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ LD b1, 4 * SIZE(BO)
+ LD b2, 5 * SIZE(BO)
+ LD b3, 6 * SIZE(BO)
+ LD b4, 7 * SIZE(BO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+ SUB a3, c09, c09
+ SUB a4, c10, c10
+
+ SUB b1, c03, c03
+ SUB b2, c04, c04
+ SUB b3, c11, c11
+ SUB b4, c12, c12
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 4 * SIZE(AO)
+ LD b2, 5 * SIZE(AO)
+ LD b3, 6 * SIZE(AO)
+ LD b4, 7 * SIZE(AO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+ SUB a3, c03, c03
+ SUB a4, c04, c04
+
+ SUB b1, c09, c09
+ SUB b2, c10, c10
+ SUB b3, c11, c11
+ SUB b4, c12, c12
+#endif
+
+#ifdef LN
+ LD a1, 6 * SIZE(AO)
+ LD a2, 7 * SIZE(AO)
+ LD a3, 4 * SIZE(AO)
+ LD a4, 5 * SIZE(AO)
+
+ MUL a2, c04, t1
+ MUL a2, c03, t2
+ MUL a2, c12, t3
+ MUL a2, c11, t4
+
+ MUL a1, c03, c03
+ MUL a1, c04, c04
+ MUL a1, c11, c11
+ MUL a1, c12, c12
+
+ ADD5 c03, t1, c03
+ ADD6 c04, t2, c04
+ ADD5 c11, t3, c11
+ ADD6 c12, t4, c12
+
+ MUL a3, c03, t1
+ MUL a3, c04, t2
+ MUL a3, c11, t3
+ MUL a3, c12, t4
+
+ SUB c01, t1, c01
+ SUB c02, t2, c02
+ SUB c09, t3, c09
+ SUB c10, t4, c10
+
+ MUL a4, c04, t1
+ MUL a4, c03, t2
+ MUL a4, c12, t3
+ MUL a4, c11, t4
+
+ ADD6 c01, t1, c01
+ ADD5 c02, t2, c02
+ ADD6 c09, t3, c09
+ ADD5 c10, t4, c10
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a2, c10, t3
+ MUL a2, c09, t4
+
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+ MUL a1, c09, c09
+ MUL a1, c10, c10
+
+ ADD5 c01, t1, c01
+ ADD6 c02, t2, c02
+ ADD5 c09, t3, c09
+ ADD6 c10, t4, c10
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a2, c10, t3
+ MUL a2, c09, t4
+
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+ MUL a1, c09, c09
+ MUL a1, c10, c10
+
+ ADD5 c01, t1, c01
+ ADD6 c02, t2, c02
+ ADD5 c09, t3, c09
+ ADD6 c10, t4, c10
+
+ MUL a3, c01, t1
+ MUL a3, c02, t2
+ MUL a3, c09, t3
+ MUL a3, c10, t4
+
+ SUB c03, t1, c03
+ SUB c04, t2, c04
+ SUB c11, t3, c11
+ SUB c12, t4, c12
+
+ MUL a4, c02, t1
+ MUL a4, c01, t2
+ MUL a4, c10, t3
+ MUL a4, c09, t4
+
+ ADD6 c03, t1, c03
+ ADD5 c04, t2, c04
+ ADD6 c11, t3, c11
+ ADD5 c12, t4, c12
+
+ LD a1, 6 * SIZE(AO)
+ LD a2, 7 * SIZE(AO)
+
+ MUL a2, c04, t1
+ MUL a2, c03, t2
+ MUL a2, c12, t3
+ MUL a2, c11, t4
+
+ MUL a1, c03, c03
+ MUL a1, c04, c04
+ MUL a1, c11, c11
+ MUL a1, c12, c12
+
+ ADD5 c03, t1, c03
+ ADD6 c04, t2, c04
+ ADD5 c11, t3, c11
+ ADD6 c12, t4, c12
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a2, c04, t3
+ MUL a2, c03, t4
+
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+ MUL a1, c03, c03
+ MUL a1, c04, c04
+
+ ADD5 c01, t1, c01
+ ADD6 c02, t2, c02
+ ADD5 c03, t3, c03
+ ADD6 c04, t4, c04
+
+ MUL a3, c01, t1
+ MUL a3, c02, t2
+ MUL a3, c03, t3
+ MUL a3, c04, t4
+
+ SUB c09, t1, c09
+ SUB c10, t2, c10
+ SUB c11, t3, c11
+ SUB c12, t4, c12
+
+ MUL a4, c02, t1
+ MUL a4, c01, t2
+ MUL a4, c04, t3
+ MUL a4, c03, t4
+
+ ADD6 c09, t1, c09
+ ADD5 c10, t2, c10
+ ADD6 c11, t3, c11
+ ADD5 c12, t4, c12
+
+ LD a1, 6 * SIZE(BO)
+ LD a2, 7 * SIZE(BO)
+
+ MUL a2, c10, t1
+ MUL a2, c09, t2
+ MUL a2, c12, t3
+ MUL a2, c11, t4
+
+ MUL a1, c09, c09
+ MUL a1, c10, c10
+ MUL a1, c11, c11
+ MUL a1, c12, c12
+
+ ADD5 c09, t1, c09
+ ADD6 c10, t2, c10
+ ADD5 c11, t3, c11
+ ADD6 c12, t4, c12
+#endif
+
+#ifdef RT
+ LD a1, 6 * SIZE(BO)
+ LD a2, 7 * SIZE(BO)
+ LD a3, 4 * SIZE(BO)
+ LD a4, 5 * SIZE(BO)
+
+ MUL a2, c10, t1
+ MUL a2, c09, t2
+ MUL a2, c12, t3
+ MUL a2, c11, t4
+
+ MUL a1, c09, c09
+ MUL a1, c10, c10
+ MUL a1, c11, c11
+ MUL a1, c12, c12
+
+ ADD5 c09, t1, c09
+ ADD6 c10, t2, c10
+ ADD5 c11, t3, c11
+ ADD6 c12, t4, c12
+
+ MUL a3, c09, t1
+ MUL a3, c10, t2
+ MUL a3, c11, t3
+ MUL a3, c12, t4
+
+ SUB c01, t1, c01
+ SUB c02, t2, c02
+ SUB c03, t3, c03
+ SUB c04, t4, c04
+
+ MUL a4, c10, t1
+ MUL a4, c09, t2
+ MUL a4, c12, t3
+ MUL a4, c11, t4
+
+ ADD6 c01, t1, c01
+ ADD5 c02, t2, c02
+ ADD6 c03, t3, c03
+ ADD5 c04, t4, c04
+
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a2, c04, t3
+ MUL a2, c03, t4
+
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+ MUL a1, c03, c03
+ MUL a1, c04, c04
+
+ ADD5 c01, t1, c01
+ ADD6 c02, t2, c02
+ ADD5 c03, t3, c03
+ ADD6 c04, t4, c04
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c02, 1 * SIZE(BO)
+ ST c09, 2 * SIZE(BO)
+ ST c10, 3 * SIZE(BO)
+
+ ST c03, 4 * SIZE(BO)
+ ST c04, 5 * SIZE(BO)
+ ST c11, 6 * SIZE(BO)
+ ST c12, 7 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c03, 2 * SIZE(AO)
+ ST c04, 3 * SIZE(AO)
+
+ ST c09, 4 * SIZE(AO)
+ ST c10, 5 * SIZE(AO)
+ ST c11, 6 * SIZE(AO)
+ ST c12, 7 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -4 * SIZE(C1)
+ ldi C2, -4 * SIZE(C2)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c03, 2 * SIZE(C1)
+ ST c04, 3 * SIZE(C1)
+
+ ST c09, 0 * SIZE(C2)
+ ST c10, 1 * SIZE(C2)
+ ST c11, 2 * SIZE(C2)
+ ST c12, 3 * SIZE(C2)
+
+#ifndef LN
+ ldi C1, 4 * SIZE(C1)
+ ldi C2, 4 * SIZE(C2)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, ZBASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, ZBASE_SHIFT + 1, TMP1
+ addl AO, TMP1, AO
+ addl BO, TMP1, BO
+#endif
+
+#ifdef LT
+ addl KK, 2, KK
+#endif
+
+#ifdef LN
+ subl KK, 2, KK
+#endif
+ fclr c01
+ fclr c05
+
+ ldi I, -1(I)
+ bgt I, $L11
+ .align 4
+
+$L20:
+ and M, 1, I
+ ble I, $L29
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(B)
+ fclr c10
+ LD b2, 1 * SIZE(B)
+ fclr c14
+
+ LD b3, 2 * SIZE(B)
+ ldi AO, 2 * SIZE(AO)
+ LD b4, 3 * SIZE(B)
+ ldi BO, 4 * SIZE(B)
+
+ ldi L, -2(KK)
+
+ ble KK, $L28
+ ble L, $L25
+#else
+#ifdef LN
+ sll K, ZBASE_SHIFT + 0, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, ZBASE_SHIFT + 0, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, ZBASE_SHIFT + 1, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(BO)
+ fclr c10
+ LD b2, 1 * SIZE(BO)
+ fclr c14
+
+ LD b3, 2 * SIZE(BO)
+ ldi AO, 2 * SIZE(AO)
+ LD b4, 3 * SIZE(BO)
+ ldi BO, 4 * SIZE(BO)
+
+ ldi L, -2(TMP1)
+
+ ble TMP1, $L28
+ ble L, $L25
+#endif
+ .align 5
+
+$L22:
+ ADD1 c09, t1, c09
+ unop
+ MUL a1, b1, t1
+ unop
+
+ ADD3 c10, t2, c10
+ unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD4 c13, t3, c13
+ unop
+ MUL a1, b2, t3
+ ldi BO, 8 * SIZE(BO)
+
+ ADD2 c14, t4, c14
+ unop
+ MUL a2, b2, t4
+ LD b2, -7 * SIZE(BO)
+
+ ADD1 c01, t1, c01
+ unop
+ MUL a1, b3, t1
+ unop
+
+ ADD3 c02, t2, c02
+ unop
+ MUL a2, b3, t2
+ LD b3, -6 * SIZE(BO)
+
+ ADD4 c05, t3, c05
+ unop
+ MUL a1, b4, t3
+ LD a1, 2 * SIZE(AO)
+
+ ADD2 c06, t4, c06
+ MUL a2, b4, t4
+ LD b5, -5 * SIZE(BO)
+
+ ADD1 c09, t1, c09
+ unop
+ MUL a3, b1, t1
+ LD a2, 3 * SIZE(AO)
+
+ ADD3 c10, t2, c10
+ unop
+ MUL a4, b1, t2
+ LD b1, -4 * SIZE(BO)
+
+ ADD4 c13, t3, c13
+ unop
+ MUL a3, b2, t3
+ ldi AO, 4 * SIZE(AO)
+
+ ADD2 c14, t4, c14
+ MUL a4, b2, t4
+ LD b2, -3 * SIZE(BO)
+
+ ADD1 c01, t1, c01
+ ldi L, -2(L)
+ MUL a3, b3, t1
+ LD b4, -1 * SIZE(BO)
+
+ ADD3 c02, t2, c02
+ unop
+ MUL a4, b3, t2
+ LD b3, -2 * SIZE(BO)
+
+ ADD4 c05, t3, c05
+ unop
+ MUL a3, b5, t3
+ LD a3, 0 * SIZE(AO)
+
+ ADD2 c06, t4, c06
+ MUL a4, b5, t4
+ LD a4, 1 * SIZE(AO)
+ bgt L, $L22
+ .align 4
+
+$L25:
+ ADD1 c09, t1, c09
+ MUL a1, b1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L27
+#else
+ blbs TMP1, $L27
+#endif
+ .align 4
+
+ ADD3 c10, t2, c10
+ unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD4 c13, t3, c13
+ unop
+ MUL a1, b2, t3
+ unop
+
+ ADD2 c14, t4, c14
+ unop
+ MUL a2, b2, t4
+ LD b2, 1 * SIZE(BO)
+
+ ADD1 c01, t1, c01
+ unop
+ MUL a1, b3, t1
+ ldi AO, 2 * SIZE(AO)
+
+ ADD3 c02, t2, c02
+ unop
+ MUL a2, b3, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD4 c05, t3, c05
+ unop
+ MUL a1, b4, t3
+ LD a1, -2 * SIZE(AO)
+
+ ADD2 c06, t4, c06
+ unop
+ MUL a2, b4, t4
+ LD a2, -1 * SIZE(AO)
+
+ ADD1 c09, t1, c09
+ LD b4, 3 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L27:
+ ADD3 c10, t2, c10
+ MUL a2, b1, t2
+ ADD4 c13, t3, c13
+ MUL a1, b2, t3
+
+ ADD2 c14, t4, c14
+ MUL a2, b2, t4
+ ADD1 c01, t1, c01
+ MUL a1, b3, t1
+
+ ADD3 c02, t2, c02
+ MUL a2, b3, t2
+ ADD4 c05, t3, c05
+ MUL a1, b4, t3
+
+ ADD2 c06, t4, c06
+ ldi AO, 2 * SIZE(AO)
+ MUL a2, b4, t4
+ ldi BO, 4 * SIZE(BO)
+
+ ADD1 c09, t1, c09
+ ADD3 c10, t2, c10
+ ADD4 c13, t3, c13
+ ADD2 c14, t4, c14
+
+ ADD c01, c06, c01
+ ADD c02, c05, c02
+ ADD c09, c14, c09
+ ADD c10, c13, c10
+ .align 4
+
+$L28:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 1, TMP1
+#else
+ subl KK, 2, TMP1
+#endif
+ sll TMP1, ZBASE_SHIFT + 0, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -2 * SIZE(AO)
+ ldi BO, -4 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+ SUB a3, c09, c09
+ SUB a4, c10, c10
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+ SUB a3, c09, c09
+ SUB a4, c10, c10
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a2, c10, t3
+ MUL a2, c09, t4
+
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+ MUL a1, c09, c09
+ MUL a1, c10, c10
+
+ ADD5 c01, t1, c01
+ ADD6 c02, t2, c02
+ ADD5 c09, t3, c09
+ ADD6 c10, t4, c10
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+
+ ADD5 c01, t1, c01
+ ADD6 c02, t2, c02
+
+ MUL a3, c01, t1
+ MUL a3, c02, t2
+ SUB c09, t1, c09
+ SUB c10, t2, c10
+
+ MUL a4, c02, t1
+ MUL a4, c01, t2
+ ADD6 c09, t1, c09
+ ADD5 c10, t2, c10
+
+ LD a1, 6 * SIZE(BO)
+ LD a2, 7 * SIZE(BO)
+
+ MUL a2, c10, t1
+ MUL a2, c09, t2
+ MUL a1, c09, c09
+ MUL a1, c10, c10
+
+ ADD5 c09, t1, c09
+ ADD6 c10, t2, c10
+#endif
+
+#ifdef RT
+ LD a1, 6 * SIZE(BO)
+ LD a2, 7 * SIZE(BO)
+ LD a3, 4 * SIZE(BO)
+ LD a4, 5 * SIZE(BO)
+
+ MUL a2, c10, t1
+ MUL a2, c09, t2
+ MUL a1, c09, c09
+ MUL a1, c10, c10
+
+ ADD5 c09, t1, c09
+ ADD6 c10, t2, c10
+
+ MUL a3, c09, t1
+ MUL a3, c10, t2
+ SUB c01, t1, c01
+ SUB c02, t2, c02
+
+ MUL a4, c10, t1
+ MUL a4, c09, t2
+ ADD6 c01, t1, c01
+ ADD5 c02, t2, c02
+
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+
+ ADD5 c01, t1, c01
+ ADD6 c02, t2, c02
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c02, 1 * SIZE(BO)
+ ST c09, 2 * SIZE(BO)
+ ST c10, 3 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c09, 2 * SIZE(AO)
+ ST c10, 3 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -2 * SIZE(C1)
+ ldi C2, -2 * SIZE(C2)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c09, 0 * SIZE(C2)
+ ST c10, 1 * SIZE(C2)
+
+#ifndef LN
+ ldi C1, 2 * SIZE(C1)
+ ldi C2, 2 * SIZE(C2)
+#endif
+
+#ifdef RT
+ sll K, ZBASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, ZBASE_SHIFT + 0, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 1, KK
+#endif
+
+#ifdef LN
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L29:
+#ifdef LN
+ sll K, ZBASE_SHIFT + 1, TMP1
+ addl B, TMP1, B
+#endif
+
+#if defined(LT) || defined(RN)
+ mov BO, B
+#endif
+
+#ifdef RN
+ addl KK, 2, KK
+#endif
+
+#ifdef RT
+ subl KK, 2, KK
+#endif
+
+ ldi J, -1(J)
+ bgt J, $L01
+ .align 4
+
+$L30:
+ and N, 1, J
+ ble J, $L999
+
+#ifdef RT
+ sll K, ZBASE_SHIFT, TMP1
+ subl B, TMP1, B
+
+ subl C, LDC, C1
+ subl C, LDC, C
+#else
+ mov C, C1
+ addl C, LDC, C
+#endif
+
+#ifdef LN
+ addl M, OFFSET, KK
+#endif
+
+#ifdef LT
+ mov OFFSET, KK
+#endif
+
+#if defined(LN) || defined(RT)
+ mov A, AORIG
+#else
+ mov A, AO
+#endif
+
+ sra M, 1, I
+ ble I, $L50
+ .align 4
+
+$L41:
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c05
+ LD b3, 2 * SIZE(B)
+ fclr c02
+ LD b4, 3 * SIZE(B)
+ fclr c06
+
+ ldi BO, 2 * SIZE(B)
+ fclr c03
+ ldi AO, 4 * SIZE(AO)
+ fclr c07
+
+ ldi L, -2(KK)
+ fclr c04
+ fclr c08
+
+ ble KK, $L48
+ ble L, $L45
+#else
+#ifdef LN
+ sll K, ZBASE_SHIFT + 1, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, ZBASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, ZBASE_SHIFT, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c05
+ LD b3, 2 * SIZE(BO)
+ fclr c02
+ LD b4, 3 * SIZE(BO)
+ fclr c06
+
+ ldi BO, 2 * SIZE(BO)
+ fclr c03
+ ldi AO, 4 * SIZE(AO)
+ fclr c07
+
+ ldi L, -2(TMP1)
+ fclr c04
+ fclr c08
+
+ ble TMP1, $L48
+ ble L, $L45
+#endif
+ .align 5
+
+$L42:
+ ADD4 c05, t1, c05
+ unop
+ MUL a1, b1, t1
+ unop
+
+ ADD2 c06, t2, c06
+ ldi L, -2(L)
+ MUL a2, b1, t2
+ unop
+
+ ADD4 c07, t3, c07
+ unop
+ MUL a3, b1, t3
+ unop
+
+ ADD2 c08, t4, c08
+ unop
+ MUL a4, b1, t4
+ LD b1, 2 * SIZE(BO)
+
+ ADD1 c01, t1, c01
+ unop
+ MUL a1, b2, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD3 c02, t2, c02
+ ldi BO, 4 * SIZE(BO)
+ MUL a2, b2, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD1 c03, t3, c03
+ unop
+ MUL a3, b2, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD3 c04, t4, c04
+ unop
+ MUL a4, b2, t4
+ LD a5, 3 * SIZE(AO)
+
+ ADD4 c05, t1, c05
+ unop
+ MUL a1, b3, t1
+ LD b2, -1 * SIZE(BO)
+
+ ADD2 c06, t2, c06
+ unop
+ MUL a2, b3, t2
+ unop
+
+ ADD4 c07, t3, c07
+ unop
+ MUL a3, b3, t3
+ ldi AO, 8 * SIZE(AO)
+
+ ADD2 c08, t4, c08
+ unop
+ MUL a5, b3, t4
+ LD b3, 0 * SIZE(BO)
+
+ ADD1 c01, t1, c01
+ unop
+ MUL a1, b4, t1
+ LD a1, -4 * SIZE(AO)
+
+ ADD3 c02, t2, c02
+ unop
+ MUL a2, b4, t2
+ LD a2, -3 * SIZE(AO)
+
+ ADD1 c03, t3, c03
+ LD a4, -1 * SIZE(AO)
+ MUL a3, b4, t3
+ LD a3, -2 * SIZE(AO)
+
+ ADD3 c04, t4, c04
+ MUL a5, b4, t4
+ LD b4, 1 * SIZE(BO)
+ bgt L, $L42
+ .align 4
+
+$L45:
+ ADD4 c05, t1, c05
+ MUL b1, a1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L47
+#else
+ blbs TMP1, $L47
+#endif
+ .align 4
+
+ ADD2 c06, t2, c06
+ MUL a2, b1, t2
+ ADD4 c07, t3, c07
+ MUL a3, b1, t3
+
+ ADD2 c08, t4, c08
+ unop
+ MUL a4, b1, t4
+ LD b1, 0 * SIZE(BO)
+
+ ADD1 c01, t1, c01
+ unop
+ MUL a1, b2, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD3 c02, t2, c02
+ unop
+ MUL a2, b2, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD1 c03, t3, c03
+ unop
+ MUL a3, b2, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD3 c04, t4, c04
+ MUL a4, b2, t4
+ LD a4, 3 * SIZE(AO)
+ ldi AO, 4 * SIZE(AO)
+
+ ADD4 c05, t1, c05
+ LD b2, 1 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi BO, 2 * SIZE(BO)
+ .align 4
+
+$L47:
+ ADD2 c06, t2, c06
+ MUL a2, b1, t2
+ ADD4 c07, t3, c07
+ MUL a3, b1, t3
+
+ ADD2 c08, t4, c08
+ MUL a4, b1, t4
+ ADD1 c01, t1, c01
+ MUL a1, b2, t1
+
+ ADD3 c02, t2, c02
+ MUL a2, b2, t2
+ ADD1 c03, t3, c03
+ MUL a3, b2, t3
+
+ ADD3 c04, t4, c04
+ ldi AO, 4 * SIZE(AO)
+ MUL a4, b2, t4
+ ldi BO, 2 * SIZE(BO)
+
+ ADD4 c05, t1, c05
+ ADD2 c06, t2, c06
+ ADD4 c07, t3, c07
+ ADD2 c08, t4, c08
+
+ ADD c01, c06, c01
+ ADD c02, c05, c02
+ ADD c03, c08, c03
+ ADD c04, c07, c04
+
+$L48:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 2, TMP1
+#else
+ subl KK, 1, TMP1
+#endif
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, ZBASE_SHIFT, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -4 * SIZE(AO)
+ ldi BO, -2 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+ SUB a3, c03, c03
+ SUB a4, c04, c04
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+ SUB a3, c03, c03
+ SUB a4, c04, c04
+#endif
+
+#ifdef LN
+ LD a1, 6 * SIZE(AO)
+ LD a2, 7 * SIZE(AO)
+ LD a3, 4 * SIZE(AO)
+ LD a4, 5 * SIZE(AO)
+
+ MUL a2, c04, t1
+ MUL a2, c03, t2
+ MUL a1, c03, c03
+ MUL a1, c04, c04
+
+ ADD5 c03, t1, c03
+ ADD6 c04, t2, c04
+ MUL a3, c03, t1
+ MUL a3, c04, t2
+
+ SUB c01, t1, c01
+ SUB c02, t2, c02
+ MUL a4, c04, t1
+ MUL a4, c03, t2
+
+ ADD6 c01, t1, c01
+ ADD5 c02, t2, c02
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+
+ ADD5 c01, t1, c01
+ ADD6 c02, t2, c02
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+
+ ADD5 c01, t1, c01
+ ADD6 c02, t2, c02
+ MUL a3, c01, t1
+ MUL a3, c02, t2
+
+ SUB c03, t1, c03
+ SUB c04, t2, c04
+
+ MUL a4, c02, t1
+ MUL a4, c01, t2
+ ADD6 c03, t1, c03
+ ADD5 c04, t2, c04
+
+ LD a1, 6 * SIZE(AO)
+ LD a2, 7 * SIZE(AO)
+
+ MUL a2, c04, t1
+ MUL a2, c03, t2
+ MUL a1, c03, c03
+ MUL a1, c04, c04
+
+ ADD5 c03, t1, c03
+ ADD6 c04, t2, c04
+#endif
+
+#if defined(RN) || defined(RT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a2, c04, t3
+ MUL a2, c03, t4
+
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+ MUL a1, c03, c03
+ MUL a1, c04, c04
+
+ ADD5 c01, t1, c01
+ ADD6 c02, t2, c02
+ ADD5 c03, t3, c03
+ ADD6 c04, t4, c04
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c02, 1 * SIZE(BO)
+ ST c03, 2 * SIZE(BO)
+ ST c04, 3 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c03, 2 * SIZE(AO)
+ ST c04, 3 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -4 * SIZE(C1)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c03, 2 * SIZE(C1)
+ ST c04, 3 * SIZE(C1)
+
+#ifndef LN
+ ldi C1, 4 * SIZE(C1)
+#endif
+
+#ifdef RT
+ sll K, ZBASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, ZBASE_SHIFT, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 2, KK
+#endif
+
+#ifdef LN
+ subl KK, 2, KK
+#endif
+
+ ldi I, -1(I)
+ bgt I, $L41
+ .align 4
+
+$L50:
+ and M, 1, I
+ ble I, $L59
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c05
+
+ LD b3, 2 * SIZE(B)
+ fclr c02
+ LD b4, 3 * SIZE(B)
+ fclr c06
+
+ ldi AO, 2 * SIZE(AO)
+ ldi BO, 2 * SIZE(B)
+
+ ldi L, -2(KK)
+
+ ble KK, $L58
+ ble L, $L55
+#else
+#ifdef LN
+ sll K, ZBASE_SHIFT, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, ZBASE_SHIFT, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, ZBASE_SHIFT, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c05
+
+ LD b3, 2 * SIZE(BO)
+ fclr c02
+ LD b4, 3 * SIZE(BO)
+ fclr c06
+
+ ldi AO, 2 * SIZE(AO)
+ ldi BO, 2 * SIZE(BO)
+
+ ldi L, -2(TMP1)
+
+ ble TMP1, $L58
+ ble L, $L55
+#endif
+ .align 5
+
+$L52:
+ ADD1 c01, t1, c01
+ unop
+ MUL a1, b1, t1
+ unop
+
+ ADD3 c02, t2, c02
+ ldi AO, 4 * SIZE(AO)
+ MUL a2, b1, t2
+ LD b1, 2 * SIZE(BO)
+
+ ADD4 c05, t3, c05
+ ldi L, -2(L)
+ MUL a1, b2, t3
+ LD a1, -2 * SIZE(AO)
+
+ ADD2 c06, t4, c06
+ unop
+ MUL a2, b2, t4
+ LD a2, -1 * SIZE(AO)
+
+ ADD1 c01, t1, c01
+ LD b2, 3 * SIZE(BO)
+ MUL a3, b3, t1
+ ldi BO, 4 * SIZE(BO)
+
+ ADD3 c02, t2, c02
+ unop
+ MUL a4, b3, t2
+ LD b3, 0 * SIZE(BO)
+
+ ADD4 c05, t3, c05
+ unop
+ MUL a3, b4, t3
+ LD a3, 0 * SIZE(AO)
+
+ ADD2 c06, t4, c06
+ MUL a4, b4, t4
+ LD b4, 1 * SIZE(BO)
+ unop
+
+ LD a4, 1 * SIZE(AO)
+ unop
+ unop
+ bgt L, $L52
+ .align 4
+
+$L55:
+ ADD1 c01, t1, c01
+ MUL a1, b1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L57
+#else
+ blbs TMP1, $L57
+#endif
+ .align 4
+
+ ADD3 c02, t2, c02
+ unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD4 c05, t3, c05
+ ldi BO, 2 * SIZE(BO)
+ MUL a1, b2, t3
+ LD a1, 0 * SIZE(AO)
+
+ ADD2 c06, t4, c06
+ unop
+ MUL a2, b2, t4
+ LD a2, 1 * SIZE(AO)
+
+ ADD1 c01, t1, c01
+ LD b2, -1 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi AO, 2 * SIZE(AO)
+ .align 4
+
+$L57:
+ ADD3 c02, t2, c02
+ MUL a2, b1, t2
+ ADD4 c05, t3, c05
+ MUL a1, b2, t3
+
+ ADD2 c06, t4, c06
+ ldi AO, 2 * SIZE(AO)
+ MUL a2, b2, t4
+ ldi BO, 2 * SIZE(BO)
+
+ ADD1 c01, t1, c01
+ ADD3 c02, t2, c02
+ ADD4 c05, t3, c05
+ ADD2 c06, t4, c06
+
+ ADD c01, c06, c01
+ ADD c02, c05, c02
+
+$L58:
+#if defined(LN) || defined(RT)
+ subl KK, 1, TMP1
+
+ sll TMP1, ZBASE_SHIFT, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, ZBASE_SHIFT, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -2 * SIZE(AO)
+ ldi BO, -2 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+
+ ADD5 c01, t1, c01
+ ADD6 c02, t2, c02
+#endif
+
+#if defined(RN) || defined(RT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+
+ ADD5 c01, t1, c01
+ ADD6 c02, t2, c02
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c02, 1 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -2 * SIZE(C1)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+
+#ifndef LN
+ ldi C1, 2 * SIZE(C1)
+#endif
+
+#ifdef RT
+ sll K, ZBASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, ZBASE_SHIFT, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, ZBASE_SHIFT, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 1, KK
+#endif
+
+#ifdef LN
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L59:
+#ifdef LN
+ sll K, ZBASE_SHIFT, TMP1
+ addl B, TMP1, B
+#endif
+
+#if defined(LT) || defined(RN)
+ mov BO, B
+#endif
+
+#ifdef RN
+ addl KK, 1, KK
+#endif
+
+#ifdef RT
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L999:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+ clr $0
+ ldi $sp, STACKSIZE($sp)
+ ret
+ .ident VERSION
+ .end CNAME
diff --git a/kernel/sw_64/ztrsm_kernel_2x2_RT.S b/kernel/sw_64/ztrsm_kernel_2x2_RT.S
new file mode 100644
index 0000000..97dbc16
--- /dev/null
+++ b/kernel/sw_64/ztrsm_kernel_2x2_RT.S
@@ -0,0 +1,2623 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#if !defined(EV4) && !defined(EV5) && !defined(SW6)
+#error "Architecture is not specified."
+#endif
+
+#ifdef SW6
+#define PREFETCHSIZE 56
+#define UNOP unop
+#endif
+
+#ifdef EV5
+#define PREFETCHSIZE 48
+#define UNOP
+#endif
+
+#ifdef EV4
+#define UNOP
+#endif
+
+ .set noat
+ .set noreorder
+ .arch sw6a
+
+.text
+ .align 5
+ .globl CNAME
+ .ent CNAME
+
+#define STACKSIZE 88
+
+#define M $16
+#define N $17
+#define K $18
+#define A $21
+#define B $22
+#define C $20
+#define LDC $23
+
+#define C1 $19
+#define C2 $24
+
+#define AO $at
+#define BO $5
+#define I $6
+#define J $7
+#define L $8
+
+#define tmp $9
+
+#define a1 $f16
+#define a2 $f17
+#define a3 $f18
+#define a4 $f19
+
+#define b1 $f20
+#define b2 $f21
+#define b3 $f22
+#define b4 $f23
+
+#define t1 $f24
+#define t2 $f25
+#define t3 $f26
+#define t4 $f27
+
+#define a5 $f28
+#define a6 $f30
+#define b5 $f29
+
+#define alpha_i $f29
+#define alpha_r $f30
+
+#define c01 $f0
+#define c02 $f1
+#define c03 $f2
+#define c04 $f3
+
+#define c05 $f4
+#define c06 $f5
+#define c07 $f6
+#define c08 $f7
+
+#define c09 $f8
+#define c10 $f9
+#define c11 $f10
+#define c12 $f11
+
+#define c13 $f12
+#define c14 $f13
+#define c15 $f14
+#define c16 $f15
+
+#define TMP1 $0
+#define TMP2 $1
+#define KK $2
+#define AORIG $3
+#define OFFSET $4
+
+#if defined(LN) || defined(LT)
+#ifndef CONJ
+#define ADD1 ADD
+#define ADD2 SUB
+#define ADD3 ADD
+#define ADD4 ADD
+#define ADD5 SUB
+#define ADD6 ADD
+#else
+#define ADD1 ADD
+#define ADD2 ADD
+#define ADD3 SUB
+#define ADD4 ADD
+#define ADD5 ADD
+#define ADD6 SUB
+#endif
+#else
+#ifndef CONJ
+#define ADD1 ADD
+#define ADD2 SUB
+#define ADD3 ADD
+#define ADD4 ADD
+#define ADD5 SUB
+#define ADD6 ADD
+#else
+#define ADD1 ADD
+#define ADD2 ADD
+#define ADD3 ADD
+#define ADD4 SUB
+#define ADD5 ADD
+#define ADD6 SUB
+#endif
+#endif
+
+
+CNAME:
+ .frame $sp, STACKSIZE, $26, 0
+
+#ifdef PROFILE
+ ldgp $gp, 0($27)
+ ldi $at, _mcount
+ jsr $at, ($at), _mcount
+#endif
+
+#ifndef PROFILE
+ .prologue 0
+#else
+ .prologue 1
+#endif
+
+ ldi $sp, -STACKSIZE($sp)
+
+ ldl B, 0 + STACKSIZE($sp)
+ ldl C, 8 + STACKSIZE($sp)
+ ldl LDC, 16 + STACKSIZE($sp)
+ ldl OFFSET, 24 + STACKSIZE($sp)
+
+ sll LDC, ZBASE_SHIFT, LDC
+
+ fstd $f2, 0($sp)
+ fstd $f3, 8($sp)
+ fstd $f4, 16($sp)
+ fstd $f5, 24($sp)
+ fstd $f6, 32($sp)
+ fstd $f7, 40($sp)
+ fstd $f8, 48($sp)
+ fstd $f9, 56($sp)
+ stl tmp, 72($sp)
+
+ cmple M, 0, $0
+ cmple N, 0, $1
+ cmple K, 0, $2
+
+ or $0, $1, $0
+ or $0, $2, $0
+ bne $0, $L999
+
+#ifdef LN
+ addl M, M, TMP2
+ mull TMP2, K, TMP1
+ SXADDQ TMP1, A, A
+ SXADDQ TMP2, C, C
+#endif
+
+#ifdef RN
+ negl OFFSET, KK
+#endif
+
+#ifdef RT
+ mull N, K, TMP1
+ addl TMP1, TMP1, TMP1
+ SXADDQ TMP1, B, B
+
+ mull N, LDC, TMP1
+ addl TMP1, C, C
+
+ subl N, OFFSET, KK
+#endif
+
+ and N, 1, J
+ ble J, $L30
+
+#ifdef RT
+ sll K, ZBASE_SHIFT, TMP1
+ subl B, TMP1, B
+
+ subl C, LDC, C1
+ subl C, LDC, C
+#else
+ mov C, C1
+ addl C, LDC, C
+#endif
+
+#ifdef LN
+ addl M, OFFSET, KK
+#endif
+
+#ifdef LT
+ mov OFFSET, KK
+#endif
+
+#if defined(LN) || defined(RT)
+ mov A, AORIG
+#else
+ mov A, AO
+#endif
+
+ sra M, 1, I
+ ble I, $L50
+ .align 4
+
+$L41:
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c05
+ LD b3, 2 * SIZE(B)
+ fclr c02
+ LD b4, 3 * SIZE(B)
+ fclr c06
+
+ ldi BO, 2 * SIZE(B)
+ fclr c03
+ ldi AO, 4 * SIZE(AO)
+ fclr c07
+
+ ldi L, -2(KK)
+ fclr c04
+ fclr c08
+
+ ble KK, $L48
+ ble L, $L45
+#else
+#ifdef LN
+ sll K, ZBASE_SHIFT + 1, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, ZBASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, ZBASE_SHIFT, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c05
+ LD b3, 2 * SIZE(BO)
+ fclr c02
+ LD b4, 3 * SIZE(BO)
+ fclr c06
+
+ ldi BO, 2 * SIZE(BO)
+ fclr c03
+ ldi AO, 4 * SIZE(AO)
+ fclr c07
+
+ ldi L, -2(TMP1)
+ fclr c04
+ fclr c08
+
+ ble TMP1, $L48
+ ble L, $L45
+#endif
+ .align 5
+
+$L42:
+ ADD4 c05, t1, b5
+ fmov b5, c05
+ unop
+ MUL a1, b1, t1
+ unop
+
+ ADD2 c06, t2, b5
+ fmov b5, c06
+ ldi L, -2(L)
+ MUL a2, b1, t2
+ unop
+
+ ADD4 c07, t3, b5
+ fmov b5, c07
+ unop
+ MUL a3, b1, t3
+ unop
+
+ ADD2 c08, t4, b5
+ fmov b5, c08
+ unop
+ MUL a4, b1, t4
+ LD b1, 2 * SIZE(BO)
+
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b2, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ ldi BO, 4 * SIZE(BO)
+ MUL a2, b2, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD1 c03, t3, b5
+ fmov b5, c03
+ unop
+ MUL a3, b2, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD3 c04, t4, b5
+ fmov b5, c04
+ unop
+ MUL a4, b2, t4
+ LD a5, 3 * SIZE(AO)
+
+ ADD4 c05, t1, b5
+ fmov b5, c05
+ unop
+ MUL a1, b3, t1
+ LD b2, -1 * SIZE(BO)
+
+ ADD2 c06, t2, b5
+ fmov b5, c06
+ unop
+ MUL a2, b3, t2
+ unop
+
+ ADD4 c07, t3, b5
+ fmov b5, c07
+ unop
+ MUL a3, b3, t3
+ ldi AO, 8 * SIZE(AO)
+
+ ADD2 c08, t4, b5
+ fmov b5, c08
+ unop
+ MUL a5, b3, t4
+ LD b3, 0 * SIZE(BO)
+
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b4, t1
+ LD a1, -4 * SIZE(AO)
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b4, t2
+ LD a2, -3 * SIZE(AO)
+
+ ADD1 c03, t3, b5
+ fmov b5, c03
+ LD a4, -1 * SIZE(AO)
+ MUL a3, b4, t3
+ LD a3, -2 * SIZE(AO)
+
+ ADD3 c04, t4, b5
+ fmov b5, c04
+ MUL a5, b4, t4
+ LD b4, 1 * SIZE(BO)
+ bgt L, $L42
+ .align 4
+
+$L45:
+ ADD4 c05, t1, b5
+ fmov b5, c05
+ MUL b1, a1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L47
+#else
+ blbs TMP1, $L47
+#endif
+ .align 4
+
+ ADD2 c06, t2, b5
+ fmov b5, c06
+ MUL a2, b1, t2
+ ADD4 c07, t3, b5
+ fmov b5, c07
+ MUL a3, b1, t3
+
+ ADD2 c08, t4, b5
+ fmov b5, c08
+ unop
+ MUL a4, b1, t4
+ LD b1, 0 * SIZE(BO)
+
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b2, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b2, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD1 c03, t3, b5
+ fmov b5, c03
+ unop
+ MUL a3, b2, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD3 c04, t4, b5
+ fmov b5, c04
+ MUL a4, b2, t4
+ LD a4, 3 * SIZE(AO)
+ ldi AO, 4 * SIZE(AO)
+
+ ADD4 c05, t1, b5
+ fmov b5, c05
+ LD b2, 1 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi BO, 2 * SIZE(BO)
+ .align 4
+
+$L47:
+ ADD2 c06, t2, b5
+ fmov b5, c06
+ MUL a2, b1, t2
+ ADD4 c07, t3, b5
+ fmov b5, c07
+ MUL a3, b1, t3
+
+ ADD2 c08, t4, b5
+ fmov b5, c08
+ MUL a4, b1, t4
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ MUL a1, b2, t1
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ MUL a2, b2, t2
+ ADD1 c03, t3, b5
+ fmov b5, c03
+ MUL a3, b2, t3
+
+ ADD3 c04, t4, b5
+ fmov b5, c04
+ ldi AO, 4 * SIZE(AO)
+ MUL a4, b2, t4
+ ldi BO, 2 * SIZE(BO)
+
+ ADD4 c05, t1, b5
+ fmov b5, c05
+ ADD2 c06, t2, b5
+ fmov b5, c06
+ ADD4 c07, t3, b5
+ fmov b5, c07
+ ADD2 c08, t4, b5
+ fmov b5, c08
+
+ ADD c01, c06, b5
+ fmov b5, c01
+ ADD c02, c05, b5
+ fmov b5, c02
+ ADD c03, c08, b5
+ fmov b5, c03
+ ADD c04, c07, b5
+ fmov b5, c04
+
+$L48:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 2, TMP1
+#else
+ subl KK, 1, TMP1
+#endif
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, ZBASE_SHIFT, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -4 * SIZE(AO)
+ ldi BO, -2 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+ SUB a3, c03, b5
+ fmov b5, c03
+ SUB a4, c04, b5
+ fmov b5, c04
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+ SUB a3, c03, b5
+ fmov b5, c03
+ SUB a4, c04, b5
+ fmov b5, c04
+#endif
+
+#ifdef LN
+ LD a1, 6 * SIZE(AO)
+ LD a2, 7 * SIZE(AO)
+ LD a3, 4 * SIZE(AO)
+ LD a4, 5 * SIZE(AO)
+
+ MUL a2, c04, t1
+ MUL a2, c03, t2
+ MUL a1, c03, b5
+ fmov b5, c03
+ MUL a1, c04, b5
+ fmov b5, c04
+
+ ADD5 c03, t1, b5
+ fmov b5, c03
+ ADD6 c04, t2, b5
+ fmov b5, c04
+ MUL a3, c03, t1
+ MUL a3, c04, t2
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c02, t2, b5
+ fmov b5, c02
+ MUL a4, c04, t1
+ MUL a4, c03, t2
+
+ ADD6 c01, t1, b5
+ fmov b5, c01
+ ADD5 c02, t2, b5
+ fmov b5, c02
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+
+ ADD5 c01, t1, b5
+ fmov b5, c01
+ ADD6 c02, t2, b5
+ fmov b5, c02
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+
+ ADD5 c01, t1, b5
+ fmov b5, c01
+ ADD6 c02, t2, b5
+ fmov b5, c02
+ MUL a3, c01, t1
+ MUL a3, c02, t2
+
+ SUB c03, t1, b5
+ fmov b5, c03
+ SUB c04, t2, b5
+ fmov b5, c04
+
+ MUL a4, c02, t1
+ MUL a4, c01, t2
+ ADD6 c03, t1, b5
+ fmov b5, c03
+ ADD5 c04, t2, b5
+ fmov b5, c04
+
+ LD a1, 6 * SIZE(AO)
+ LD a2, 7 * SIZE(AO)
+
+ MUL a2, c04, t1
+ MUL a2, c03, t2
+ MUL a1, c03, b5
+ fmov b5, c03
+ MUL a1, c04, b5
+ fmov b5, c04
+
+ ADD5 c03, t1, b5
+ fmov b5, c03
+ ADD6 c04, t2, b5
+ fmov b5, c04
+#endif
+
+#if defined(RN) || defined(RT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a2, c04, t3
+ MUL a2, c03, t4
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a1, c03, b5
+ fmov b5, c03
+ MUL a1, c04, b5
+ fmov b5, c04
+
+ ADD5 c01, t1, b5
+ fmov b5, c01
+ ADD6 c02, t2, b5
+ fmov b5, c02
+ ADD5 c03, t3, b5
+ fmov b5, c03
+ ADD6 c04, t4, b5
+ fmov b5, c04
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c02, 1 * SIZE(BO)
+ ST c03, 2 * SIZE(BO)
+ ST c04, 3 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c03, 2 * SIZE(AO)
+ ST c04, 3 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -4 * SIZE(C1)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c03, 2 * SIZE(C1)
+ ST c04, 3 * SIZE(C1)
+
+#ifndef LN
+ ldi C1, 4 * SIZE(C1)
+#endif
+
+#ifdef RT
+ sll K, ZBASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, ZBASE_SHIFT, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 2, KK
+#endif
+
+#ifdef LN
+ subl KK, 2, KK
+#endif
+
+ ldi I, -1(I)
+ bgt I, $L41
+ .align 4
+
+$L50:
+ and M, 1, I
+ ble I, $L59
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c05
+
+ LD b3, 2 * SIZE(B)
+ fclr c02
+ LD b4, 3 * SIZE(B)
+ fclr c06
+
+ ldi AO, 2 * SIZE(AO)
+ ldi BO, 2 * SIZE(B)
+
+ ldi L, -2(KK)
+
+ ble KK, $L58
+ ble L, $L55
+#else
+#ifdef LN
+ sll K, ZBASE_SHIFT, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, ZBASE_SHIFT, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, ZBASE_SHIFT, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c05
+
+ LD b3, 2 * SIZE(BO)
+ fclr c02
+ LD b4, 3 * SIZE(BO)
+ fclr c06
+
+ ldi AO, 2 * SIZE(AO)
+ ldi BO, 2 * SIZE(BO)
+
+ ldi L, -2(TMP1)
+
+ ble TMP1, $L58
+ ble L, $L55
+#endif
+ .align 5
+
+$L52:
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b1, t1
+ unop
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ ldi AO, 4 * SIZE(AO)
+ MUL a2, b1, t2
+ LD b1, 2 * SIZE(BO)
+
+ ADD4 c05, t3, b5
+ fmov b5, c05
+ ldi L, -2(L)
+ MUL a1, b2, t3
+ LD a1, -2 * SIZE(AO)
+
+ ADD2 c06, t4, b5
+ fmov b5, c06
+ unop
+ MUL a2, b2, t4
+ LD a2, -1 * SIZE(AO)
+
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ LD b2, 3 * SIZE(BO)
+ MUL a3, b3, t1
+ ldi BO, 4 * SIZE(BO)
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a4, b3, t2
+ LD b3, 0 * SIZE(BO)
+
+ ADD4 c05, t3, b5
+ fmov b5, c05
+ unop
+ MUL a3, b4, t3
+ LD a3, 0 * SIZE(AO)
+
+ ADD2 c06, t4, b5
+ fmov b5, c06
+ MUL a4, b4, t4
+ LD b4, 1 * SIZE(BO)
+ unop
+
+ LD a4, 1 * SIZE(AO)
+ unop
+ unop
+ bgt L, $L52
+ .align 4
+
+$L55:
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ MUL a1, b1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L57
+#else
+ blbs TMP1, $L57
+#endif
+ .align 4
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD4 c05, t3, b5
+ fmov b5, c05
+ ldi BO, 2 * SIZE(BO)
+ MUL a1, b2, t3
+ LD a1, 0 * SIZE(AO)
+
+ ADD2 c06, t4, b5
+ fmov b5, c06
+ unop
+ MUL a2, b2, t4
+ LD a2, 1 * SIZE(AO)
+
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ LD b2, -1 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi AO, 2 * SIZE(AO)
+ .align 4
+
+$L57:
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ MUL a2, b1, t2
+ ADD4 c05, t3, b5
+ fmov b5, c05
+ MUL a1, b2, t3
+
+ ADD2 c06, t4, b5
+ fmov b5, c06
+ ldi AO, 2 * SIZE(AO)
+ MUL a2, b2, t4
+ ldi BO, 2 * SIZE(BO)
+
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ ADD4 c05, t3, b5
+ fmov b5, c05
+ ADD2 c06, t4, b5
+ fmov b5, c06
+
+ ADD c01, c06, b5
+ fmov b5, c01
+ ADD c02, c05, b5
+ fmov b5, c02
+
+$L58:
+#if defined(LN) || defined(RT)
+ subl KK, 1, TMP1
+
+ sll TMP1, ZBASE_SHIFT, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, ZBASE_SHIFT, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -2 * SIZE(AO)
+ ldi BO, -2 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+
+ ADD5 c01, t1, b5
+ fmov b5, c01
+ ADD6 c02, t2, b5
+ fmov b5, c02
+#endif
+
+#if defined(RN) || defined(RT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+
+ ADD5 c01, t1, b5
+ fmov b5, c01
+ ADD6 c02, t2, b5
+ fmov b5, c02
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c02, 1 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -2 * SIZE(C1)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+
+#ifndef LN
+ ldi C1, 2 * SIZE(C1)
+#endif
+
+#ifdef RT
+ sll K, ZBASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, ZBASE_SHIFT, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, ZBASE_SHIFT, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 1, KK
+#endif
+
+#ifdef LN
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L59:
+#ifdef LN
+ sll K, ZBASE_SHIFT, TMP1
+ addl B, TMP1, B
+#endif
+
+#if defined(LT) || defined(RN)
+ mov BO, B
+#endif
+
+#ifdef RN
+ addl KK, 1, KK
+#endif
+
+#ifdef RT
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L30:
+ sra N, 1, J
+ ble J, $L999
+ .align 4
+
+$L01:
+#ifdef RT
+ sll K, ZBASE_SHIFT + 1, TMP1
+ subl B, TMP1, B
+
+ subl C, LDC, C2
+ subl C2, LDC, C1
+ subl C2, LDC, C
+#else
+ mov C, C1
+ addl C, LDC, C2
+ addl C2, LDC, C
+#endif
+
+#ifdef LN
+ addl M, OFFSET, KK
+#endif
+
+#ifdef LT
+ mov OFFSET, KK
+#endif
+
+#if defined(LN) || defined(RT)
+ mov A, AORIG
+#else
+ mov A, AO
+#endif
+
+ sra M, 1, I
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+ fclr c01
+ fclr c05
+
+ ble I, $L20
+ .align 4
+
+$L11:
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(B)
+ fclr c10
+ LD b2, 1 * SIZE(B)
+ fclr c14
+
+ LD b3, 2 * SIZE(B)
+ fclr c03
+ LD b4, 3 * SIZE(B)
+ fclr c07
+
+ ldi BO, 4 * SIZE(B)
+ fclr c11
+ ldi AO, 4 * SIZE(AO)
+ fclr c15
+
+ fillcs 4 * SIZE(C1)
+ fclr c04
+ ldi L, -2(KK)
+ fclr c08
+
+ fillcs 4 * SIZE(C2)
+ fclr c12
+ fclr c16
+ ble KK, $L18
+ ble L, $L15
+#else
+#ifdef LN
+ sll K, ZBASE_SHIFT + 1, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, ZBASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AO
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(BO)
+ fclr c10
+ LD b2, 1 * SIZE(BO)
+ fclr c14
+
+ LD b3, 2 * SIZE(BO)
+ fclr c03
+ LD b4, 3 * SIZE(BO)
+ fclr c07
+
+ ldi BO, 4 * SIZE(BO)
+ fclr c11
+ ldi AO, 4 * SIZE(AO)
+ fclr c15
+
+ fillcs 4 * SIZE(C1)
+ fclr c04
+ ldi L, -2(TMP1)
+ fclr c08
+
+ fillcs 4 * SIZE(C2)
+ fclr c12
+ fclr c16
+ ble TMP1, $L18
+ ble L, $L15
+#endif
+ .align 5
+
+$L12:
+/* 1 */
+ ADD1 c11, t1, b5
+ fmov b5, c11
+#ifndef EV4
+ fillcs PREFETCHSIZE * SIZE(AO)
+#else
+ unop
+#endif
+ MUL b1, a1, t1
+#ifndef EV4
+ fillcs PREFETCHSIZE * SIZE(BO)
+#else
+ unop
+#endif
+
+ ADD3 c12, t2, b5
+ fmov b5, c12
+ unop
+ MUL b1, a2, t2
+ unop
+
+ ADD2 c16, t3, b5
+ fmov b5, c16
+ unop
+ MUL b2, a2, t3
+ LD a5, 0 * SIZE(AO)
+
+ ADD4 c15, t4, b5
+ fmov b5, c15
+ unop
+ MUL b2, a1, t4
+ LD b5, 0 * SIZE(BO)
+ FIMOVD b5, tmp
+/* 2 */
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ UNOP
+ MUL b1, a3, t1
+ UNOP
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ UNOP
+ MUL b1, a4, t2
+ UNOP
+
+ ADD2 c06, t3, b5
+ fmov b5, c06
+ unop
+ MUL b2, a4, t3
+ unop
+
+ ADD4 c05, t4, b5
+ fmov b5, c05
+ unop
+ MUL b4, a1, t4
+ unop
+
+/* 3 */
+ ADD1 c03, t1, b5
+ fmov b5, c03
+ unop
+ MUL b3, a1, t1
+ unop
+
+ ADD3 c04, t2, b5
+ fmov b5, c04
+ unop
+ MUL b3, a2, t2
+ unop
+
+ ADD2 c08, t3, b5
+ fmov b5, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, 1 * SIZE(AO)
+
+ ADD4 c13, t4, b5
+ fmov b5, c13
+ unop
+ MUL b2, a3, t4
+ LD b2, 1 * SIZE(BO)
+
+/* 4 */
+ ADD1 c09, t1, b5
+ fmov b5, c09
+ unop
+ MUL b3, a3, t1
+ LD a6, 2 * SIZE(AO)
+
+ ADD3 c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD2 c14, t3, b5
+ fmov b5, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, 3 * SIZE(AO)
+
+ ADD4 c07, t4, b5
+ fmov b5, c07
+ unop
+ MUL b4, a3, t4
+ LD b4, 3 * SIZE(BO)
+
+/* 5 */
+ ADD1 c11, t1, b5
+ fmov b5, c11
+ unop
+ IFMOVD tmp, b5
+ MUL b5, a5, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD3 c12, t2, b5
+ fmov b5, c12
+ ldi L, -2(L)
+ IFMOVD tmp, b5
+ MUL b5, a2, t2
+ LD b1, 4 * SIZE(BO)
+
+ ADD2 c16, t3, b5
+ fmov b5, c16
+ unop
+ MUL b2, a2, t3
+ unop
+
+ ADD4 c15, t4, b5
+ fmov b5, c15
+ unop
+ MUL b2, a5, t4
+ unop
+
+/* 6 */
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ unop
+ IFMOVD tmp, b5
+ MUL b5, a6, t1
+ unop
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ unop
+ IFMOVD tmp, b5
+ MUL b5, a4, t2
+ unop
+
+ ADD2 c06, t3, b5
+ fmov b5, c06
+ unop
+ MUL b2, a4, t3
+ unop
+
+ ADD4 c05, t4, b5
+ fmov b5, c05
+ unop
+ MUL b4, a5, t4
+ unop
+
+/* 7 */
+ ADD1 c03, t1, b5
+ fmov b5, c03
+ ldi AO, 8 * SIZE(AO)
+ MUL b3, a5, t1
+ unop
+
+ ADD3 c04, t2, b5
+ fmov b5, c04
+ ldi BO, 8 * SIZE(BO)
+ MUL b3, a2, t2
+ unop
+
+ ADD2 c08, t3, b5
+ fmov b5, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, -3 * SIZE(AO)
+
+ ADD4 c13, t4, b5
+ fmov b5, c13
+ unop
+ MUL b2, a6, t4
+ LD b2, -3 * SIZE(BO)
+
+/* 8 */
+ ADD1 c09, t1, b5
+ fmov b5, c09
+ unop
+ MUL b3, a6, t1
+ LD a3, -2 * SIZE(AO)
+
+ ADD3 c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, -2 * SIZE(BO)
+
+ ADD2 c14, t3, b5
+ fmov b5, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, -1 * SIZE(AO)
+
+ ADD4 c07, t4, b5
+ fmov b5, c07
+ MUL b4, a6, t4
+ LD b4, -1 * SIZE(BO)
+ bgt L, $L12
+ .align 4
+
+$L15:
+ ADD1 c11, t1, b5
+ fmov b5, c11
+ unop
+ MUL b1, a1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L17
+#else
+ blbs TMP1, $L17
+#endif
+ .align 4
+
+ ADD3 c12, t2, b5
+ fmov b5, c12
+ MUL b1, a2, t2
+ ADD2 c16, t3, b5
+ fmov b5, c16
+ MUL b2, a2, t3
+
+ ADD4 c15, t4, b5
+ fmov b5, c15
+ MUL b2, a1, t4
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ MUL b1, a3, t1
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL b1, a4, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD2 c06, t3, b5
+ fmov b5, c06
+ MUL b2, a4, t3
+ ADD4 c05, t4, b5
+ fmov b5, c05
+ MUL b4, a1, t4
+
+ ADD1 c03, t1, b5
+ fmov b5, c03
+ unop
+ MUL b3, a1, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD3 c04, t2, b5
+ fmov b5, c04
+ unop
+ MUL b3, a2, t2
+ unop
+
+ ADD2 c08, t3, b5
+ fmov b5, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, 1 * SIZE(AO)
+
+ ADD4 c13, t4, b5
+ fmov b5, c13
+ unop
+ MUL b2, a3, t4
+ LD b2, 1 * SIZE(BO)
+
+ ADD1 c09, t1, b5
+ fmov b5, c09
+ unop
+ MUL b3, a3, t1
+ ldi AO, 4 * SIZE(AO)
+
+ ADD3 c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD2 c14, t3, b5
+ fmov b5, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, -1 * SIZE(AO)
+
+ ADD4 c07, t4, b5
+ fmov b5, c07
+ unop
+ MUL b4, a3, t4
+ LD a3, -2 * SIZE(AO)
+
+ ADD1 c11, t1, b5
+ fmov b5, c11
+ LD b4, 3 * SIZE(BO)
+ MUL b1, a1, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L17:
+ ADD3 c12, t2, b5
+ fmov b5, c12
+ MUL b1, a2, t2
+ ADD2 c16, t3, b5
+ fmov b5, c16
+ MUL b2, a2, t3
+
+ ADD4 c15, t4, b5
+ fmov b5, c15
+ MUL b2, a1, t4
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ MUL b1, a3, t1
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ MUL b1, a4, t2
+ ADD2 c06, t3, b5
+ fmov b5, c06
+ MUL b2, a4, t3
+
+ ADD4 c05, t4, b5
+ fmov b5, c05
+ MUL b4, a1, t4
+ ADD1 c03, t1, b5
+ fmov b5, c03
+ MUL b3, a1, t1
+
+ ADD3 c04, t2, b5
+ fmov b5, c04
+ MUL b3, a2, t2
+ ADD2 c08, t3, b5
+ fmov b5, c08
+ MUL b4, a2, t3
+
+ ADD4 c13, t4, b5
+ fmov b5, c13
+ MUL b2, a3, t4
+ ADD1 c09, t1, b5
+ fmov b5, c09
+ MUL b3, a3, t1
+
+ ADD3 c10, t2, b5
+ fmov b5, c10
+ MUL b3, a4, t2
+ ADD2 c14, t3, b5
+ fmov b5, c14
+ MUL b4, a4, t3
+
+ ADD4 c07, t4, b5
+ fmov b5, c07
+ ldi AO, 4 * SIZE(AO)
+ MUL b4, a3, t4
+ ldi BO, 4 * SIZE(BO)
+
+ ADD1 c11, t1, b5
+ fmov b5, c11
+ ADD3 c12, t2, b5
+ fmov b5, c12
+ ADD2 c16, t3, b5
+ fmov b5, c16
+ ADD4 c15, t4, b5
+ fmov b5, c15
+
+ ADD c01, c06, b5
+ fmov b5, c01
+ ADD c02, c05, b5
+ fmov b5, c02
+ ADD c03, c08, b5
+ fmov b5, c03
+ ADD c04, c07, b5
+ fmov b5, c04
+
+ ADD c09, c14, b5
+ fmov b5, c09
+ ADD c10, c13, b5
+ fmov b5, c10
+ ADD c11, c16, b5
+ fmov b5, c11
+ ADD c12, c15, b5
+ fmov b5, c12
+ .align 4
+
+$L18:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 2, TMP1
+#else
+ subl KK, 2, TMP1
+#endif
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -4 * SIZE(AO)
+ ldi BO, -4 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ LD b1, 4 * SIZE(BO)
+ LD b2, 5 * SIZE(BO)
+ LD b3, 6 * SIZE(BO)
+ LD b4, 7 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+ SUB a3, c09, b5
+ fmov b5, c09
+ SUB a4, c10, b5
+ fmov b5, c10
+
+ SUB b1, c03, b5
+ fmov b5, c03
+ SUB b2, c04, b5
+ fmov b5, c04
+ SUB b3, c11, b5
+ fmov b5, c11
+ SUB b4, c12, b5
+ fmov b5, c12
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 4 * SIZE(AO)
+ LD b2, 5 * SIZE(AO)
+ LD b3, 6 * SIZE(AO)
+ LD b4, 7 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+ SUB a3, c03, b5
+ fmov b5, c03
+ SUB a4, c04, b5
+ fmov b5, c04
+
+ SUB b1, c09, b5
+ fmov b5, c09
+ SUB b2, c10, b5
+ fmov b5, c10
+ SUB b3, c11, b5
+ fmov b5, c11
+ SUB b4, c12, b5
+ fmov b5, c12
+#endif
+
+#ifdef LN
+ LD a1, 6 * SIZE(AO)
+ LD a2, 7 * SIZE(AO)
+ LD a3, 4 * SIZE(AO)
+ LD a4, 5 * SIZE(AO)
+
+ MUL a2, c04, t1
+ MUL a2, c03, t2
+ MUL a2, c12, t3
+ MUL a2, c11, t4
+
+ MUL a1, c03, b5
+ fmov b5, c03
+ MUL a1, c04, b5
+ fmov b5, c04
+ MUL a1, c11, b5
+ fmov b5, c11
+ MUL a1, c12, b5
+ fmov b5, c12
+
+ ADD5 c03, t1, b5
+ fmov b5, c03
+ ADD6 c04, t2, b5
+ fmov b5, c04
+ ADD5 c11, t3, b5
+ fmov b5, c11
+ ADD6 c12, t4, b5
+ fmov b5, c12
+
+ MUL a3, c03, t1
+ MUL a3, c04, t2
+ MUL a3, c11, t3
+ MUL a3, c12, t4
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c02, t2, b5
+ fmov b5, c02
+ SUB c09, t3, b5
+ fmov b5, c09
+ SUB c10, t4, b5
+ fmov b5, c10
+
+ MUL a4, c04, t1
+ MUL a4, c03, t2
+ MUL a4, c12, t3
+ MUL a4, c11, t4
+
+ ADD6 c01, t1, b5
+ fmov b5, c01
+ ADD5 c02, t2, b5
+ fmov b5, c02
+ ADD6 c09, t3, b5
+ fmov b5, c09
+ ADD5 c10, t4, b5
+ fmov b5, c10
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a2, c10, t3
+ MUL a2, c09, t4
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a1, c09, b5
+ fmov b5, c09
+ MUL a1, c10, b5
+ fmov b5, c10
+
+ ADD5 c01, t1, b5
+ fmov b5, c01
+ ADD6 c02, t2, b5
+ fmov b5, c02
+ ADD5 c09, t3, b5
+ fmov b5, c09
+ ADD6 c10, t4, b5
+ fmov b5, c10
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a2, c10, t3
+ MUL a2, c09, t4
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a1, c09, b5
+ fmov b5, c09
+ MUL a1, c10, b5
+ fmov b5, c10
+
+ ADD5 c01, t1, b5
+ fmov b5, c01
+ ADD6 c02, t2, b5
+ fmov b5, c02
+ ADD5 c09, t3, b5
+ fmov b5, c09
+ ADD6 c10, t4, b5
+ fmov b5, c10
+
+ MUL a3, c01, t1
+ MUL a3, c02, t2
+ MUL a3, c09, t3
+ MUL a3, c10, t4
+
+ SUB c03, t1, b5
+ fmov b5, c03
+ SUB c04, t2, b5
+ fmov b5, c04
+ SUB c11, t3, b5
+ fmov b5, c11
+ SUB c12, t4, b5
+ fmov b5, c12
+
+ MUL a4, c02, t1
+ MUL a4, c01, t2
+ MUL a4, c10, t3
+ MUL a4, c09, t4
+
+ ADD6 c03, t1, b5
+ fmov b5, c03
+ ADD5 c04, t2, b5
+ fmov b5, c04
+ ADD6 c11, t3, b5
+ fmov b5, c11
+ ADD5 c12, t4, b5
+ fmov b5, c12
+
+ LD a1, 6 * SIZE(AO)
+ LD a2, 7 * SIZE(AO)
+
+ MUL a2, c04, t1
+ MUL a2, c03, t2
+ MUL a2, c12, t3
+ MUL a2, c11, t4
+
+ MUL a1, c03, b5
+ fmov b5, c03
+ MUL a1, c04, b5
+ fmov b5, c04
+ MUL a1, c11, b5
+ fmov b5, c11
+ MUL a1, c12, b5
+ fmov b5, c12
+
+ ADD5 c03, t1, b5
+ fmov b5, c03
+ ADD6 c04, t2, b5
+ fmov b5, c04
+ ADD5 c11, t3, b5
+ fmov b5, c11
+ ADD6 c12, t4, b5
+ fmov b5, c12
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a2, c04, t3
+ MUL a2, c03, t4
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a1, c03, b5
+ fmov b5, c03
+ MUL a1, c04, b5
+ fmov b5, c04
+
+ ADD5 c01, t1, b5
+ fmov b5, c01
+ ADD6 c02, t2, b5
+ fmov b5, c02
+ ADD5 c03, t3, b5
+ fmov b5, c03
+ ADD6 c04, t4, b5
+ fmov b5, c04
+
+ MUL a3, c01, t1
+ MUL a3, c02, t2
+ MUL a3, c03, t3
+ MUL a3, c04, t4
+
+ SUB c09, t1, b5
+ fmov b5, c09
+ SUB c10, t2, b5
+ fmov b5, c10
+ SUB c11, t3, b5
+ fmov b5, c11
+ SUB c12, t4, b5
+ fmov b5, c12
+
+ MUL a4, c02, t1
+ MUL a4, c01, t2
+ MUL a4, c04, t3
+ MUL a4, c03, t4
+
+ ADD6 c09, t1, b5
+ fmov b5, c09
+ ADD5 c10, t2, b5
+ fmov b5, c10
+ ADD6 c11, t3, b5
+ fmov b5, c11
+ ADD5 c12, t4, b5
+ fmov b5, c12
+
+ LD a1, 6 * SIZE(BO)
+ LD a2, 7 * SIZE(BO)
+
+ MUL a2, c10, t1
+ MUL a2, c09, t2
+ MUL a2, c12, t3
+ MUL a2, c11, t4
+
+ MUL a1, c09, b5
+ fmov b5, c09
+ MUL a1, c10, b5
+ fmov b5, c10
+ MUL a1, c11, b5
+ fmov b5, c11
+ MUL a1, c12, b5
+ fmov b5, c12
+
+ ADD5 c09, t1, b5
+ fmov b5, c09
+ ADD6 c10, t2, b5
+ fmov b5, c10
+ ADD5 c11, t3, b5
+ fmov b5, c11
+ ADD6 c12, t4, b5
+ fmov b5, c12
+#endif
+
+#ifdef RT
+ LD a1, 6 * SIZE(BO)
+ LD a2, 7 * SIZE(BO)
+ LD a3, 4 * SIZE(BO)
+ LD a4, 5 * SIZE(BO)
+
+ MUL a2, c10, t1
+ MUL a2, c09, t2
+ MUL a2, c12, t3
+ MUL a2, c11, t4
+
+ MUL a1, c09, b5
+ fmov b5, c09
+ MUL a1, c10, b5
+ fmov b5, c10
+ MUL a1, c11, b5
+ fmov b5, c11
+ MUL a1, c12, b5
+ fmov b5, c12
+
+ ADD5 c09, t1, b5
+ fmov b5, c09
+ ADD6 c10, t2, b5
+ fmov b5, c10
+ ADD5 c11, t3, b5
+ fmov b5, c11
+ ADD6 c12, t4, b5
+ fmov b5, c12
+
+ MUL a3, c09, t1
+ MUL a3, c10, t2
+ MUL a3, c11, t3
+ MUL a3, c12, t4
+
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c02, t2, b5
+ fmov b5, c02
+ SUB c03, t3, b5
+ fmov b5, c03
+ SUB c04, t4, b5
+ fmov b5, c04
+
+ MUL a4, c10, t1
+ MUL a4, c09, t2
+ MUL a4, c12, t3
+ MUL a4, c11, t4
+
+ ADD6 c01, t1, b5
+ fmov b5, c01
+ ADD5 c02, t2, b5
+ fmov b5, c02
+ ADD6 c03, t3, b5
+ fmov b5, c03
+ ADD5 c04, t4, b5
+ fmov b5, c04
+
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a2, c04, t3
+ MUL a2, c03, t4
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a1, c03, b5
+ fmov b5, c03
+ MUL a1, c04, b5
+ fmov b5, c04
+
+ ADD5 c01, t1, b5
+ fmov b5, c01
+ ADD6 c02, t2, b5
+ fmov b5, c02
+ ADD5 c03, t3, b5
+ fmov b5, c03
+ ADD6 c04, t4, b5
+ fmov b5, c04
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c02, 1 * SIZE(BO)
+ ST c09, 2 * SIZE(BO)
+ ST c10, 3 * SIZE(BO)
+
+ ST c03, 4 * SIZE(BO)
+ ST c04, 5 * SIZE(BO)
+ ST c11, 6 * SIZE(BO)
+ ST c12, 7 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c03, 2 * SIZE(AO)
+ ST c04, 3 * SIZE(AO)
+
+ ST c09, 4 * SIZE(AO)
+ ST c10, 5 * SIZE(AO)
+ ST c11, 6 * SIZE(AO)
+ ST c12, 7 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -4 * SIZE(C1)
+ ldi C2, -4 * SIZE(C2)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c03, 2 * SIZE(C1)
+ ST c04, 3 * SIZE(C1)
+
+ ST c09, 0 * SIZE(C2)
+ ST c10, 1 * SIZE(C2)
+ ST c11, 2 * SIZE(C2)
+ ST c12, 3 * SIZE(C2)
+
+#ifndef LN
+ ldi C1, 4 * SIZE(C1)
+ ldi C2, 4 * SIZE(C2)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, ZBASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, ZBASE_SHIFT + 1, TMP1
+ addl AO, TMP1, AO
+ addl BO, TMP1, BO
+#endif
+
+#ifdef LT
+ addl KK, 2, KK
+#endif
+
+#ifdef LN
+ subl KK, 2, KK
+#endif
+ fclr c01
+ fclr c05
+
+ ldi I, -1(I)
+ bgt I, $L11
+ .align 4
+
+$L20:
+ and M, 1, I
+ ble I, $L29
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(B)
+ fclr c10
+ LD b2, 1 * SIZE(B)
+ fclr c14
+
+ LD b3, 2 * SIZE(B)
+ ldi AO, 2 * SIZE(AO)
+ LD b4, 3 * SIZE(B)
+ ldi BO, 4 * SIZE(B)
+
+ ldi L, -2(KK)
+
+ ble KK, $L28
+ ble L, $L25
+#else
+#ifdef LN
+ sll K, ZBASE_SHIFT + 0, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, ZBASE_SHIFT + 0, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, ZBASE_SHIFT + 1, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(BO)
+ fclr c10
+ LD b2, 1 * SIZE(BO)
+ fclr c14
+
+ LD b3, 2 * SIZE(BO)
+ ldi AO, 2 * SIZE(AO)
+ LD b4, 3 * SIZE(BO)
+ ldi BO, 4 * SIZE(BO)
+
+ ldi L, -2(TMP1)
+
+ ble TMP1, $L28
+ ble L, $L25
+#endif
+ .align 5
+
+$L22:
+ ADD1 c09, t1, b5
+ fmov b5, c09
+ unop
+ MUL a1, b1, t1
+ unop
+
+ ADD3 c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD4 c13, t3, b5
+ fmov b5, c13
+ unop
+ MUL a1, b2, t3
+ ldi BO, 8 * SIZE(BO)
+
+ ADD2 c14, t4, b5
+ fmov b5, c14
+ unop
+ MUL a2, b2, t4
+ LD b2, -7 * SIZE(BO)
+
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b3, t1
+ unop
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b3, t2
+ LD b3, -6 * SIZE(BO)
+
+ ADD4 c05, t3, b5
+ fmov b5, c05
+ unop
+ MUL a1, b4, t3
+ LD a1, 2 * SIZE(AO)
+
+ ADD2 c06, t4, b5
+ fmov b5, c06
+ MUL a2, b4, t4
+ LD b5, -5 * SIZE(BO)
+ FIMOVD b5, tmp
+
+ ADD1 c09, t1, b5
+ fmov b5, c09
+ unop
+ MUL a3, b1, t1
+ LD a2, 3 * SIZE(AO)
+
+ ADD3 c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL a4, b1, t2
+ LD b1, -4 * SIZE(BO)
+
+ ADD4 c13, t3, b5
+ fmov b5, c13
+ unop
+ MUL a3, b2, t3
+ ldi AO, 4 * SIZE(AO)
+
+ ADD2 c14, t4, b5
+ fmov b5, c14
+ MUL a4, b2, t4
+ LD b2, -3 * SIZE(BO)
+
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ ldi L, -2(L)
+ MUL a3, b3, t1
+ LD b4, -1 * SIZE(BO)
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a4, b3, t2
+ LD b3, -2 * SIZE(BO)
+
+ ADD4 c05, t3, b5
+ fmov b5, c05
+ unop
+ IFMOVD tmp, b5
+ MUL a3, b5, t3
+ LD a3, 0 * SIZE(AO)
+
+ ADD2 c06, t4, b5
+ fmov b5, c06
+ IFMOVD tmp, b5
+ MUL a4, b5, t4
+ LD a4, 1 * SIZE(AO)
+ bgt L, $L22
+ .align 4
+
+$L25:
+ ADD1 c09, t1, b5
+ fmov b5, c09
+ MUL a1, b1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L27
+#else
+ blbs TMP1, $L27
+#endif
+ .align 4
+
+ ADD3 c10, t2, b5
+ fmov b5, c10
+ unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD4 c13, t3, b5
+ fmov b5, c13
+ unop
+ MUL a1, b2, t3
+ unop
+
+ ADD2 c14, t4, b5
+ fmov b5, c14
+ unop
+ MUL a2, b2, t4
+ LD b2, 1 * SIZE(BO)
+
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ unop
+ MUL a1, b3, t1
+ ldi AO, 2 * SIZE(AO)
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ unop
+ MUL a2, b3, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD4 c05, t3, b5
+ fmov b5, c05
+ unop
+ MUL a1, b4, t3
+ LD a1, -2 * SIZE(AO)
+
+ ADD2 c06, t4, b5
+ fmov b5, c06
+ unop
+ MUL a2, b4, t4
+ LD a2, -1 * SIZE(AO)
+
+ ADD1 c09, t1, b5
+ fmov b5, c09
+ LD b4, 3 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L27:
+ ADD3 c10, t2, b5
+ fmov b5, c10
+ MUL a2, b1, t2
+ ADD4 c13, t3, b5
+ fmov b5, c13
+ MUL a1, b2, t3
+
+ ADD2 c14, t4, b5
+ fmov b5, c14
+ MUL a2, b2, t4
+ ADD1 c01, t1, b5
+ fmov b5, c01
+ MUL a1, b3, t1
+
+ ADD3 c02, t2, b5
+ fmov b5, c02
+ MUL a2, b3, t2
+ ADD4 c05, t3, b5
+ fmov b5, c05
+ MUL a1, b4, t3
+
+ ADD2 c06, t4, b5
+ fmov b5, c06
+ ldi AO, 2 * SIZE(AO)
+ MUL a2, b4, t4
+ ldi BO, 4 * SIZE(BO)
+
+ ADD1 c09, t1, b5
+ fmov b5, c09
+ ADD3 c10, t2, b5
+ fmov b5, c10
+ ADD4 c13, t3, b5
+ fmov b5, c13
+ ADD2 c14, t4, b5
+ fmov b5, c14
+
+ ADD c01, c06, b5
+ fmov b5, c01
+ ADD c02, c05, b5
+ fmov b5, c02
+ ADD c09, c14, b5
+ fmov b5, c09
+ ADD c10, c13, b5
+ fmov b5, c10
+ .align 4
+
+$L28:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 1, TMP1
+#else
+ subl KK, 2, TMP1
+#endif
+ sll TMP1, ZBASE_SHIFT + 0, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -2 * SIZE(AO)
+ ldi BO, -4 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+ SUB a3, c09, b5
+ fmov b5, c09
+ SUB a4, c10, b5
+ fmov b5, c10
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ SUB a1, c01, b5
+ fmov b5, c01
+ SUB a2, c02, b5
+ fmov b5, c02
+ SUB a3, c09, b5
+ fmov b5, c09
+ SUB a4, c10, b5
+ fmov b5, c10
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a2, c10, t3
+ MUL a2, c09, t4
+
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+ MUL a1, c09, b5
+ fmov b5, c09
+ MUL a1, c10, b5
+ fmov b5, c10
+
+ ADD5 c01, t1, b5
+ fmov b5, c01
+ ADD6 c02, t2, b5
+ fmov b5, c02
+ ADD5 c09, t3, b5
+ fmov b5, c09
+ ADD6 c10, t4, b5
+ fmov b5, c10
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+
+ ADD5 c01, t1, b5
+ fmov b5, c01
+ ADD6 c02, t2, b5
+ fmov b5, c02
+
+ MUL a3, c01, t1
+ MUL a3, c02, t2
+ SUB c09, t1, b5
+ fmov b5, c09
+ SUB c10, t2, b5
+ fmov b5, c10
+
+ MUL a4, c02, t1
+ MUL a4, c01, t2
+ ADD6 c09, t1, b5
+ fmov b5, c09
+ ADD5 c10, t2, b5
+ fmov b5, c10
+
+ LD a1, 6 * SIZE(BO)
+ LD a2, 7 * SIZE(BO)
+
+ MUL a2, c10, t1
+ MUL a2, c09, t2
+ MUL a1, c09, b5
+ fmov b5, c09
+ MUL a1, c10, b5
+ fmov b5, c10
+
+ ADD5 c09, t1, b5
+ fmov b5, c09
+ ADD6 c10, t2, b5
+ fmov b5, c10
+#endif
+
+#ifdef RT
+ LD a1, 6 * SIZE(BO)
+ LD a2, 7 * SIZE(BO)
+ LD a3, 4 * SIZE(BO)
+ LD a4, 5 * SIZE(BO)
+
+ MUL a2, c10, t1
+ MUL a2, c09, t2
+ MUL a1, c09, b5
+ fmov b5, c09
+ MUL a1, c10, b5
+ fmov b5, c10
+
+ ADD5 c09, t1, b5
+ fmov b5, c09
+ ADD6 c10, t2, b5
+ fmov b5, c10
+
+ MUL a3, c09, t1
+ MUL a3, c10, t2
+ SUB c01, t1, b5
+ fmov b5, c01
+ SUB c02, t2, b5
+ fmov b5, c02
+
+ MUL a4, c10, t1
+ MUL a4, c09, t2
+ ADD6 c01, t1, b5
+ fmov b5, c01
+ ADD5 c02, t2, b5
+ fmov b5, c02
+
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a1, c01, b5
+ fmov b5, c01
+ MUL a1, c02, b5
+ fmov b5, c02
+
+ ADD5 c01, t1, b5
+ fmov b5, c01
+ ADD6 c02, t2, b5
+ fmov b5, c02
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c02, 1 * SIZE(BO)
+ ST c09, 2 * SIZE(BO)
+ ST c10, 3 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c09, 2 * SIZE(AO)
+ ST c10, 3 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -2 * SIZE(C1)
+ ldi C2, -2 * SIZE(C2)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c09, 0 * SIZE(C2)
+ ST c10, 1 * SIZE(C2)
+
+#ifndef LN
+ ldi C1, 2 * SIZE(C1)
+ ldi C2, 2 * SIZE(C2)
+#endif
+
+#ifdef RT
+ sll K, ZBASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, ZBASE_SHIFT + 0, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 1, KK
+#endif
+
+#ifdef LN
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L29:
+#ifdef LN
+ sll K, ZBASE_SHIFT + 1, TMP1
+ addl B, TMP1, B
+#endif
+
+#if defined(LT) || defined(RN)
+ mov BO, B
+#endif
+
+#ifdef RN
+ addl KK, 2, KK
+#endif
+
+#ifdef RT
+ subl KK, 2, KK
+#endif
+
+ ldi J, -1(J)
+ bgt J, $L01
+ .align 4
+
+$L999:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+ ldl tmp, 72($sp)
+ clr $0
+ ldi $sp, STACKSIZE($sp)
+ ret
+ .ident VERSION
+ .end CNAME
diff --git a/kernel/sw_64/ztrsm_kernel_2x2_RT.S.bak b/kernel/sw_64/ztrsm_kernel_2x2_RT.S.bak
new file mode 100644
index 0000000..4d4f59d
--- /dev/null
+++ b/kernel/sw_64/ztrsm_kernel_2x2_RT.S.bak
@@ -0,0 +1,2223 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+
+#if !defined(SW2B)
+#error "Architecture is not specified."
+#endif
+
+#ifdef SW2B
+#define PREFETCHSIZE 56
+#define UNOP unop
+#endif
+
+
+ .set noat
+ .set noreorder
+ .arch ev6
+
+.text
+ .align 5
+ .globl CNAME
+ .ent CNAME
+
+#define STACKSIZE 80
+
+#define M $16
+#define N $17
+#define K $18
+#define A $21
+#define B $22
+#define C $20
+#define LDC $23
+
+#define C1 $19
+#define C2 $24
+
+#define AO $at
+#define BO $5
+#define I $6
+#define J $7
+#define L $8
+
+#define a1 $f16
+#define a2 $f17
+#define a3 $f18
+#define a4 $f19
+
+#define b1 $f20
+#define b2 $f21
+#define b3 $f22
+#define b4 $f23
+
+#define t1 $f24
+#define t2 $f25
+#define t3 $f26
+#define t4 $f27
+
+#define a5 $f28
+#define a6 $f30
+#define b5 $f29
+
+#define alpha_i $f29
+#define alpha_r $f30
+
+#define c01 $f0
+#define c02 $f1
+#define c03 $f2
+#define c04 $f3
+
+#define c05 $f4
+#define c06 $f5
+#define c07 $f6
+#define c08 $f7
+
+#define c09 $f8
+#define c10 $f9
+#define c11 $f10
+#define c12 $f11
+
+#define c13 $f12
+#define c14 $f13
+#define c15 $f14
+#define c16 $f15
+
+#define TMP1 $0
+#define TMP2 $1
+#define KK $2
+#define AORIG $3
+#define OFFSET $4
+
+#if defined(LN) || defined(LT)
+#ifndef CONJ
+#define ADD1 ADD
+#define ADD2 SUB
+#define ADD3 ADD
+#define ADD4 ADD
+#define ADD5 SUB
+#define ADD6 ADD
+#else
+#define ADD1 ADD
+#define ADD2 ADD
+#define ADD3 SUB
+#define ADD4 ADD
+#define ADD5 ADD
+#define ADD6 SUB
+#endif
+#else
+#ifndef CONJ
+#define ADD1 ADD
+#define ADD2 SUB
+#define ADD3 ADD
+#define ADD4 ADD
+#define ADD5 SUB
+#define ADD6 ADD
+#else
+#define ADD1 ADD
+#define ADD2 ADD
+#define ADD3 ADD
+#define ADD4 SUB
+#define ADD5 ADD
+#define ADD6 SUB
+#endif
+#endif
+
+
+CNAME:
+ .frame $sp, STACKSIZE, $26, 0
+
+#ifdef PROFILE
+ ldgp $gp, 0($27)
+ ldi $at, _mcount
+ jsr $at, ($at), _mcount
+#endif
+
+#ifndef PROFILE
+ .prologue 0
+#else
+ .prologue 1
+#endif
+
+ ldi $sp, -STACKSIZE($sp)
+
+ ldl B, 0 + STACKSIZE($sp)
+ ldl C, 8 + STACKSIZE($sp)
+ ldl LDC, 16 + STACKSIZE($sp)
+ ldl OFFSET, 24 + STACKSIZE($sp)
+
+ sll LDC, ZBASE_SHIFT, LDC
+
+ fstd $f2, 0($sp)
+ fstd $f3, 8($sp)
+ fstd $f4, 16($sp)
+ fstd $f5, 24($sp)
+ fstd $f6, 32($sp)
+ fstd $f7, 40($sp)
+ fstd $f8, 48($sp)
+ fstd $f9, 56($sp)
+
+ cmple M, 0, $0
+ cmple N, 0, $1
+ cmple K, 0, $2
+
+ or $0, $1, $0
+ or $0, $2, $0
+ bne $0, $L999
+
+#ifdef LN
+ addl M, M, TMP2
+ mull TMP2, K, TMP1
+ SXADDQ TMP1, A, A
+ SXADDQ TMP2, C, C
+#endif
+
+#ifdef RN
+ negl OFFSET, KK
+#endif
+
+#ifdef RT
+ mull N, K, TMP1
+ addl TMP1, TMP1, TMP1
+ SXADDQ TMP1, B, B
+
+ mull N, LDC, TMP1
+ addl TMP1, C, C
+
+ subl N, OFFSET, KK
+#endif
+
+ and N, 1, J
+ ble J, $L30
+
+#ifdef RT
+ sll K, ZBASE_SHIFT, TMP1
+ subl B, TMP1, B
+
+ subl C, LDC, C1
+ subl C, LDC, C
+#else
+ mov C, C1
+ addl C, LDC, C
+#endif
+
+#ifdef LN
+ addl M, OFFSET, KK
+#endif
+
+#ifdef LT
+ mov OFFSET, KK
+#endif
+
+#if defined(LN) || defined(RT)
+ mov A, AORIG
+#else
+ mov A, AO
+#endif
+
+ sra M, 1, I
+ ble I, $L50
+ .align 4
+
+$L41:
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c05
+ LD b3, 2 * SIZE(B)
+ fclr c02
+ LD b4, 3 * SIZE(B)
+ fclr c06
+
+ ldi BO, 2 * SIZE(B)
+ fclr c03
+ ldi AO, 4 * SIZE(AO)
+ fclr c07
+
+ ldi L, -2(KK)
+ fclr c04
+ fclr c08
+
+ ble KK, $L48
+ ble L, $L45
+#else
+#ifdef LN
+ sll K, ZBASE_SHIFT + 1, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, ZBASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, ZBASE_SHIFT, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c05
+ LD b3, 2 * SIZE(BO)
+ fclr c02
+ LD b4, 3 * SIZE(BO)
+ fclr c06
+
+ ldi BO, 2 * SIZE(BO)
+ fclr c03
+ ldi AO, 4 * SIZE(AO)
+ fclr c07
+
+ ldi L, -2(TMP1)
+ fclr c04
+ fclr c08
+
+ ble TMP1, $L48
+ ble L, $L45
+#endif
+ .align 5
+
+$L42:
+ ADD4 c05, t1, c05
+ unop
+ MUL a1, b1, t1
+ unop
+
+ ADD2 c06, t2, c06
+ ldi L, -2(L)
+ MUL a2, b1, t2
+ unop
+
+ ADD4 c07, t3, c07
+ unop
+ MUL a3, b1, t3
+ unop
+
+ ADD2 c08, t4, c08
+ unop
+ MUL a4, b1, t4
+ LD b1, 2 * SIZE(BO)
+
+ ADD1 c01, t1, c01
+ unop
+ MUL a1, b2, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD3 c02, t2, c02
+ ldi BO, 4 * SIZE(BO)
+ MUL a2, b2, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD1 c03, t3, c03
+ unop
+ MUL a3, b2, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD3 c04, t4, c04
+ unop
+ MUL a4, b2, t4
+ LD a5, 3 * SIZE(AO)
+
+ ADD4 c05, t1, c05
+ unop
+ MUL a1, b3, t1
+ LD b2, -1 * SIZE(BO)
+
+ ADD2 c06, t2, c06
+ unop
+ MUL a2, b3, t2
+ unop
+
+ ADD4 c07, t3, c07
+ unop
+ MUL a3, b3, t3
+ ldi AO, 8 * SIZE(AO)
+
+ ADD2 c08, t4, c08
+ unop
+ MUL a5, b3, t4
+ LD b3, 0 * SIZE(BO)
+
+ ADD1 c01, t1, c01
+ unop
+ MUL a1, b4, t1
+ LD a1, -4 * SIZE(AO)
+
+ ADD3 c02, t2, c02
+ unop
+ MUL a2, b4, t2
+ LD a2, -3 * SIZE(AO)
+
+ ADD1 c03, t3, c03
+ LD a4, -1 * SIZE(AO)
+ MUL a3, b4, t3
+ LD a3, -2 * SIZE(AO)
+
+ ADD3 c04, t4, c04
+ MUL a5, b4, t4
+ LD b4, 1 * SIZE(BO)
+ bgt L, $L42
+ .align 4
+
+$L45:
+ ADD4 c05, t1, c05
+ MUL b1, a1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L47
+#else
+ blbs TMP1, $L47
+#endif
+ .align 4
+
+ ADD2 c06, t2, c06
+ MUL a2, b1, t2
+ ADD4 c07, t3, c07
+ MUL a3, b1, t3
+
+ ADD2 c08, t4, c08
+ unop
+ MUL a4, b1, t4
+ LD b1, 0 * SIZE(BO)
+
+ ADD1 c01, t1, c01
+ unop
+ MUL a1, b2, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD3 c02, t2, c02
+ unop
+ MUL a2, b2, t2
+ LD a2, 1 * SIZE(AO)
+
+ ADD1 c03, t3, c03
+ unop
+ MUL a3, b2, t3
+ LD a3, 2 * SIZE(AO)
+
+ ADD3 c04, t4, c04
+ MUL a4, b2, t4
+ LD a4, 3 * SIZE(AO)
+ ldi AO, 4 * SIZE(AO)
+
+ ADD4 c05, t1, c05
+ LD b2, 1 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi BO, 2 * SIZE(BO)
+ .align 4
+
+$L47:
+ ADD2 c06, t2, c06
+ MUL a2, b1, t2
+ ADD4 c07, t3, c07
+ MUL a3, b1, t3
+
+ ADD2 c08, t4, c08
+ MUL a4, b1, t4
+ ADD1 c01, t1, c01
+ MUL a1, b2, t1
+
+ ADD3 c02, t2, c02
+ MUL a2, b2, t2
+ ADD1 c03, t3, c03
+ MUL a3, b2, t3
+
+ ADD3 c04, t4, c04
+ ldi AO, 4 * SIZE(AO)
+ MUL a4, b2, t4
+ ldi BO, 2 * SIZE(BO)
+
+ ADD4 c05, t1, c05
+ ADD2 c06, t2, c06
+ ADD4 c07, t3, c07
+ ADD2 c08, t4, c08
+
+ ADD c01, c06, c01
+ ADD c02, c05, c02
+ ADD c03, c08, c03
+ ADD c04, c07, c04
+
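+/* $L48: update step for the 2x1 block - load the stored values from BO
+   (LN/LT) or AO (RN/RT), subtract the accumulated products, apply the
+   complex factors read from the opposite buffer according to LN/LT/RN/RT,
+   then write the result back to the packed buffer and to C1. */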
+$L48:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 2, TMP1
+#else
+ subl KK, 1, TMP1
+#endif
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, ZBASE_SHIFT, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -4 * SIZE(AO)
+ ldi BO, -2 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+ SUB a3, c03, c03
+ SUB a4, c04, c04
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+ SUB a3, c03, c03
+ SUB a4, c04, c04
+#endif
+
+#ifdef LN
+ LD a1, 6 * SIZE(AO)
+ LD a2, 7 * SIZE(AO)
+ LD a3, 4 * SIZE(AO)
+ LD a4, 5 * SIZE(AO)
+
+ MUL a2, c04, t1
+ MUL a2, c03, t2
+ MUL a1, c03, c03
+ MUL a1, c04, c04
+
+ ADD5 c03, t1, c03
+ ADD6 c04, t2, c04
+ MUL a3, c03, t1
+ MUL a3, c04, t2
+
+ SUB c01, t1, c01
+ SUB c02, t2, c02
+ MUL a4, c04, t1
+ MUL a4, c03, t2
+
+ ADD6 c01, t1, c01
+ ADD5 c02, t2, c02
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+
+ ADD5 c01, t1, c01
+ ADD6 c02, t2, c02
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+
+ ADD5 c01, t1, c01
+ ADD6 c02, t2, c02
+ MUL a3, c01, t1
+ MUL a3, c02, t2
+
+ SUB c03, t1, c03
+ SUB c04, t2, c04
+
+ MUL a4, c02, t1
+ MUL a4, c01, t2
+ ADD6 c03, t1, c03
+ ADD5 c04, t2, c04
+
+ LD a1, 6 * SIZE(AO)
+ LD a2, 7 * SIZE(AO)
+
+ MUL a2, c04, t1
+ MUL a2, c03, t2
+ MUL a1, c03, c03
+ MUL a1, c04, c04
+
+ ADD5 c03, t1, c03
+ ADD6 c04, t2, c04
+#endif
+
+#if defined(RN) || defined(RT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a2, c04, t3
+ MUL a2, c03, t4
+
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+ MUL a1, c03, c03
+ MUL a1, c04, c04
+
+ ADD5 c01, t1, c01
+ ADD6 c02, t2, c02
+ ADD5 c03, t3, c03
+ ADD6 c04, t4, c04
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c02, 1 * SIZE(BO)
+ ST c03, 2 * SIZE(BO)
+ ST c04, 3 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c03, 2 * SIZE(AO)
+ ST c04, 3 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -4 * SIZE(C1)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c03, 2 * SIZE(C1)
+ ST c04, 3 * SIZE(C1)
+
+#ifndef LN
+ ldi C1, 4 * SIZE(C1)
+#endif
+
+#ifdef RT
+ sll K, ZBASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, ZBASE_SHIFT, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 2, KK
+#endif
+
+#ifdef LN
+ subl KK, 2, KK
+#endif
+
+ ldi I, -1(I)
+ bgt I, $L41
+ .align 4
+
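+/* $L50: M & 1 remainder for the single column - one complex element,
+   accumulated in c01/c02 (with c05/c06) and updated the same way in $L58. */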
+$L50:
+ and M, 1, I
+ ble I, $L59
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(B)
+ fclr c01
+ LD b2, 1 * SIZE(B)
+ fclr c05
+
+ LD b3, 2 * SIZE(B)
+ fclr c02
+ LD b4, 3 * SIZE(B)
+ fclr c06
+
+ ldi AO, 2 * SIZE(AO)
+ ldi BO, 2 * SIZE(B)
+
+ ldi L, -2(KK)
+
+ ble KK, $L58
+ ble L, $L55
+#else
+#ifdef LN
+ sll K, ZBASE_SHIFT, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, ZBASE_SHIFT, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, ZBASE_SHIFT, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr t1
+ LD a2, 1 * SIZE(AO)
+ fclr t2
+ LD a3, 2 * SIZE(AO)
+ fclr t3
+ LD a4, 3 * SIZE(AO)
+ fclr t4
+
+ LD b1, 0 * SIZE(BO)
+ fclr c01
+ LD b2, 1 * SIZE(BO)
+ fclr c05
+
+ LD b3, 2 * SIZE(BO)
+ fclr c02
+ LD b4, 3 * SIZE(BO)
+ fclr c06
+
+ ldi AO, 2 * SIZE(AO)
+ ldi BO, 2 * SIZE(BO)
+
+ ldi L, -2(TMP1)
+
+ ble TMP1, $L58
+ ble L, $L55
+#endif
+ .align 5
+
+$L52:
+ ADD1 c01, t1, c01
+ unop
+ MUL a1, b1, t1
+ unop
+
+ ADD3 c02, t2, c02
+ ldi AO, 4 * SIZE(AO)
+ MUL a2, b1, t2
+ LD b1, 2 * SIZE(BO)
+
+ ADD4 c05, t3, c05
+ ldi L, -2(L)
+ MUL a1, b2, t3
+ LD a1, -2 * SIZE(AO)
+
+ ADD2 c06, t4, c06
+ unop
+ MUL a2, b2, t4
+ LD a2, -1 * SIZE(AO)
+
+ ADD1 c01, t1, c01
+ LD b2, 3 * SIZE(BO)
+ MUL a3, b3, t1
+ ldi BO, 4 * SIZE(BO)
+
+ ADD3 c02, t2, c02
+ unop
+ MUL a4, b3, t2
+ LD b3, 0 * SIZE(BO)
+
+ ADD4 c05, t3, c05
+ unop
+ MUL a3, b4, t3
+ LD a3, 0 * SIZE(AO)
+
+ ADD2 c06, t4, c06
+ MUL a4, b4, t4
+ LD b4, 1 * SIZE(BO)
+ unop
+
+ LD a4, 1 * SIZE(AO)
+ unop
+ unop
+ bgt L, $L52
+ .align 4
+
+$L55:
+ ADD1 c01, t1, c01
+ MUL a1, b1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L57
+#else
+ blbs TMP1, $L57
+#endif
+ .align 4
+
+ ADD3 c02, t2, c02
+ unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD4 c05, t3, c05
+ ldi BO, 2 * SIZE(BO)
+ MUL a1, b2, t3
+ LD a1, 0 * SIZE(AO)
+
+ ADD2 c06, t4, c06
+ unop
+ MUL a2, b2, t4
+ LD a2, 1 * SIZE(AO)
+
+ ADD1 c01, t1, c01
+ LD b2, -1 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi AO, 2 * SIZE(AO)
+ .align 4
+
+$L57:
+ ADD3 c02, t2, c02
+ MUL a2, b1, t2
+ ADD4 c05, t3, c05
+ MUL a1, b2, t3
+
+ ADD2 c06, t4, c06
+ ldi AO, 2 * SIZE(AO)
+ MUL a2, b2, t4
+ ldi BO, 2 * SIZE(BO)
+
+ ADD1 c01, t1, c01
+ ADD3 c02, t2, c02
+ ADD4 c05, t3, c05
+ ADD2 c06, t4, c06
+
+ ADD c01, c06, c01
+ ADD c02, c05, c02
+
+$L58:
+#if defined(LN) || defined(RT)
+ subl KK, 1, TMP1
+
+ sll TMP1, ZBASE_SHIFT, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, ZBASE_SHIFT, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -2 * SIZE(AO)
+ ldi BO, -2 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+
+ ADD5 c01, t1, c01
+ ADD6 c02, t2, c02
+#endif
+
+#if defined(RN) || defined(RT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+
+ ADD5 c01, t1, c01
+ ADD6 c02, t2, c02
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c02, 1 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -2 * SIZE(C1)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+
+#ifndef LN
+ ldi C1, 2 * SIZE(C1)
+#endif
+
+#ifdef RT
+ sll K, ZBASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, ZBASE_SHIFT, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, ZBASE_SHIFT, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 1, KK
+#endif
+
+#ifdef LN
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L59:
+#ifdef LN
+ sll K, ZBASE_SHIFT, TMP1
+ addl B, TMP1, B
+#endif
+
+#if defined(LT) || defined(RN)
+ mov BO, B
+#endif
+
+#ifdef RN
+ addl KK, 1, KK
+#endif
+
+#ifdef RT
+ subl KK, 1, KK
+#endif
+ .align 4
+
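+/* $L30: loop over pairs of columns (N >> 1 iterations).  Each pass through
+   $L01 handles two output columns C1/C2 and moves C by 2*LDC. */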
+$L30:
+ sra N, 1, J
+ ble J, $L999
+ .align 4
+
+$L01:
+#ifdef RT
+ sll K, ZBASE_SHIFT + 1, TMP1
+ subl B, TMP1, B
+
+ subl C, LDC, C2
+ subl C2, LDC, C1
+ subl C2, LDC, C
+#else
+ mov C, C1
+ addl C, LDC, C2
+ addl C2, LDC, C
+#endif
+
+#ifdef LN
+ addl M, OFFSET, KK
+#endif
+
+#ifdef LT
+ mov OFFSET, KK
+#endif
+
+#if defined(LN) || defined(RT)
+ mov A, AORIG
+#else
+ mov A, AO
+#endif
+
+ sra M, 1, I
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+ fclr c01
+ fclr c05
+
+ ble I, $L20
+ .align 4
+
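+/* $L11: 2x2 complex block.  The K loop ($L12) is unrolled by two, issues
+   fillcs cache hints, and keeps sixteen accumulators c01-c16 that are
+   reduced to the eight results c01-c04 and c09-c12 after the loop. */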
+$L11:
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(B)
+ fclr c10
+ LD b2, 1 * SIZE(B)
+ fclr c14
+
+ LD b3, 2 * SIZE(B)
+ fclr c03
+ LD b4, 3 * SIZE(B)
+ fclr c07
+
+ ldi BO, 4 * SIZE(B)
+ fclr c11
+ ldi AO, 4 * SIZE(AO)
+ fclr c15
+
+ fillcs 4 * SIZE(C1)
+ fclr c04
+ ldi L, -2(KK)
+ fclr c08
+
+ fillcs 4 * SIZE(C2)
+ fclr c12
+ fclr c16
+ ble KK, $L18
+ ble L, $L15
+#else
+#ifdef LN
+ sll K, ZBASE_SHIFT + 1, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, ZBASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AO
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(BO)
+ fclr c10
+ LD b2, 1 * SIZE(BO)
+ fclr c14
+
+ LD b3, 2 * SIZE(BO)
+ fclr c03
+ LD b4, 3 * SIZE(BO)
+ fclr c07
+
+ ldi BO, 4 * SIZE(BO)
+ fclr c11
+ ldi AO, 4 * SIZE(AO)
+ fclr c15
+
+ fillcs 4 * SIZE(C1)
+ fclr c04
+ ldi L, -2(TMP1)
+ fclr c08
+
+ fillcs 4 * SIZE(C2)
+ fclr c12
+ fclr c16
+ ble TMP1, $L18
+ ble L, $L15
+#endif
+ .align 5
+
+$L12:
+/* 1 */
+ ADD1 c11, t1, c11
+#ifndef EV4
+ fillcs PREFETCHSIZE * SIZE(AO)
+#else
+ unop
+#endif
+ MUL b1, a1, t1
+#ifndef EV4
+ fillcs PREFETCHSIZE * SIZE(BO)
+#else
+ unop
+#endif
+
+ ADD3 c12, t2, c12
+ unop
+ MUL b1, a2, t2
+ unop
+
+ ADD2 c16, t3, c16
+ unop
+ MUL b2, a2, t3
+ LD a5, 0 * SIZE(AO)
+
+ ADD4 c15, t4, c15
+ unop
+ MUL b2, a1, t4
+ LD b5, 0 * SIZE(BO)
+
+/* 2 */
+ ADD1 c01, t1, c01
+ UNOP
+ MUL b1, a3, t1
+ UNOP
+
+ ADD3 c02, t2, c02
+ UNOP
+ MUL b1, a4, t2
+ UNOP
+
+ ADD2 c06, t3, c06
+ unop
+ MUL b2, a4, t3
+ unop
+
+ ADD4 c05, t4, c05
+ unop
+ MUL b4, a1, t4
+ unop
+
+/* 3 */
+ ADD1 c03, t1, c03
+ unop
+ MUL b3, a1, t1
+ unop
+
+ ADD3 c04, t2, c04
+ unop
+ MUL b3, a2, t2
+ unop
+
+ ADD2 c08, t3, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, 1 * SIZE(AO)
+
+ ADD4 c13, t4, c13
+ unop
+ MUL b2, a3, t4
+ LD b2, 1 * SIZE(BO)
+
+/* 4 */
+ ADD1 c09, t1, c09
+ unop
+ MUL b3, a3, t1
+ LD a6, 2 * SIZE(AO)
+
+ ADD3 c10, t2, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD2 c14, t3, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, 3 * SIZE(AO)
+
+ ADD4 c07, t4, c07
+ unop
+ MUL b4, a3, t4
+ LD b4, 3 * SIZE(BO)
+
+/* 5 */
+ ADD1 c11, t1, c11
+ unop
+ MUL b5, a5, t1
+ LD a1, 4 * SIZE(AO)
+
+ ADD3 c12, t2, c12
+ ldi L, -2(L)
+ MUL b5, a2, t2
+ LD b1, 4 * SIZE(BO)
+
+ ADD2 c16, t3, c16
+ unop
+ MUL b2, a2, t3
+ unop
+
+ ADD4 c15, t4, c15
+ unop
+ MUL b2, a5, t4
+ unop
+
+/* 6 */
+ ADD1 c01, t1, c01
+ unop
+ MUL b5, a6, t1
+ unop
+
+ ADD3 c02, t2, c02
+ unop
+ MUL b5, a4, t2
+ unop
+
+ ADD2 c06, t3, c06
+ unop
+ MUL b2, a4, t3
+ unop
+
+ ADD4 c05, t4, c05
+ unop
+ MUL b4, a5, t4
+ unop
+
+/* 7 */
+ ADD1 c03, t1, c03
+ ldi AO, 8 * SIZE(AO)
+ MUL b3, a5, t1
+ unop
+
+ ADD3 c04, t2, c04
+ ldi BO, 8 * SIZE(BO)
+ MUL b3, a2, t2
+ unop
+
+ ADD2 c08, t3, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, -3 * SIZE(AO)
+
+ ADD4 c13, t4, c13
+ unop
+ MUL b2, a6, t4
+ LD b2, -3 * SIZE(BO)
+
+/* 8 */
+ ADD1 c09, t1, c09
+ unop
+ MUL b3, a6, t1
+ LD a3, -2 * SIZE(AO)
+
+ ADD3 c10, t2, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, -2 * SIZE(BO)
+
+ ADD2 c14, t3, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, -1 * SIZE(AO)
+
+ ADD4 c07, t4, c07
+ MUL b4, a6, t4
+ LD b4, -1 * SIZE(BO)
+ bgt L, $L12
+ .align 4
+
+$L15:
+ ADD1 c11, t1, c11
+ unop
+ MUL b1, a1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L17
+#else
+ blbs TMP1, $L17
+#endif
+ .align 4
+
+ ADD3 c12, t2, c12
+ MUL b1, a2, t2
+ ADD2 c16, t3, c16
+ MUL b2, a2, t3
+
+ ADD4 c15, t4, c15
+ MUL b2, a1, t4
+ ADD1 c01, t1, c01
+ MUL b1, a3, t1
+
+ ADD3 c02, t2, c02
+ unop
+ MUL b1, a4, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD2 c06, t3, c06
+ MUL b2, a4, t3
+ ADD4 c05, t4, c05
+ MUL b4, a1, t4
+
+ ADD1 c03, t1, c03
+ unop
+ MUL b3, a1, t1
+ LD a1, 0 * SIZE(AO)
+
+ ADD3 c04, t2, c04
+ unop
+ MUL b3, a2, t2
+ unop
+
+ ADD2 c08, t3, c08
+ unop
+ MUL b4, a2, t3
+ LD a2, 1 * SIZE(AO)
+
+ ADD4 c13, t4, c13
+ unop
+ MUL b2, a3, t4
+ LD b2, 1 * SIZE(BO)
+
+ ADD1 c09, t1, c09
+ unop
+ MUL b3, a3, t1
+ ldi AO, 4 * SIZE(AO)
+
+ ADD3 c10, t2, c10
+ unop
+ MUL b3, a4, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD2 c14, t3, c14
+ unop
+ MUL b4, a4, t3
+ LD a4, -1 * SIZE(AO)
+
+ ADD4 c07, t4, c07
+ unop
+ MUL b4, a3, t4
+ LD a3, -2 * SIZE(AO)
+
+ ADD1 c11, t1, c11
+ LD b4, 3 * SIZE(BO)
+ MUL b1, a1, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L17:
+ ADD3 c12, t2, c12
+ MUL b1, a2, t2
+ ADD2 c16, t3, c16
+ MUL b2, a2, t3
+
+ ADD4 c15, t4, c15
+ MUL b2, a1, t4
+ ADD1 c01, t1, c01
+ MUL b1, a3, t1
+
+ ADD3 c02, t2, c02
+ MUL b1, a4, t2
+ ADD2 c06, t3, c06
+ MUL b2, a4, t3
+
+ ADD4 c05, t4, c05
+ MUL b4, a1, t4
+ ADD1 c03, t1, c03
+ MUL b3, a1, t1
+
+ ADD3 c04, t2, c04
+ MUL b3, a2, t2
+ ADD2 c08, t3, c08
+ MUL b4, a2, t3
+
+ ADD4 c13, t4, c13
+ MUL b2, a3, t4
+ ADD1 c09, t1, c09
+ MUL b3, a3, t1
+
+ ADD3 c10, t2, c10
+ MUL b3, a4, t2
+ ADD2 c14, t3, c14
+ MUL b4, a4, t3
+
+ ADD4 c07, t4, c07
+ ldi AO, 4 * SIZE(AO)
+ MUL b4, a3, t4
+ ldi BO, 4 * SIZE(BO)
+
+ ADD1 c11, t1, c11
+ ADD3 c12, t2, c12
+ ADD2 c16, t3, c16
+ ADD4 c15, t4, c15
+
+ ADD c01, c06, c01
+ ADD c02, c05, c02
+ ADD c03, c08, c03
+ ADD c04, c07, c04
+
+ ADD c09, c14, c09
+ ADD c10, c13, c10
+ ADD c11, c16, c11
+ ADD c12, c15, c12
+ .align 4
+
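+/* $L18: update step for the 2x2 block, mirroring $L48 but working on the
+   eight values c01-c04/c09-c12 and the two output columns C1 and C2. */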
+$L18:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 2, TMP1
+#else
+ subl KK, 2, TMP1
+#endif
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -4 * SIZE(AO)
+ ldi BO, -4 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ LD b1, 4 * SIZE(BO)
+ LD b2, 5 * SIZE(BO)
+ LD b3, 6 * SIZE(BO)
+ LD b4, 7 * SIZE(BO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+ SUB a3, c09, c09
+ SUB a4, c10, c10
+
+ SUB b1, c03, c03
+ SUB b2, c04, c04
+ SUB b3, c11, c11
+ SUB b4, c12, c12
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ LD b1, 4 * SIZE(AO)
+ LD b2, 5 * SIZE(AO)
+ LD b3, 6 * SIZE(AO)
+ LD b4, 7 * SIZE(AO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+ SUB a3, c03, c03
+ SUB a4, c04, c04
+
+ SUB b1, c09, c09
+ SUB b2, c10, c10
+ SUB b3, c11, c11
+ SUB b4, c12, c12
+#endif
+
+#ifdef LN
+ LD a1, 6 * SIZE(AO)
+ LD a2, 7 * SIZE(AO)
+ LD a3, 4 * SIZE(AO)
+ LD a4, 5 * SIZE(AO)
+
+ MUL a2, c04, t1
+ MUL a2, c03, t2
+ MUL a2, c12, t3
+ MUL a2, c11, t4
+
+ MUL a1, c03, c03
+ MUL a1, c04, c04
+ MUL a1, c11, c11
+ MUL a1, c12, c12
+
+ ADD5 c03, t1, c03
+ ADD6 c04, t2, c04
+ ADD5 c11, t3, c11
+ ADD6 c12, t4, c12
+
+ MUL a3, c03, t1
+ MUL a3, c04, t2
+ MUL a3, c11, t3
+ MUL a3, c12, t4
+
+ SUB c01, t1, c01
+ SUB c02, t2, c02
+ SUB c09, t3, c09
+ SUB c10, t4, c10
+
+ MUL a4, c04, t1
+ MUL a4, c03, t2
+ MUL a4, c12, t3
+ MUL a4, c11, t4
+
+ ADD6 c01, t1, c01
+ ADD5 c02, t2, c02
+ ADD6 c09, t3, c09
+ ADD5 c10, t4, c10
+
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a2, c10, t3
+ MUL a2, c09, t4
+
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+ MUL a1, c09, c09
+ MUL a1, c10, c10
+
+ ADD5 c01, t1, c01
+ ADD6 c02, t2, c02
+ ADD5 c09, t3, c09
+ ADD6 c10, t4, c10
+#endif
+
+#ifdef LT
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a2, c10, t3
+ MUL a2, c09, t4
+
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+ MUL a1, c09, c09
+ MUL a1, c10, c10
+
+ ADD5 c01, t1, c01
+ ADD6 c02, t2, c02
+ ADD5 c09, t3, c09
+ ADD6 c10, t4, c10
+
+ MUL a3, c01, t1
+ MUL a3, c02, t2
+ MUL a3, c09, t3
+ MUL a3, c10, t4
+
+ SUB c03, t1, c03
+ SUB c04, t2, c04
+ SUB c11, t3, c11
+ SUB c12, t4, c12
+
+ MUL a4, c02, t1
+ MUL a4, c01, t2
+ MUL a4, c10, t3
+ MUL a4, c09, t4
+
+ ADD6 c03, t1, c03
+ ADD5 c04, t2, c04
+ ADD6 c11, t3, c11
+ ADD5 c12, t4, c12
+
+ LD a1, 6 * SIZE(AO)
+ LD a2, 7 * SIZE(AO)
+
+ MUL a2, c04, t1
+ MUL a2, c03, t2
+ MUL a2, c12, t3
+ MUL a2, c11, t4
+
+ MUL a1, c03, c03
+ MUL a1, c04, c04
+ MUL a1, c11, c11
+ MUL a1, c12, c12
+
+ ADD5 c03, t1, c03
+ ADD6 c04, t2, c04
+ ADD5 c11, t3, c11
+ ADD6 c12, t4, c12
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a2, c04, t3
+ MUL a2, c03, t4
+
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+ MUL a1, c03, c03
+ MUL a1, c04, c04
+
+ ADD5 c01, t1, c01
+ ADD6 c02, t2, c02
+ ADD5 c03, t3, c03
+ ADD6 c04, t4, c04
+
+ MUL a3, c01, t1
+ MUL a3, c02, t2
+ MUL a3, c03, t3
+ MUL a3, c04, t4
+
+ SUB c09, t1, c09
+ SUB c10, t2, c10
+ SUB c11, t3, c11
+ SUB c12, t4, c12
+
+ MUL a4, c02, t1
+ MUL a4, c01, t2
+ MUL a4, c04, t3
+ MUL a4, c03, t4
+
+ ADD6 c09, t1, c09
+ ADD5 c10, t2, c10
+ ADD6 c11, t3, c11
+ ADD5 c12, t4, c12
+
+ LD a1, 6 * SIZE(BO)
+ LD a2, 7 * SIZE(BO)
+
+ MUL a2, c10, t1
+ MUL a2, c09, t2
+ MUL a2, c12, t3
+ MUL a2, c11, t4
+
+ MUL a1, c09, c09
+ MUL a1, c10, c10
+ MUL a1, c11, c11
+ MUL a1, c12, c12
+
+ ADD5 c09, t1, c09
+ ADD6 c10, t2, c10
+ ADD5 c11, t3, c11
+ ADD6 c12, t4, c12
+#endif
+
+#ifdef RT
+ LD a1, 6 * SIZE(BO)
+ LD a2, 7 * SIZE(BO)
+ LD a3, 4 * SIZE(BO)
+ LD a4, 5 * SIZE(BO)
+
+ MUL a2, c10, t1
+ MUL a2, c09, t2
+ MUL a2, c12, t3
+ MUL a2, c11, t4
+
+ MUL a1, c09, c09
+ MUL a1, c10, c10
+ MUL a1, c11, c11
+ MUL a1, c12, c12
+
+ ADD5 c09, t1, c09
+ ADD6 c10, t2, c10
+ ADD5 c11, t3, c11
+ ADD6 c12, t4, c12
+
+ MUL a3, c09, t1
+ MUL a3, c10, t2
+ MUL a3, c11, t3
+ MUL a3, c12, t4
+
+ SUB c01, t1, c01
+ SUB c02, t2, c02
+ SUB c03, t3, c03
+ SUB c04, t4, c04
+
+ MUL a4, c10, t1
+ MUL a4, c09, t2
+ MUL a4, c12, t3
+ MUL a4, c11, t4
+
+ ADD6 c01, t1, c01
+ ADD5 c02, t2, c02
+ ADD6 c03, t3, c03
+ ADD5 c04, t4, c04
+
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a2, c04, t3
+ MUL a2, c03, t4
+
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+ MUL a1, c03, c03
+ MUL a1, c04, c04
+
+ ADD5 c01, t1, c01
+ ADD6 c02, t2, c02
+ ADD5 c03, t3, c03
+ ADD6 c04, t4, c04
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c02, 1 * SIZE(BO)
+ ST c09, 2 * SIZE(BO)
+ ST c10, 3 * SIZE(BO)
+
+ ST c03, 4 * SIZE(BO)
+ ST c04, 5 * SIZE(BO)
+ ST c11, 6 * SIZE(BO)
+ ST c12, 7 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c03, 2 * SIZE(AO)
+ ST c04, 3 * SIZE(AO)
+
+ ST c09, 4 * SIZE(AO)
+ ST c10, 5 * SIZE(AO)
+ ST c11, 6 * SIZE(AO)
+ ST c12, 7 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -4 * SIZE(C1)
+ ldi C2, -4 * SIZE(C2)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c03, 2 * SIZE(C1)
+ ST c04, 3 * SIZE(C1)
+
+ ST c09, 0 * SIZE(C2)
+ ST c10, 1 * SIZE(C2)
+ ST c11, 2 * SIZE(C2)
+ ST c12, 3 * SIZE(C2)
+
+#ifndef LN
+ ldi C1, 4 * SIZE(C1)
+ ldi C2, 4 * SIZE(C2)
+#endif
+
+ fclr t1
+ fclr t2
+ fclr t3
+ fclr t4
+
+#ifdef RT
+ sll K, ZBASE_SHIFT + 1, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, ZBASE_SHIFT + 1, TMP1
+ addl AO, TMP1, AO
+ addl BO, TMP1, BO
+#endif
+
+#ifdef LT
+ addl KK, 2, KK
+#endif
+
+#ifdef LN
+ subl KK, 2, KK
+#endif
+ fclr c01
+ fclr c05
+
+ ldi I, -1(I)
+ bgt I, $L11
+ .align 4
+
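+/* $L20: M & 1 remainder for the column pair - one row of A against two
+   columns of B, results in c01/c02 and c09/c10, updated in $L28. */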
+$L20:
+ and M, 1, I
+ ble I, $L29
+
+#if defined(LT) || defined(RN)
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(B)
+ fclr c10
+ LD b2, 1 * SIZE(B)
+ fclr c14
+
+ LD b3, 2 * SIZE(B)
+ ldi AO, 2 * SIZE(AO)
+ LD b4, 3 * SIZE(B)
+ ldi BO, 4 * SIZE(B)
+
+ ldi L, -2(KK)
+
+ ble KK, $L28
+ ble L, $L25
+#else
+#ifdef LN
+ sll K, ZBASE_SHIFT + 0, TMP1
+ subl AORIG, TMP1, AORIG
+#endif
+
+ sll KK, ZBASE_SHIFT + 0, TMP1
+ addl AORIG, TMP1, AO
+ sll KK, ZBASE_SHIFT + 1, TMP1
+ addl B, TMP1, BO
+
+ subl K, KK, TMP1
+
+ LD a1, 0 * SIZE(AO)
+ fclr c09
+ LD a2, 1 * SIZE(AO)
+ fclr c13
+
+ LD a3, 2 * SIZE(AO)
+ fclr c02
+ LD a4, 3 * SIZE(AO)
+ fclr c06
+
+ LD b1, 0 * SIZE(BO)
+ fclr c10
+ LD b2, 1 * SIZE(BO)
+ fclr c14
+
+ LD b3, 2 * SIZE(BO)
+ ldi AO, 2 * SIZE(AO)
+ LD b4, 3 * SIZE(BO)
+ ldi BO, 4 * SIZE(BO)
+
+ ldi L, -2(TMP1)
+
+ ble TMP1, $L28
+ ble L, $L25
+#endif
+ .align 5
+
+$L22:
+ ADD1 c09, t1, c09
+ unop
+ MUL a1, b1, t1
+ unop
+
+ ADD3 c10, t2, c10
+ unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD4 c13, t3, c13
+ unop
+ MUL a1, b2, t3
+ ldi BO, 8 * SIZE(BO)
+
+ ADD2 c14, t4, c14
+ unop
+ MUL a2, b2, t4
+ LD b2, -7 * SIZE(BO)
+
+ ADD1 c01, t1, c01
+ unop
+ MUL a1, b3, t1
+ unop
+
+ ADD3 c02, t2, c02
+ unop
+ MUL a2, b3, t2
+ LD b3, -6 * SIZE(BO)
+
+ ADD4 c05, t3, c05
+ unop
+ MUL a1, b4, t3
+ LD a1, 2 * SIZE(AO)
+
+ ADD2 c06, t4, c06
+ MUL a2, b4, t4
+ LD b5, -5 * SIZE(BO)
+
+ ADD1 c09, t1, c09
+ unop
+ MUL a3, b1, t1
+ LD a2, 3 * SIZE(AO)
+
+ ADD3 c10, t2, c10
+ unop
+ MUL a4, b1, t2
+ LD b1, -4 * SIZE(BO)
+
+ ADD4 c13, t3, c13
+ unop
+ MUL a3, b2, t3
+ ldi AO, 4 * SIZE(AO)
+
+ ADD2 c14, t4, c14
+ MUL a4, b2, t4
+ LD b2, -3 * SIZE(BO)
+
+ ADD1 c01, t1, c01
+ ldi L, -2(L)
+ MUL a3, b3, t1
+ LD b4, -1 * SIZE(BO)
+
+ ADD3 c02, t2, c02
+ unop
+ MUL a4, b3, t2
+ LD b3, -2 * SIZE(BO)
+
+ ADD4 c05, t3, c05
+ unop
+ MUL a3, b5, t3
+ LD a3, 0 * SIZE(AO)
+
+ ADD2 c06, t4, c06
+ MUL a4, b5, t4
+ LD a4, 1 * SIZE(AO)
+ bgt L, $L22
+ .align 4
+
+$L25:
+ ADD1 c09, t1, c09
+ MUL a1, b1, t1
+#if defined(LT) || defined(RN)
+ blbs KK, $L27
+#else
+ blbs TMP1, $L27
+#endif
+ .align 4
+
+ ADD3 c10, t2, c10
+ unop
+ MUL a2, b1, t2
+ LD b1, 0 * SIZE(BO)
+
+ ADD4 c13, t3, c13
+ unop
+ MUL a1, b2, t3
+ unop
+
+ ADD2 c14, t4, c14
+ unop
+ MUL a2, b2, t4
+ LD b2, 1 * SIZE(BO)
+
+ ADD1 c01, t1, c01
+ unop
+ MUL a1, b3, t1
+ ldi AO, 2 * SIZE(AO)
+
+ ADD3 c02, t2, c02
+ unop
+ MUL a2, b3, t2
+ LD b3, 2 * SIZE(BO)
+
+ ADD4 c05, t3, c05
+ unop
+ MUL a1, b4, t3
+ LD a1, -2 * SIZE(AO)
+
+ ADD2 c06, t4, c06
+ unop
+ MUL a2, b4, t4
+ LD a2, -1 * SIZE(AO)
+
+ ADD1 c09, t1, c09
+ LD b4, 3 * SIZE(BO)
+ MUL a1, b1, t1
+ ldi BO, 4 * SIZE(BO)
+ .align 4
+
+$L27:
+ ADD3 c10, t2, c10
+ MUL a2, b1, t2
+ ADD4 c13, t3, c13
+ MUL a1, b2, t3
+
+ ADD2 c14, t4, c14
+ MUL a2, b2, t4
+ ADD1 c01, t1, c01
+ MUL a1, b3, t1
+
+ ADD3 c02, t2, c02
+ MUL a2, b3, t2
+ ADD4 c05, t3, c05
+ MUL a1, b4, t3
+
+ ADD2 c06, t4, c06
+ ldi AO, 2 * SIZE(AO)
+ MUL a2, b4, t4
+ ldi BO, 4 * SIZE(BO)
+
+ ADD1 c09, t1, c09
+ ADD3 c10, t2, c10
+ ADD4 c13, t3, c13
+ ADD2 c14, t4, c14
+
+ ADD c01, c06, c01
+ ADD c02, c05, c02
+ ADD c09, c14, c09
+ ADD c10, c13, c10
+ .align 4
+
+$L28:
+#if defined(LN) || defined(RT)
+#ifdef LN
+ subl KK, 1, TMP1
+#else
+ subl KK, 2, TMP1
+#endif
+ sll TMP1, ZBASE_SHIFT + 0, TMP2
+ addl AORIG, TMP2, AO
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
+ addl B, TMP2, BO
+#else
+ ldi AO, -2 * SIZE(AO)
+ ldi BO, -4 * SIZE(BO)
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+ SUB a3, c09, c09
+ SUB a4, c10, c10
+#else
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+ LD a3, 2 * SIZE(AO)
+ LD a4, 3 * SIZE(AO)
+
+ SUB a1, c01, c01
+ SUB a2, c02, c02
+ SUB a3, c09, c09
+ SUB a4, c10, c10
+#endif
+
+#if defined(LN) || defined(LT)
+ LD a1, 0 * SIZE(AO)
+ LD a2, 1 * SIZE(AO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a2, c10, t3
+ MUL a2, c09, t4
+
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+ MUL a1, c09, c09
+ MUL a1, c10, c10
+
+ ADD5 c01, t1, c01
+ ADD6 c02, t2, c02
+ ADD5 c09, t3, c09
+ ADD6 c10, t4, c10
+#endif
+
+#ifdef RN
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+ LD a3, 2 * SIZE(BO)
+ LD a4, 3 * SIZE(BO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+
+ ADD5 c01, t1, c01
+ ADD6 c02, t2, c02
+
+ MUL a3, c01, t1
+ MUL a3, c02, t2
+ SUB c09, t1, c09
+ SUB c10, t2, c10
+
+ MUL a4, c02, t1
+ MUL a4, c01, t2
+ ADD6 c09, t1, c09
+ ADD5 c10, t2, c10
+
+ LD a1, 6 * SIZE(BO)
+ LD a2, 7 * SIZE(BO)
+
+ MUL a2, c10, t1
+ MUL a2, c09, t2
+ MUL a1, c09, c09
+ MUL a1, c10, c10
+
+ ADD5 c09, t1, c09
+ ADD6 c10, t2, c10
+#endif
+
+#ifdef RT
+ LD a1, 6 * SIZE(BO)
+ LD a2, 7 * SIZE(BO)
+ LD a3, 4 * SIZE(BO)
+ LD a4, 5 * SIZE(BO)
+
+ MUL a2, c10, t1
+ MUL a2, c09, t2
+ MUL a1, c09, c09
+ MUL a1, c10, c10
+
+ ADD5 c09, t1, c09
+ ADD6 c10, t2, c10
+
+ MUL a3, c09, t1
+ MUL a3, c10, t2
+ SUB c01, t1, c01
+ SUB c02, t2, c02
+
+ MUL a4, c10, t1
+ MUL a4, c09, t2
+ ADD6 c01, t1, c01
+ ADD5 c02, t2, c02
+
+ LD a1, 0 * SIZE(BO)
+ LD a2, 1 * SIZE(BO)
+
+ MUL a2, c02, t1
+ MUL a2, c01, t2
+ MUL a1, c01, c01
+ MUL a1, c02, c02
+
+ ADD5 c01, t1, c01
+ ADD6 c02, t2, c02
+#endif
+
+#if defined(LN) || defined(LT)
+ ST c01, 0 * SIZE(BO)
+ ST c02, 1 * SIZE(BO)
+ ST c09, 2 * SIZE(BO)
+ ST c10, 3 * SIZE(BO)
+#else
+ ST c01, 0 * SIZE(AO)
+ ST c02, 1 * SIZE(AO)
+ ST c09, 2 * SIZE(AO)
+ ST c10, 3 * SIZE(AO)
+#endif
+
+#ifdef LN
+ ldi C1, -2 * SIZE(C1)
+ ldi C2, -2 * SIZE(C2)
+#endif
+
+ ST c01, 0 * SIZE(C1)
+ ST c02, 1 * SIZE(C1)
+ ST c09, 0 * SIZE(C2)
+ ST c10, 1 * SIZE(C2)
+
+#ifndef LN
+ ldi C1, 2 * SIZE(C1)
+ ldi C2, 2 * SIZE(C2)
+#endif
+
+#ifdef RT
+ sll K, ZBASE_SHIFT, TMP1
+ addl AORIG, TMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+ subl K, KK, TMP1
+ sll TMP1, ZBASE_SHIFT + 0, TMP2
+ addl AO, TMP2, AO
+ sll TMP1, ZBASE_SHIFT + 1, TMP2
+ addl BO, TMP2, BO
+#endif
+
+#ifdef LT
+ addl KK, 1, KK
+#endif
+
+#ifdef LN
+ subl KK, 1, KK
+#endif
+ .align 4
+
+$L29:
+#ifdef LN
+ sll K, ZBASE_SHIFT + 1, TMP1
+ addl B, TMP1, B
+#endif
+
+#if defined(LT) || defined(RN)
+ mov BO, B
+#endif
+
+#ifdef RN
+ addl KK, 2, KK
+#endif
+
+#ifdef RT
+ subl KK, 2, KK
+#endif
+
+ ldi J, -1(J)
+ bgt J, $L01
+ .align 4
+
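+/* $L999: epilogue - restore $f2-$f9, clear the return value ($0),
+   release the stack frame and return. */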
+$L999:
+ fldd $f2, 0($sp)
+ fldd $f3, 8($sp)
+ fldd $f4, 16($sp)
+ fldd $f5, 24($sp)
+ fldd $f6, 32($sp)
+ fldd $f7, 40($sp)
+ fldd $f8, 48($sp)
+ fldd $f9, 56($sp)
+ clr $0
+ ldi $sp, STACKSIZE($sp)
+ ret
+ .ident VERSION
+ .end CNAME
diff --git a/lapack/laswp/sw_64/Makefile b/lapack/laswp/sw_64/Makefile
new file mode 100644
index 0000000..af1f019
--- /dev/null
+++ b/lapack/laswp/sw_64/Makefile
@@ -0,0 +1,8 @@
+TOPDIR = ../../..
+include ../../../Makefile.system
+
+LASWP = ../generic/laswp_k_1.c
+ZLASWP = ../generic/zlaswp_k_1.c
+
+include ../generic/Makefile
+
diff --git a/param.h b/param.h
index ee4640f..1a5f361 100644
--- a/param.h
+++ b/param.h
@@ -2128,7 +2128,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
-#if defined(EV4) || defined(EV5) || defined(EV6)
+#if defined(EV4) || defined(EV5) || defined(SW6)
#ifdef EV4
#define SNUMOPT 1
@@ -2140,7 +2140,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 512
#define GEMM_DEFAULT_OFFSET_B 512
-#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL
+#define GEMM_DEFAULT_ALIGN 0x0ffffUL
+//#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL
#define SGEMM_DEFAULT_UNROLL_M 4
#define SGEMM_DEFAULT_UNROLL_N 4
@@ -2185,7 +2186,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ZGEMM_DEFAULT_Q 64
#endif
-#ifdef EV6
+#ifdef SW6
#define SGEMM_DEFAULT_P 256
#define SGEMM_DEFAULT_Q 512
--
2.31.1