Commit 8ea975d0 authored by Andy Polyakov's avatar Andy Polyakov
Browse files

+20% tune-up for Power5.

parent 6c69aa53
Loading
Loading
Loading
Loading
+11 −18
Original line number Diff line number Diff line
@@ -162,17 +162,16 @@ $code=<<___;
.align	4
L1st:
	$LDX	$aj,$ap,$j	; ap[j]
	$LDX	$nj,$np,$j	; np[j]
	addc	$lo0,$alo,$hi0
	$LDX	$nj,$np,$j	; np[j]
	addze	$hi0,$ahi
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[0]
	$UMULH	$ahi,$aj,$m0

	addc	$lo1,$nlo,$hi1
	$UMULH	$ahi,$aj,$m0
	addze	$hi1,$nhi
	$UMULL	$nlo,$nj,$m1	; np[j]*m1
	$UMULH	$nhi,$nj,$m1
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
	$UMULH	$nhi,$nj,$m1
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

@@ -206,20 +205,16 @@ Louter:
	$LD	$aj,$BNSZ($ap)	; ap[1]
	$LD	$nj,0($np)	; np[0]
	addc	$lo0,$lo0,$tj	; ap[0]*bp[i]+tp[0]
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
	addze	$hi0,$hi0

	$UMULL	$m1,$lo0,$n0	; tp[0]*n0

	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
	$UMULH	$ahi,$aj,$m0

	$UMULL	$lo1,$nj,$m1	; np[0]*m1
	$UMULH	$hi1,$nj,$m1
	$LD	$nj,$BNSZ($np)	; np[1]
	addc	$lo1,$lo1,$lo0
	addze	$hi1,$hi1

	$UMULL	$nlo,$nj,$m1	; np[1]*m1
	addze	$hi1,$hi1
	$UMULH	$nhi,$nj,$m1

	mtctr	$num
@@ -227,24 +222,22 @@ Louter:
.align	4
Linner:
	$LDX	$aj,$ap,$j	; ap[j]
	$LD	$tj,$BNSZ($tp)	; tp[j]
	addc	$lo0,$alo,$hi0
	$LD	$tj,$BNSZ($tp)	; tp[j]
	addze	$hi0,$ahi
	$LDX	$nj,$np,$j	; np[j]
	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
	addze	$hi0,$hi0
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
	$UMULH	$ahi,$aj,$m0

	addc	$lo1,$nlo,$hi1
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
	addze	$hi1,$nhi
	$UMULH	$ahi,$aj,$m0
	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
	$UMULL	$nlo,$nj,$m1	; np[j]*m1
	addze	$hi0,$hi0
	$UMULH	$nhi,$nj,$m1
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
	addi	$j,$j,$BNSZ	; j++
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	addi	$j,$j,$BNSZ	; j++
	addi	$tp,$tp,$BNSZ	; tp++
	bdnz-	Linner
;Linner