Commit 1c3d2b94 authored by Andy Polyakov's avatar Andy Polyakov
Browse files

This is "informational" commit. Its mere purpose is to expose "modulo

factor" in inner loops.
parent 48d2335d
Loading
Loading
Loading
Loading
+190 −12
Original line number Diff line number Diff line
@@ -286,19 +286,16 @@ $fname:
	!or	%o7,%o0,%o0		! 64-bit result
	srlx	%o3,16,%g1		! 34-bit carry

	ba	.L1st
	add	$j,8,$j
.align	32
.L1st:
	add	$ap,$j,%o3
	add	$np,$j,%o4
	ld	[%o3+0],$alo_	! load a[j] as pair of 32-bit words
	add	$ap,$j,%o4
	add	$np,$j,%o5
	ld	[%o4+0],$alo_	! load a[j] as pair of 32-bit words
	fzeros	$alo
	ld	[%o3+4],$ahi_
	ld	[%o4+4],$ahi_
	fzeros	$ahi
	ld	[%o4+0],$nlo_	! load n[j] as pair of 32-bit words
	ld	[%o5+0],$nlo_	! load n[j] as pair of 32-bit words
	fzeros	$nlo
	ld	[%o4+4],$nhi_
	ld	[%o5+4],$nhi_
	fzeros	$nhi

	fxtod	$alo,$alo
@@ -350,6 +347,11 @@ $fname:
	std	$nlob,[%sp+$bias+$frame+8]
	std	$nloc,[%sp+$bias+$frame+16]
	std	$nlod,[%sp+$bias+$frame+24]

	addcc	$j,8,$j
	bz,pn	%icc,.L1stskip
.align	32,0x1000000
.L1st:
	ldx	[%sp+$bias+$frame+0],%o0
	ldx	[%sp+$bias+$frame+8],%o1
	ldx	[%sp+$bias+$frame+16],%o2
@@ -376,9 +378,101 @@ $fname:
	add	%g1,1,%g1

	stx	%o0,[$tp]		! tp[j-1]=


	add	$ap,$j,%o4
	add	$np,$j,%o5
	ld	[%o4+0],$alo_	! load a[j] as pair of 32-bit words
	fzeros	$alo
	ld	[%o4+4],$ahi_
	fzeros	$ahi
	ld	[%o5+0],$nlo_	! load n[j] as pair of 32-bit words
	fzeros	$nlo
	ld	[%o5+4],$nhi_
	fzeros	$nhi

	fxtod	$alo,$alo
	fxtod	$ahi,$ahi
	fxtod	$nlo,$nlo
	fxtod	$nhi,$nhi

	std	$alo,[$ap_l+$j]		! save smashed ap[j] in double format
		fmuld	$alo,$ba,$aloa
	std	$ahi,[$ap_h+$j]
		fmuld	$nlo,$na,$nloa
	std	$nlo,[$np_l+$j]		! save smashed np[j] in double format
		fmuld	$alo,$bb,$alob
	std	$nhi,[$np_h+$j]
		fmuld	$nlo,$nb,$nlob
		fmuld	$alo,$bc,$aloc
	faddd	$aloa,$nloa,$nloa
		fmuld	$nlo,$nc,$nloc
		fmuld	$alo,$bd,$alod
	faddd	$alob,$nlob,$nlob
		fmuld	$nlo,$nd,$nlod
		fmuld	$ahi,$ba,$ahia
	faddd	$aloc,$nloc,$nloc
		fmuld	$nhi,$na,$nhia
		fmuld	$ahi,$bb,$ahib
	faddd	$alod,$nlod,$nlod
		fmuld	$nhi,$nb,$nhib
		fmuld	$ahi,$bc,$ahic
	faddd	$ahia,$nhia,$nhia
		fmuld	$nhi,$nc,$nhic
		fmuld	$ahi,$bd,$ahid
	faddd	$ahib,$nhib,$nhib
		fmuld	$nhi,$nd,$nhid

	faddd	$dota,$nloa,$nloa
	faddd	$dotb,$nlob,$nlob
	faddd	$ahic,$nhic,$dota	! $nhic
	faddd	$ahid,$nhid,$dotb	! $nhid

	faddd	$nloc,$nhia,$nloc
	faddd	$nlod,$nhib,$nlod

	fdtox	$nloa,$nloa
	fdtox	$nlob,$nlob
	fdtox	$nloc,$nloc
	fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	std	$nlob,[%sp+$bias+$frame+8]
	std	$nloc,[%sp+$bias+$frame+16]
	std	$nlod,[%sp+$bias+$frame+24]

	addcc	$j,8,$j
	bnz,pt	%icc,.L1st
	add	$tp,8,$tp

.L1stskip:
	ldx	[%sp+$bias+$frame+0],%o0
	ldx	[%sp+$bias+$frame+8],%o1
	ldx	[%sp+$bias+$frame+16],%o2
	ldx	[%sp+$bias+$frame+24],%o3

	srlx	%o0,16,%o7
	add	%o7,%o1,%o1
	srlx	%o1,16,%o7
	add	%o7,%o2,%o2
	srlx	%o2,16,%o7
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,$mask,%o0
	and	%o1,$mask,%o1
	and	%o2,$mask,%o2
	sllx	%o1,16,%o1
	sllx	%o2,32,%o2
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	or	%o2,%o0,%o0
	or	%o7,%o0,%o0		! 64-bit result
	addcc	%g1,%o0,%o0
	srlx	%o3,16,%g1		! 34-bit carry
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[$tp]		! tp[j-1]=
	add	$tp,8,$tp

	fdtox	$dota,$dota
	fdtox	$dotb,$dotb
@@ -514,10 +608,7 @@ $fname:
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	ba	.Linner
	add	$j,8,$j
.align	32
.Linner:
	ldd	[$ap_l+$j],$alo		! load a[j] in double format
	ldd	[$ap_h+$j],$ahi
	ldd	[$np_l+$j],$nlo		! load n[j] in double format
@@ -563,6 +654,11 @@ $fname:
	std	$nlob,[%sp+$bias+$frame+8]
	std	$nloc,[%sp+$bias+$frame+16]
	std	$nlod,[%sp+$bias+$frame+24]

	addcc	$j,8,$j
	bz,pn	%icc,.Linnerskip
.align	32,0x1000000
.Linner:
	ldx	[%sp+$bias+$frame+0],%o0
	ldx	[%sp+$bias+$frame+8],%o1
	ldx	[%sp+$bias+$frame+16],%o2
@@ -594,9 +690,91 @@ $fname:
	add	%g1,1,%g1

	stx	%o0,[$tp]		! tp[j-1]


	ldd	[$ap_l+$j],$alo		! load a[j] in double format
	ldd	[$ap_h+$j],$ahi
	ldd	[$np_l+$j],$nlo		! load n[j] in double format
	ldd	[$np_h+$j],$nhi

		fmuld	$alo,$ba,$aloa
		fmuld	$nlo,$na,$nloa
		fmuld	$alo,$bb,$alob
		fmuld	$nlo,$nb,$nlob
		fmuld	$alo,$bc,$aloc
	faddd	$aloa,$nloa,$nloa
		fmuld	$nlo,$nc,$nloc
		fmuld	$alo,$bd,$alod
	faddd	$alob,$nlob,$nlob
		fmuld	$nlo,$nd,$nlod
		fmuld	$ahi,$ba,$ahia
	faddd	$aloc,$nloc,$nloc
		fmuld	$nhi,$na,$nhia
		fmuld	$ahi,$bb,$ahib
	faddd	$alod,$nlod,$nlod
		fmuld	$nhi,$nb,$nhib
		fmuld	$ahi,$bc,$ahic
	faddd	$ahia,$nhia,$nhia
		fmuld	$nhi,$nc,$nhic
		fmuld	$ahi,$bd,$ahid
	faddd	$ahib,$nhib,$nhib
		fmuld	$nhi,$nd,$nhid

	faddd	$dota,$nloa,$nloa
	faddd	$dotb,$nlob,$nlob
	faddd	$ahic,$nhic,$dota	! $nhic
	faddd	$ahid,$nhid,$dotb	! $nhid

	faddd	$nloc,$nhia,$nloc
	faddd	$nlod,$nhib,$nlod

	fdtox	$nloa,$nloa
	fdtox	$nlob,$nlob
	fdtox	$nloc,$nloc
	fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	std	$nlob,[%sp+$bias+$frame+8]
	std	$nloc,[%sp+$bias+$frame+16]
	std	$nlod,[%sp+$bias+$frame+24]

	addcc	$j,8,$j
	bnz,pt	%icc,.Linner
	add	$tp,8,$tp

.Linnerskip:
	ldx	[%sp+$bias+$frame+0],%o0
	ldx	[%sp+$bias+$frame+8],%o1
	ldx	[%sp+$bias+$frame+16],%o2
	ldx	[%sp+$bias+$frame+24],%o3

	srlx	%o0,16,%o7
	add	%o7,%o1,%o1
	srlx	%o1,16,%o7
	add	%o7,%o2,%o2
	srlx	%o2,16,%o7
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,$mask,%o0
	and	%o1,$mask,%o1
	and	%o2,$mask,%o2
	sllx	%o1,16,%o1
	sllx	%o2,32,%o2
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	or	%o2,%o0,%o0
	or	%o7,%o0,%o0		! 64-bit result
	addcc	%g1,%o0,%o0
	srlx	%o3,16,%g1		! 34-bit carry
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	ldx	[$tp+8],%o7		! tp[j]
	addcc	%o7,%o0,%o0
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[$tp]		! tp[j-1]
	add	$tp,8,$tp

	fdtox	$dota,$dota
	fdtox	$dotb,$dotb