Commit 2c5d4daa authored by Andy Polyakov's avatar Andy Polyakov
Browse files

Yet another "teaser" Montgomery multiplication module, for PowerPC.

parent b4634358
Loading
Loading
Loading
Loading
+9 −8
Original line number Diff line number Diff line
@@ -314,7 +314,7 @@ my %table=(
# *-generic* is endian-neutral target, but ./config is free to
# throw in -D[BL]_ENDIAN, whichever appropriate...
"linux-generic32","gcc:-DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"linux-ppc",	"gcc:-DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL::linux_ppc32.o:::::::::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
# bignum objects for linux-ppc: linux_ppc32.o plus the Montgomery module
# linux_ppc32-mont.o (the name must match the linux_ppc32-mont.s rule).
"linux-ppc",	"gcc:-DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL::linux_ppc32.o linux_ppc32-mont.o:::::::::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
#### IA-32 targets...
"linux-ia32-icc",	"icc:-DL_ENDIAN -DTERMIO -O2 -no_cpprt::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:linux-shared:-KPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"linux-elf",	"gcc:-DL_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
@@ -322,7 +322,7 @@ my %table=(
####
"linux-generic64","gcc:-DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
# -bpowerpc64-linux is transient option, -m64 should be the one to use...
"linux-ppc64",	"gcc:-bpowerpc64-linux -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL::linux_ppc64.o:::::::::::dlfcn:linux-shared:-fPIC:-bpowerpc64-linux:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"linux-ppc64",	"gcc:-bpowerpc64-linux -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL::linux_ppc64.o linux_ppc64-mont.o:::::::::::dlfcn:linux-shared:-fPIC:-bpowerpc64-linux:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"linux-ia64",	"gcc:-DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_UNROLL DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"linux-ia64-ecc","ecc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"linux-ia64-icc","icc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
@@ -407,12 +407,12 @@ my %table=(

#### IBM's AIX.
"aix3-cc",  "cc:-O -DB_ENDIAN -qmaxmem=16384::(unknown):AIX::BN_LLONG RC4_CHAR:::",
"aix-gcc",  "gcc:-O -DB_ENDIAN::-D_THREAD_SAFE:AIX::BN_LLONG RC4_CHAR::aix_ppc32.o:::::::::::dlfcn:",
"aix64-gcc","gcc:-O -DB_ENDIAN::-D_THREAD_SAFE:AIX::SIXTY_FOUR_BIT_LONG RC4_CHAR::aix_ppc64.o:::::::::::dlfcn::::::-X64",
"aix-gcc",  "gcc:-O -DB_ENDIAN::-D_THREAD_SAFE:AIX::BN_LLONG RC4_CHAR::aix_ppc32.o aix_ppc32-mont.o:::::::::::dlfcn:",
"aix64-gcc","gcc:-O -DB_ENDIAN::-D_THREAD_SAFE:AIX::SIXTY_FOUR_BIT_LONG RC4_CHAR::aix_ppc64.o aix_ppc64-mont.o:::::::::::dlfcn::::::-X64",
# Below targets assume AIX 5. Idea is to effectively disregard $OBJECT_MODE
# at build time. $OBJECT_MODE is respected at ./config stage!
"aix-cc",   "cc:-q32 -O -DB_ENDIAN -qmaxmem=16384::-qthreaded:AIX::BN_LLONG RC4_CHAR::aix_ppc32.o:::::::::::dlfcn:aix-shared::-q32:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X 32",
"aix64-cc", "cc:-q64 -O -DB_ENDIAN -qmaxmem=16384::(unknown):AIX::SIXTY_FOUR_BIT_LONG RC4_CHAR::aix_ppc64.o:::::::::::dlfcn:aix-shared::-q64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X 64",
"aix-cc",   "cc:-q32 -O -DB_ENDIAN -qmaxmem=16384::-qthreaded:AIX::BN_LLONG RC4_CHAR::aix_ppc32.o aix_ppc32-mont.o:::::::::::dlfcn:aix-shared::-q32:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X 32",
"aix64-cc", "cc:-q64 -O -DB_ENDIAN -qmaxmem=16384::(unknown):AIX::SIXTY_FOUR_BIT_LONG RC4_CHAR::aix_ppc64.o aix_ppc64-mont.o:::::::::::dlfcn:aix-shared::-q64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X 64",

#
# Cray T90 and similar (SDSC)
@@ -504,9 +504,10 @@ my %table=(

##### MacOS X (a.k.a. Rhapsody or Darwin) setup
"rhapsody-ppc-cc","cc:-O3 -DB_ENDIAN::(unknown):MACOSX_RHAPSODY::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:${no_asm}::",
"darwin-ppc-cc","cc:-O3 -DB_ENDIAN::-D_REENTRANT:MACOSX:-Wl,-search_paths_first:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::osx_ppc32.o:::::::::::dlfcn:darwin-shared:-fPIC -fno-common:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
"darwin-ppc-cc","cc:-O3 -DB_ENDIAN::-D_REENTRANT:MACOSX:-Wl,-search_paths_first:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::osx_ppc32.o osx_ppc32-mont.o:::::::::::dlfcn:darwin-shared:-fPIC -fno-common:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
"darwin64-ppc-cc","cc:-m64 -O3 -DB_ENDIAN::-D_REENTRANT:MACOSX:-Wl,-search_paths_first:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::osx_ppc64.o osx_ppc64-mont.o:::::::::::dlfcn:darwin-shared:-fPIC -fno-common:-m64 -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
"darwin-i386-cc","cc:-O3 -fomit-frame-pointer -DL_ENDIAN::-D_REENTRANT:MACOSX::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:${no_asm}:dlfcn:darwin-shared:-fPIC -fno-common:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
"debug-darwin-ppc-cc","cc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DCRYPTO_MDEBUG -DB_ENDIAN -g -Wall -O::-D_REENTRANT:MACOSX::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::osx_ppc32.o:::::::::::dlfcn:darwin-shared:-fPIC:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
"debug-darwin-ppc-cc","cc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DCRYPTO_MDEBUG -DB_ENDIAN -g -Wall -O::-D_REENTRANT:MACOSX::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::osx_ppc32.o osx_ppc32-mont.o:::::::::::dlfcn:darwin-shared:-fPIC:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",

##### A/UX
"aux3-gcc","gcc:-O2 -DTERMIO::(unknown):AUX:-lbsd:RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:::",
+8 −0
Original line number Diff line number Diff line
@@ -120,6 +120,14 @@ linux_ppc64.s: asm/ppc.pl; $(PERL) $< $@
# PowerPC bignum assembler: asm/ppc.pl is run with the name of the file
# to generate as its only argument.
aix_ppc32.s: asm/ppc.pl;	$(PERL) asm/ppc.pl $@
aix_ppc64.s: asm/ppc.pl;	$(PERL) asm/ppc.pl $@
osx_ppc32.s: asm/ppc.pl;	$(PERL) $< $@
osx_ppc64.s: asm/ppc.pl;	$(PERL) $< $@

# Montgomery multiplication modules: asm/ppc-mont.pl picks the word size
# (32 or 64) from the output file name and pipes its output through
# perlasm/ppc-xlate.pl, which selects the assembler flavour (aix, osx,
# linux) from the same name.
# NOTE(review): the aix rules spell out the script path instead of using
# the first-prerequisite variable -- presumably for the benefit of the
# native AIX make; confirm before unifying the recipes.
linux_ppc32-mont.s: asm/ppc-mont.pl;	$(PERL) $< $@
linux_ppc64-mont.s: asm/ppc-mont.pl;	$(PERL) $< $@
aix_ppc32-mont.s: asm/ppc-mont.pl;	$(PERL) asm/ppc-mont.pl $@
aix_ppc64-mont.s: asm/ppc-mont.pl;	$(PERL) asm/ppc-mont.pl $@
osx_ppc32-mont.s: asm/ppc-mont.pl;	$(PERL) $< $@
osx_ppc64-mont.s: asm/ppc-mont.pl;	$(PERL) $< $@

files:
	$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
+327 −0
Original line number Diff line number Diff line
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================

# April 2006

# "Teaser" Montgomery multiplication module for PowerPC. It's possible
# to gain a bit more by modulo-scheduling outer loop, then dedicated
# squaring procedure should give further 20% and code can be adapted
# for 32-bit application running on 64-bit CPU. As for the latter,
# it won't be able to achieve "native" 64-bit performance, because in
# 32-bit application context every addc instruction will have to be
# expanded as addc, twice right shift by 32 and finally adde, etc.
# So far RSA *sign* performance improvement over pre-bn_mul_mont asm
# for 64-bit application running on PPC970/G5 is:
#
# 512-bit	+65%	
# 1024-bit	+35%
# 2048-bit	+18%
# 4096-bit	+4%

# First command-line argument: name of the assembly file to generate.
$output = shift;

# The word size is taken from the output file name (*32-mont.s or
# *64-mont.s); any other name is rejected before anything is set up.
if    ($output =~ /32\-mont\.s/)	{ $BITS=32; $RZONE=224; }
elsif ($output =~ /64\-mont\.s/)	{ $BITS=64; $RZONE=288; }
else					{ die "nonsense $output"; }

$BNSZ=	$BITS/8;	# bytes per bignum word (step used for ap/np/tp indexing)
$SIZE_T=$BITS/8;	# bytes per saved-register slot in the frame
$FRAME=	$SIZE_T*16;	# fixed frame part; tp[] starts at this offset from sp
# $RZONE is extra room added to the frame size -- presumably the ABI red
# zone; TODO confirm.

# Width-specific mnemonics, in this order: load, load and update, load
# indexed, store, store and update, store indexed, store indexed and
# update, unsigned multiply low, unsigned multiply high, unsigned compare.
($LD,$LDU,$LDX,$ST,$STU,$STX,$STUX,$UMULL,$UMULH,$UCMP) = ($BITS==32)
	? qw(lwz lwzu lwzx stw stwu stwx stwux mullw mulhwu cmplw)
	: qw(ld  ldu  ldx  std stdu stdx stdux mulld mulhdu cmpld);

# Register save/restore use the plain store/load of the native width.
$PUSH=	$ST;
$POP=	$LD;

# With only the output name on the command line, everything printed to
# STDOUT is piped through the ppc-xlate.pl translator. Supplying any
# second argument leaves STDOUT untouched, so the untranslated text is
# emitted instead.
unless (defined shift) {
	open STDOUT,"| $^X ../perlasm/ppc-xlate.pl $output"
		or die "can't call ../perlasm/ppc-xlate.pl: $!";
}

# Register allocation used by the assembly template.
($sp,$toc)	= ("r1","r2");
# r3 holds the result pointer on entry; the code moves it to $rp (r9)
# right away so that r3 can be reused as $ovf.
($ovf,$rp)	= ("r3","r9");
($ap,$bp,$np,$n0,$num)	= map { "r$_" } (4..8);
($aj,$nj,$tj)	= map { "r$_" } (10..12);
# non-volatile registers (saved and restored by the generated code)
($i,$j,$tp,$m0,$m1,$lo0,$hi0,$lo1,$hi1,$alo,$ahi,$nlo)
		= map { "r$_" } (14..25);
#
$nhi="r0";

# Assembly template for .bn_mul_mont. Perl variables are interpolated
# here; expressions between backticks are left in the text and evaluated
# by the substitution applied to $code after the template.
#
# Outline of the generated routine, as far as the template shows:
#  - returns with r3=0 straight away when $num is below 4, and with r3=1
#    on the normal exit path;
#  - carves a frame of $FRAME+$RZONE+num*$BNSZ bytes below $sp, rounded
#    down with a -4096 mask, and keeps the temporary vector tp[] at
#    $sp+$FRAME;
#  - saves r14-r25 in slots 4..15 of the frame and restores them at Lexit;
#  - L1st handles bp[0], Louter/Linner handle the remaining words of bp;
#  - Lsub stores tp[]-np[] to rp, Lcopy stores tp[] to rp; tp[] is zeroed
#    on both paths (in Lcopy itself, or in Lzap after Lsub).
# ';' introduces a comment inside the template.
$code=<<___;
.text

.globl	.bn_mul_mont
.align	4
.bn_mul_mont:
	cmpwi	$num,4
	mr	$rp,r3		; $rp is reassigned
	li	r3,0
	bltlr

	slwi	$num,$num,`log($BNSZ)/log(2)`
	li	$tj,-4096
	addi	$ovf,$num,`$FRAME+$RZONE`
	subf	$ovf,$ovf,$sp	; $sp-$ovf
	and	$ovf,$ovf,$tj	; minimize TLB usage
	subf	$ovf,$sp,$ovf	; $ovf-$sp
	srwi	$num,$num,`log($BNSZ)/log(2)`
	$STUX	$sp,$sp,$ovf

	$PUSH	r14,`4*$SIZE_T`($sp)
	$PUSH	r15,`5*$SIZE_T`($sp)
	$PUSH	r16,`6*$SIZE_T`($sp)
	$PUSH	r17,`7*$SIZE_T`($sp)
	$PUSH	r18,`8*$SIZE_T`($sp)
	$PUSH	r19,`9*$SIZE_T`($sp)
	$PUSH	r20,`10*$SIZE_T`($sp)
	$PUSH	r21,`11*$SIZE_T`($sp)
	$PUSH	r22,`12*$SIZE_T`($sp)
	$PUSH	r23,`13*$SIZE_T`($sp)
	$PUSH	r24,`14*$SIZE_T`($sp)
	$PUSH	r25,`15*$SIZE_T`($sp)

	$LD	$n0,0($n0)	; pull n0[0] value
	addi	$num,$num,-2	; adjust $num for counter register

	$LD	$m0,0($bp)	; m0=bp[0]
	$LD	$aj,0($ap)	; ap[0]
	addi	$tp,$sp,$FRAME
	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[0]
	$UMULH	$hi0,$aj,$m0

	$LD	$aj,$BNSZ($ap)	; ap[1]
	$LD	$nj,0($np)	; np[0]

	$UMULL	$m1,$lo0,$n0	; "tp[0]"*n0

	$UMULL	$alo,$aj,$m0	; ap[1]*bp[0]
	$UMULH	$ahi,$aj,$m0

	$UMULL	$lo1,$nj,$m1	; np[0]*m1
	$UMULH	$hi1,$nj,$m1
	$LD	$nj,$BNSZ($np)	; np[1]
	addc	$lo1,$lo1,$lo0
	addze	$hi1,$hi1

	$UMULL	$nlo,$nj,$m1	; np[1]*m1
	$UMULH	$nhi,$nj,$m1

	mtctr	$num
	li	$j,`2*$BNSZ`
.align	4
L1st:
	$LDX	$aj,$ap,$j	; ap[j]
	$LDX	$nj,$np,$j	; np[j]
	addc	$lo0,$alo,$hi0
	addze	$hi0,$ahi
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[0]
	$UMULH	$ahi,$aj,$m0

	addc	$lo1,$nlo,$hi1
	addze	$hi1,$nhi
	$UMULL	$nlo,$nj,$m1	; np[j]*m1
	$UMULH	$nhi,$nj,$m1
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	addi	$j,$j,$BNSZ	; j++
	addi	$tp,$tp,$BNSZ	; tp++
	bdnz-	L1st
;L1st
	addc	$lo0,$alo,$hi0
	addze	$hi0,$ahi

	addc	$lo1,$nlo,$hi1
	addze	$hi1,$nhi
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	li	$ovf,0
	addc	$hi1,$hi1,$hi0
	addze	$ovf,$ovf	; upmost overflow bit
	$ST	$hi1,$BNSZ($tp)

	li	$i,$BNSZ
.align	4
Louter:
	$LDX	$m0,$bp,$i	; m0=bp[i]
	$LD	$aj,0($ap)	; ap[0]
	addi	$tp,$sp,$FRAME
	$LD	$tj,$FRAME($sp)	; tp[0]
	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[i]
	$UMULH	$hi0,$aj,$m0
	$LD	$aj,$BNSZ($ap)	; ap[1]
	$LD	$nj,0($np)	; np[0]
	addc	$lo0,$lo0,$tj	; ap[0]*bp[i]+tp[0]
	addze	$hi0,$hi0

	$UMULL	$m1,$lo0,$n0	; tp[0]*n0

	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
	$UMULH	$ahi,$aj,$m0

	$UMULL	$lo1,$nj,$m1	; np[0]*m1
	$UMULH	$hi1,$nj,$m1
	$LD	$nj,$BNSZ($np)	; np[1]
	addc	$lo1,$lo1,$lo0
	addze	$hi1,$hi1

	$UMULL	$nlo,$nj,$m1	; np[1]*m1
	$UMULH	$nhi,$nj,$m1

	mtctr	$num
	li	$j,`2*$BNSZ`
.align	4
Linner:
	$LDX	$aj,$ap,$j	; ap[j]
	$LD	$tj,$BNSZ($tp)	; tp[j]
	addc	$lo0,$alo,$hi0
	addze	$hi0,$ahi
	$LDX	$nj,$np,$j	; np[j]
	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
	addze	$hi0,$hi0
	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
	$UMULH	$ahi,$aj,$m0

	addc	$lo1,$nlo,$hi1
	addze	$hi1,$nhi
	$UMULL	$nlo,$nj,$m1	; np[j]*m1
	$UMULH	$nhi,$nj,$m1
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	addi	$j,$j,$BNSZ	; j++
	addi	$tp,$tp,$BNSZ	; tp++
	bdnz-	Linner
;Linner
	$LD	$tj,$BNSZ($tp)	; tp[j]
	addc	$lo0,$alo,$hi0
	addze	$hi0,$ahi
	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
	addze	$hi0,$hi0

	addc	$lo1,$nlo,$hi1
	addze	$hi1,$nhi
	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
	addze	$hi1,$hi1
	$ST	$lo1,0($tp)	; tp[j-1]

	addic	$ovf,$ovf,-1	; move upmost overflow to XER[CA]
	li	$ovf,0
	adde	$hi1,$hi1,$hi0
	addze	$ovf,$ovf
	$ST	$hi1,$BNSZ($tp)
;
	slwi	$tj,$num,`log($BNSZ)/log(2)`
	$UCMP	$i,$tj
	addi	$i,$i,$BNSZ
	ble-	Louter

	addi	$num,$num,2	; restore $num
	addi	$tp,$sp,$FRAME
	mtctr	$num
	li	$j,0

	subfc.	$ovf,$j,$ovf	; sets XER[CA]
	bne	Lsub
	$UCMP	$hi1,$nj
	bge	Lsub
.align	4
Lcopy:
	$LDX	$tj,$tp,$j
	$STX	$tj,$rp,$j
	$STX	$j,$tp,$j	; zap at once
	addi	$j,$j,$BNSZ
	bdnz-	Lcopy

Lexit:
	$POP	r14,`4*$SIZE_T`($sp)
	$POP	r15,`5*$SIZE_T`($sp)
	$POP	r16,`6*$SIZE_T`($sp)
	$POP	r17,`7*$SIZE_T`($sp)
	$POP	r18,`8*$SIZE_T`($sp)
	$POP	r19,`9*$SIZE_T`($sp)
	$POP	r20,`10*$SIZE_T`($sp)
	$POP	r21,`11*$SIZE_T`($sp)
	$POP	r22,`12*$SIZE_T`($sp)
	$POP	r23,`13*$SIZE_T`($sp)
	$POP	r24,`14*$SIZE_T`($sp)
	$POP	r25,`15*$SIZE_T`($sp)
	$POP	$sp,0($sp)
	li	r3,1
	blr
	.long	0
.align	4
Lsub:	$LDX	$tj,$tp,$j
	$LDX	$nj,$np,$j
	subfe	$tj,$nj,$tj	; tp[j]-np[j]
	$STX	$tj,$rp,$j
	addi	$j,$j,$BNSZ
	bdnz-	Lsub
	li	$j,0
	subfe.	$ovf,$j,$ovf
	mtctr	$num
	bne	Lcopy
.align	4
Lzap:	$STX	$j,$tp,$j
	addi	$j,$j,$BNSZ
	bdnz-	Lzap
	b	Lexit
___

# Evaluate every `...` expression embedded in the template (frame offsets,
# shift counts) and splice the result in place of the backticked text.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# STDOUT may be a pipe to ppc-xlate.pl: close is where a failed translator
# or a write error becomes visible, so do not let it pass silently and
# leave a truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $! (status $?)";
+113 −0
Original line number Diff line number Diff line
#!/usr/bin/env perl

# PowerPC assembler distiller by <appro>.

# Name of the file to generate; it also selects the assembler flavour.
my $output = shift;
# "or" rather than "||": with "||" the die was attached to the file name
# string (always true), so a failed open was never reported.
open STDOUT,">$output" or die "can't open $output: $!";

my $flavour = $output;	# matched against /aix/, /osx/, /linux.*32/, /linux.*64/
my %GLOBALS;		# global name as written in the source -> name as emitted
my $dotinlocallabels=0;	# set when local labels have to be spelled .Lname

################################################################
# directives which need special treatment on different platforms
################################################################
# Handler for the .globl directive.
#   arguments: mnemonic suffix (ignored), symbol name as written in the source
#   returns:   the directive text to emit for the current flavour
# Side effects: records the emitted spelling of the symbol in %GLOBALS
# (keyed by the name as written) and, for linux flavours, switches local
# labels to the .L form.
my $globl = sub {
    my ($junk, $name) = @_;
    my $global = \$GLOBALS{$name};	# slot keyed by the unmodified name
    my $ret;

    # drop a leading "." or "_" before applying the flavour's decoration
    $name =~ s|^[\.\_]||;

    if ($flavour =~ /aix/) {
	$name = ".$name";
    }
    elsif ($flavour =~ /osx/) {
	$name = "_$name";
    }
    elsif ($flavour =~ /linux.*32/) {
	$ret = join('',
		".globl	$name\n",
		".type	$name,\@function");
	$dotinlocallabels = 1;
    }
    elsif ($flavour =~ /linux.*64/) {
	# the code symbol gets a leading dot; the undotted name labels a
	# three-quad entry placed in the .opd section
	$ret = join('',
		".globl	.$name\n",
		".type	.$name,\@function\n",
		".section	\".opd\",\"aw\"\n",
		".globl	$name\n",
		".align	3\n",
		"$name:\n",
		".quad	.$name,.TOC.\@tocbase,0\n",
		".size	$name,24\n",
		".previous\n");

	$name = ".$name";
	$dotinlocallabels = 1;
    }

    # flavours without special text get a plain .globl
    $ret = ".globl	$name" if (!$ret);
    $$global = $name;
    $ret;
};
# Handler for the .machine directive.
#   arguments: mnemonic suffix (ignored), architecture name
#   returns:   the .machine line; "any" is replaced by "ppc970" when the
#              flavour is osx
my $machine = sub {
    my ($junk, $arch) = @_;
    if ($arch eq "any" and $flavour =~ /osx/) {
	$arch = "ppc970";
    }
    return ".machine	$arch";
};

################################################################
# simplified mnemonics not handled by at least one assembler
################################################################
# Expands the simplified mnemonic cmplw into the generic cmpl form.
#   arguments: mnemonic suffix, then the comma-split operands
#   returns:   "cmpl<suffix> cr,0,operands..."; the condition-register
#              field defaults to 0 unless three operands were supplied
# The constant second field is 0 here and 1 in the cmpld handler
# (presumably the operand-width field of cmpl; confirm against the ISA).
my $cmplw = sub {
    my ($f, @operands) = @_;
    my $cr = (@operands > 2) ? shift(@operands) : 0;
    return "	cmpl$f	" . join(',', $cr, 0, @operands);
};
# Expands the simplified mnemonic cmpld into the generic cmpl form.
#   arguments: mnemonic suffix, then the comma-split operands
#   returns:   "cmpl<suffix> cr,1,operands..."; the condition-register
#              field defaults to 0 unless three operands were supplied
# The constant second field is 1 here and 0 in the cmplw handler
# (presumably the operand-width field of cmpl; confirm against the ISA).
my $cmpld = sub {
    my ($f, @operands) = @_;
    my $cr = (@operands > 2) ? shift(@operands) : 0;
    return "	cmpl$f	" . join(',', $cr, 1, @operands);
};
# Expands the simplified mnemonic bdnz into the generic bc form.
#   arguments: mnemonic suffix, branch target
#   returns:   "bc BO,0,target" with BO=17 when a "+" or "-" hint suffix
#              is present and BO=16 otherwise
my $bdnz = sub {
    my ($f, $target) = @_;
    my $bo = 16;
    $bo = 17 if ($f =~ /[\+\-]/);
    return "	bc	$bo,0," . $target;
};

# Main translation loop: reads the perlasm text line by line, strips
# comments and surrounding blanks, normalises local labels, prints a
# leading label (if any) and then the directive or instruction, rewritten
# by one of the handlers above when one exists for its mnemonic.
# Every input line produces exactly one output line (possibly empty).
while($line=<>) {

    $line =~ s|[#!;].*$||;	# get rid of asm-style comments...
    $line =~ s|/\*.*\*/||;	# ... and C-style comments...
    $line =~ s|^\s+||;		# ... and skip white spaces in beginning...
    $line =~ s|\s+$||;		# ... and at the end

    # NOTE(review): the bare blocks below look deliberate. The later steps
    # read $1..$3 even when their own substitution did not match, so the
    # blocks presumably keep captures of one step from leaking into the
    # next -- confirm before flattening or reordering.
    {
	$line =~ s|\b\.L(\w+)|L$1|g;	# common denominator for Locallabel
	# flavours that want dotted local labels get the dot (re)added
	$line =~ s|\bL(\w+)|\.L$1|g	if ($dotinlocallabels);
    }

    {
	# Peel off a leading "label:". A label registered through .globl is
	# printed under the spelling stored in %GLOBALS, others as written.
	$line =~ s|(^[\.\w]+)\:\s*||;
	my $label = $1;
	printf "%s:",($GLOBALS{$label} or $label) if ($label);
    }

    {
	# Split off an optional leading dot, the mnemonic and an optional
	# ".", "+" or "-" suffix; what remains in $line is the operand list.
	$line =~ s|^\s*(\.?)(\w+)([\.\+\-]?)\s*||;
	my $c = $1; $c = "\t" if ($c eq "");
	my $mnemonic = $2;
	my $f = $3;
	# Look up a variable named after the mnemonic ($globl, $machine,
	# $cmplw, ...). If it holds a code ref, call it with the suffix and
	# the comma-split operands; otherwise re-emit the statement as is.
	my $opcode = eval("\$$mnemonic");
	if (ref($opcode) eq 'CODE') { $line = &$opcode($f,split(',',$line)); }
	elsif ($mnemonic)           { $line = $c.$mnemonic.$f."\t".$line; }
    }

    print $line if ($line);
    print "\n";
}

close STDOUT;