Commit b943b7d2 authored by Andy Polyakov
Browse files

md5/asm/md5-[586|x86_64].pl: +15% on Atom.

[MD5 is hardly relevant, just cleaning up repository]
parent 496f2b14
Loading
Loading
Loading
Loading
+8 −9
Original line number Diff line number Diff line
@@ -56,14 +56,14 @@ sub R0
	&lea($a,&DWP($t,$a,$tmp2,1));

	&xor($tmp1,$d); # F function - part 4
	&mov($tmp2,&DWP($xo[$ki+1]*4,$K,"",0)) if ($pos != 2);

	&add($a,$tmp1);
	&mov($tmp1,&Np($c)) if $pos < 1;	# next tmp1 for R0
	&mov($tmp1,&Np($c)) if $pos == 1;	# next tmp1 for R1

	&rotl($a,$s);

	&mov($tmp2,&DWP($xo[$ki+1]*4,$K,"",0)) if ($pos != 2);
	&mov($tmp1,&Np($c)) if $pos < 1;	# next tmp1 for R0
	&mov($tmp1,&Np($c)) if $pos == 1;	# next tmp1 for R1

	&add($a,$b);
	}
@@ -74,13 +74,12 @@ sub R1

	&comment("R1 $ki");

	&lea($a,&DWP($t,$a,$tmp2,1));

	&xor($tmp1,$b); # G function - part 2
	&and($tmp1,$d); # G function - part 3
	&lea($a,&DWP($t,$a,$tmp2,1));

	&mov($tmp2,&DWP($xo[$ki+1]*4,$K,"",0)) if ($pos != 2);
	&xor($tmp1,$c);			# G function - part 4
	&mov($tmp2,&DWP($xo[$ki+1]*4,$K,"",0)) if ($pos != 2);

	&add($a,$tmp1);
	&mov($tmp1,&Np($c)) if $pos < 1;	# G function - part 1
@@ -108,10 +107,10 @@ if (($n & 1) == 0)
	&lea($a,&DWP($t,$a,$tmp2,1));

	&add($a,$tmp1);
	&mov($tmp2,&DWP($xo[$ki+1]*4,$K,"",0));

	&rotl($a,$s);

	&mov($tmp2,&DWP($xo[$ki+1]*4,$K,"",0));
	&mov($tmp1,&Np($c));
	}
else
@@ -120,11 +119,11 @@ else
	# make sure to do 'D' first, not 'B', else we clash with
	# the last add from the previous round.

	&lea($a,&DWP($t,$a,$tmp2,1));

	&add($b,$c);			# MOVED FORWARD
	&xor($tmp1,$d); # H function - part 2

	&lea($a,&DWP($t,$a,$tmp2,1));

	&xor($tmp1,$b); # H function - part 3
	&mov($tmp2,&DWP($xo[$ki+1]*4,$K,"",0)) if ($pos != 2);

+12 −1
Original line number Diff line number Diff line
@@ -47,8 +47,8 @@ sub round2_step
    $code .= " mov	%edx,		%r12d		/* (NEXT STEP) z' = %edx */\n" if ($pos == -1);
    $code .= <<EOF;
	not	%r11d				/* not z */
	lea	$T_i($dst,%r10d),$dst		/* Const + dst + ... */
	and	$x,		%r12d		/* x & z */
	lea	$T_i($dst,%r10d),$dst		/* Const + dst + ... */
	and	$y,		%r11d		/* y & (not z) */
	mov	$k_next*4(%rsi),%r10d		/* (NEXT STEP) X[$k_next] */
	or	%r11d,		%r12d		/* (y & (not z)) | (x & z) */
@@ -65,6 +65,7 @@ EOF
#   %r10d = X[k_next]
#   %r11d = y' (copy of y for the next step)
# Each round3_step() takes about 4.2 clocks (8 instructions, 1.9 IPC)
{ my $round3_alter=0;
sub round3_step
{
    my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
@@ -75,10 +76,20 @@ sub round3_step
	mov	$k_next*4(%rsi),%r10d		/* (NEXT STEP) X[$k_next] */
	xor	$x,		%r11d		/* x ^ ... */
	add	%r11d,		$dst		/* dst += ... */
EOF
    $code .= <<EOF if ($round3_alter);
	rol	\$$s,		$dst		/* dst <<< s */
	mov	$x,		%r11d		/* (NEXT STEP) y' = $x */
EOF
    $code .= <<EOF if (!$round3_alter);
	mov	$x,		%r11d		/* (NEXT STEP) y' = $x */
	rol	\$$s,		$dst		/* dst <<< s */
EOF
    $code .= <<EOF;
	add	$x,		$dst		/* dst += x */
EOF
    $round3_alter^=1;
}
}

# round4_step() does: