reduce buffer size to 8 bytes

instead of buffering an entire block, buffer only when the input is not
aligned to 8 bytes, and otherwise xor uint64-sized chunks directly into
the state.

the code is a little more complicated but i think it's worth it.
we could eliminate the buffer entirely, but that would require either
shenanigans with unsafe or fiddly code to xor partial uint64s.

a caveat is that the implementation now only supports sponge capacities
that are a multiple of 8 bytes (64 bits), since input is absorbed in
whole uint64 words. that's fine for the standard instantiations
but may restrict unusual applications.

not only does this let us reduce the buffer from 200 bytes to 8 bytes,
it also provides a nice speedup:

name      old time/op    new time/op    delta
256_8-2     1.45µs ± 0%    1.28µs ± 1%  -11.58%  (p=0.000 n=10+10)
256_1k-2    10.1µs ± 0%     9.3µs ± 0%   -7.67%  (p=0.000 n=10+10)
256_8k-2    75.6µs ± 0%    70.2µs ± 1%   -7.09%  (p=0.000 n=10+10)
512_8-2     1.39µs ± 1%    1.29µs ± 1%   -6.85%  (p=0.000 n=10+10)
512_1k-2    18.7µs ± 0%    17.0µs ± 0%   -8.70%   (p=0.000 n=9+10)
512_8k-2     146µs ± 1%     129µs ± 0%  -11.70%   (p=0.000 n=10+9)

name      old speed      new speed      delta
256_8-2   5.53MB/s ± 0%  6.25MB/s ± 0%  +13.06%  (p=0.000 n=10+10)
256_1k-2   102MB/s ± 0%   110MB/s ± 0%   +8.30%  (p=0.000 n=10+10)
256_8k-2   108MB/s ± 0%   117MB/s ± 1%   +7.64%  (p=0.000 n=10+10)
512_8-2   5.78MB/s ± 1%  6.20MB/s ± 1%   +7.32%  (p=0.000 n=10+10)
512_1k-2  54.9MB/s ± 0%  60.1MB/s ± 0%   +9.53%   (p=0.000 n=9+10)
512_8k-2  56.1MB/s ± 1%  63.5MB/s ± 0%  +13.26%   (p=0.000 n=10+9)
shake^2
magical 2024-10-06 00:42:36 -07:00
parent 70a9bfa87d
commit 73654f751c
1 changed files with 40 additions and 21 deletions

View File

@ -11,7 +11,8 @@ func round(a *[25]uint64) { roundGo(a) }
// digest implements hash.Hash
type digest struct {
a [25]uint64 // a[y][x][z]
buf [200]byte
buf [8]byte // buf[0:ulen] holds a partial uint64
ulen int8
dsbyte byte
len int
size int
@ -29,34 +30,46 @@ func (d *digest) BlockSize() int { return 200 - d.size*2 }
func (d *digest) Reset() {
//fmt.Println("resetting")
d.a = [25]uint64{}
d.buf = [200]byte{}
d.buf = [8]byte{}
d.len = 0
}
func (d *digest) Write(b []byte) (int, error) {
written := len(b)
bs := d.BlockSize()
for len(b) > 0 {
n := copy(d.buf[d.len:bs], b)
d.len += n
bs := d.BlockSize() / 8
// fill buf first, if non-empty
if d.ulen > 0 {
n := copy(d.buf[d.ulen:], b)
b = b[n:]
d.ulen += int8(n)
// flush?
if int(d.ulen) == len(d.buf) {
d.a[d.len] ^= le64dec(d.buf[:])
d.len += 1
d.ulen = 0
if d.len == bs {
d.flush()
}
}
}
// xor 8-byte chunks into the state
for len(b) >= 8 {
d.a[d.len] ^= le64dec(b)
b = b[8:]
d.len += 1
if d.len == bs {
d.flush()
}
} // len(b) < 8
// store any remaining bytes
if len(b) > 0 {
d.ulen = int8(copy(d.buf[:], b))
}
return written, nil
}
func (d *digest) flush() {
//fmt.Printf("Flushing with %d bytes\n", d.len)
b := d.buf[:d.len]
for i := range d.a {
if len(b) == 0 {
break
}
d.a[i] ^= le64dec(b)
b = b[8:]
}
//fmt.Printf("Flushing with %d bytes\n", d.len*8 + int(d.ulen))
keccakf(&d.a)
d.len = 0
}
@ -75,13 +88,19 @@ func (d *digest) clone() *digest {
func (d *digest) Sum(b []byte) []byte {
d = d.clone()
d.buf[d.len] = d.dsbyte
bs := d.BlockSize()
for i := d.len + 1; i < bs; i++ {
d.buf[i] = 0
if d.ulen == 0 {
d.a[d.len] ^= uint64(d.dsbyte)
} else {
d.buf[d.ulen] = d.dsbyte
for i := int(d.ulen) + 1; i < len(d.buf); i++ {
d.buf[i] = 0
}
d.a[d.len] ^= le64dec(d.buf[:])
}
d.buf[bs-1] |= 0x80
d.len = bs
bs := d.BlockSize() / 8
d.a[bs-1] |= 0x80 << 56
//d.len = bs
d.flush()
for i := 0; i < d.size/8; i++ {