reduce buffer size to 8 bytes

instead of buffering an entire block, buffer only when the input is not
aligned to 8 bytes, and otherwise xor uint64-sized chunks directly into
the state.

the code is a little more complicated, but i think it's worth it.
we could eliminate the buffer entirely, but that would require either
shenanigans with unsafe or fiddly code to xor partial uint64s.

a caveat is that the implementation now only supports sponge capacities
(and therefore rates) that are a multiple of 8 bytes, because input is
absorbed one uint64 at a time. that's fine for the standard instantiations
but may restrict unusual applications.

not only does this let us reduce the buffer from 200 bytes to 8 bytes,
it also provides a nice speedup:

name      old time/op    new time/op    delta
256_8-2     1.45µs ± 0%    1.28µs ± 1%  -11.58%  (p=0.000 n=10+10)
256_1k-2    10.1µs ± 0%     9.3µs ± 0%   -7.67%  (p=0.000 n=10+10)
256_8k-2    75.6µs ± 0%    70.2µs ± 1%   -7.09%  (p=0.000 n=10+10)
512_8-2     1.39µs ± 1%    1.29µs ± 1%   -6.85%  (p=0.000 n=10+10)
512_1k-2    18.7µs ± 0%    17.0µs ± 0%   -8.70%   (p=0.000 n=9+10)
512_8k-2     146µs ± 1%     129µs ± 0%  -11.70%   (p=0.000 n=10+9)

name      old speed      new speed      delta
256_8-2   5.53MB/s ± 0%  6.25MB/s ± 0%  +13.06%  (p=0.000 n=10+10)
256_1k-2   102MB/s ± 0%   110MB/s ± 0%   +8.30%  (p=0.000 n=10+10)
256_8k-2   108MB/s ± 0%   117MB/s ± 1%   +7.64%  (p=0.000 n=10+10)
512_8-2   5.78MB/s ± 1%  6.20MB/s ± 1%   +7.32%  (p=0.000 n=10+10)
512_1k-2  54.9MB/s ± 0%  60.1MB/s ± 0%   +9.53%   (p=0.000 n=9+10)
512_8k-2  56.1MB/s ± 1%  63.5MB/s ± 0%  +13.26%   (p=0.000 n=10+9)
shake^2
magical 2024-10-06 00:42:36 -07:00
parent 70a9bfa87d
commit 73654f751c
1 changed files with 40 additions and 21 deletions

View File

@ -11,7 +11,8 @@ func round(a *[25]uint64) { roundGo(a) }
// digest implements hash.Hash // digest implements hash.Hash
type digest struct { type digest struct {
a [25]uint64 // a[y][x][z] a [25]uint64 // a[y][x][z]
buf [200]byte buf [8]byte // buf[0:ulen] holds a partial uint64
ulen int8
dsbyte byte dsbyte byte
len int len int
size int size int
@ -29,34 +30,46 @@ func (d *digest) BlockSize() int { return 200 - d.size*2 }
func (d *digest) Reset() { func (d *digest) Reset() {
//fmt.Println("resetting") //fmt.Println("resetting")
d.a = [25]uint64{} d.a = [25]uint64{}
d.buf = [200]byte{} d.buf = [8]byte{}
d.len = 0 d.len = 0
} }
func (d *digest) Write(b []byte) (int, error) { func (d *digest) Write(b []byte) (int, error) {
written := len(b) written := len(b)
bs := d.BlockSize() bs := d.BlockSize() / 8
for len(b) > 0 { // fill buf first, if non-empty
n := copy(d.buf[d.len:bs], b) if d.ulen > 0 {
d.len += n n := copy(d.buf[d.ulen:], b)
b = b[n:] b = b[n:]
d.ulen += int8(n)
// flush?
if int(d.ulen) == len(d.buf) {
d.a[d.len] ^= le64dec(d.buf[:])
d.len += 1
d.ulen = 0
if d.len == bs {
d.flush()
}
}
}
// xor 8-byte chunks into the state
for len(b) >= 8 {
d.a[d.len] ^= le64dec(b)
b = b[8:]
d.len += 1
if d.len == bs { if d.len == bs {
d.flush() d.flush()
} }
} // len(b) < 8
// store any remaining bytes
if len(b) > 0 {
d.ulen = int8(copy(d.buf[:], b))
} }
return written, nil return written, nil
} }
func (d *digest) flush() { func (d *digest) flush() {
//fmt.Printf("Flushing with %d bytes\n", d.len) //fmt.Printf("Flushing with %d bytes\n", d.len*8 + int(d.ulen))
b := d.buf[:d.len]
for i := range d.a {
if len(b) == 0 {
break
}
d.a[i] ^= le64dec(b)
b = b[8:]
}
keccakf(&d.a) keccakf(&d.a)
d.len = 0 d.len = 0
} }
@ -75,13 +88,19 @@ func (d *digest) clone() *digest {
func (d *digest) Sum(b []byte) []byte { func (d *digest) Sum(b []byte) []byte {
d = d.clone() d = d.clone()
d.buf[d.len] = d.dsbyte if d.ulen == 0 {
bs := d.BlockSize() d.a[d.len] ^= uint64(d.dsbyte)
for i := d.len + 1; i < bs; i++ { } else {
d.buf[i] = 0 d.buf[d.ulen] = d.dsbyte
for i := int(d.ulen) + 1; i < len(d.buf); i++ {
d.buf[i] = 0
}
d.a[d.len] ^= le64dec(d.buf[:])
} }
d.buf[bs-1] |= 0x80
d.len = bs bs := d.BlockSize() / 8
d.a[bs-1] |= 0x80 << 56
//d.len = bs
d.flush() d.flush()
for i := 0; i < d.size/8; i++ { for i := 0; i < d.size/8; i++ {