From 79b27a1530135dcf63a0531f7bad69539983039b Mon Sep 17 00:00:00 2001
From: Andrew Ekstedt <andrew.ekstedt@gmail.com>
Date: Sun, 6 Oct 2024 00:40:46 -0700
Subject: [PATCH 1/3] add a few more test vectors for SHA3-256

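these cover inputs of 1, 7, 8 and 9 bytes, which are the boundary
conditions for an optimization i'm about to do (absorbing input in
8-byte words). also add a test that writes each input one byte at a time.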

computed using https://emn178.github.io/online-tools/sha3_256.html
---
 keccak_test.go | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/keccak_test.go b/keccak_test.go
index 3e8d0ae..128a064 100644
--- a/keccak_test.go
+++ b/keccak_test.go
@@ -40,6 +40,31 @@ var tests = []struct {
 		text: "The quick brown fox jumps over the lazy dog",
 		hash: "69070dda01975c8c120c3aada1b282394e7f032fa9cf32f4cb2259a0897dfc04",
 	},
+
+	{
+		f:    New256,
+		name: "SHA3-256",
+		text: "a",
+		hash: "80084bf2fba02475726feb2cab2d8215eab14bc6bdd8bfb2c8151257032ecd8b",
+	},
+	{
+		f:    New256,
+		name: "SHA3-256",
+		text: "abcdefg",
+		hash: "7d55114476dfc6a2fbeaa10e221a8d0f32fc8f2efb69a6e878f4633366917a62",
+	},
+	{
+		f:    New256,
+		name: "SHA3-256",
+		text: "abcdefgh",
+		hash: "3e2020725a38a48eb3bbf75767f03a22c6b3f41f459c831309b06433ec649779",
+	},
+	{
+		f:    New256,
+		name: "SHA3-256",
+		text: "abcdefghi",
+		hash: "f74eb337992307c22bc59eb43e59583a683f3b93077e7f2472508e8c464d2657",
+	},
 }
 
 func TestHash(t *testing.T) {
@@ -58,6 +83,24 @@ func TestHash(t *testing.T) {
 	}
 }
 
+func TestHashSmallWrites(t *testing.T) { // replays the shared tests table, one byte per Write call
+	for _, tt := range tests {
+		want, err := hex.DecodeString(tt.hash)
+		if err != nil {
+			t.Errorf("%s(%q): %s", tt.name, tt.text, err)
+			continue
+		}
+		h := tt.f()
+		for i := range []byte(tt.text) { // index by byte, not rune, so every Write is exactly 1 byte
+			io.WriteString(h, tt.text[i:i+1])
+		}
+		got := h.Sum(nil)
+		if !bytes.Equal(got, want) {
+			t.Errorf("%s(%q) = %x, want %x", tt.name, tt.text, got, want)
+		}
+	}
+}
+
 func benchmark(b *testing.B, f func() hash.Hash, size int64) {
 	var tmp [Size * 2]byte
 	var msg [8192]byte

From 70a9bfa87d79c5f6c0b886dc9f8029663f04b831 Mon Sep 17 00:00:00 2001
From: Andrew Ekstedt <andrew.ekstedt@gmail.com>
Date: Sun, 6 Oct 2024 00:46:11 -0700
Subject: [PATCH 2/3] help the bounds checker in le64dec

---
 sponge.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sponge.go b/sponge.go
index aab872d..e1839ac 100644
--- a/sponge.go
+++ b/sponge.go
@@ -91,6 +91,7 @@ func (d *digest) Sum(b []byte) []byte {
 }
 
 func le64dec(b []byte) uint64 {
+	_ = b[7] // single up-front bounds check; intended to let the compiler elide the eight per-index checks below
 	return uint64(b[0])<<0 | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
 }
 

From 73654f751cea700bbb71a1cca0421c59cbe99465 Mon Sep 17 00:00:00 2001
From: Andrew Ekstedt <andrew.ekstedt@gmail.com>
Date: Sun, 6 Oct 2024 00:42:36 -0700
Subject: [PATCH 3/3] reduce buffer size to 8 bytes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

instead of buffering an entire block, buffer only when the input is not
aligned to 8 bytes, and otherwise xor uint64-sized chunks directly into
the state.

the code is a little more complicated but i think it's worth it.
we could eliminate the buffer entirely but that requires either
shenanigans with unsafe, or fiddly code to xor partial uint64s

a caveat is that the implementation now only supports sponge capacities
(and hence rates) that are a multiple of 8 bytes. that's fine for the
standard instantiations but may restrict unusual applications.

not only does this let us reduce the buffer from 200 bytes to 8,
it also provides a nice speedup

name      old time/op    new time/op    delta
256_8-2     1.45µs ± 0%    1.28µs ± 1%  -11.58%  (p=0.000 n=10+10)
256_1k-2    10.1µs ± 0%     9.3µs ± 0%   -7.67%  (p=0.000 n=10+10)
256_8k-2    75.6µs ± 0%    70.2µs ± 1%   -7.09%  (p=0.000 n=10+10)
512_8-2     1.39µs ± 1%    1.29µs ± 1%   -6.85%  (p=0.000 n=10+10)
512_1k-2    18.7µs ± 0%    17.0µs ± 0%   -8.70%   (p=0.000 n=9+10)
512_8k-2     146µs ± 1%     129µs ± 0%  -11.70%   (p=0.000 n=10+9)

name      old speed      new speed      delta
256_8-2   5.53MB/s ± 0%  6.25MB/s ± 0%  +13.06%  (p=0.000 n=10+10)
256_1k-2   102MB/s ± 0%   110MB/s ± 0%   +8.30%  (p=0.000 n=10+10)
256_8k-2   108MB/s ± 0%   117MB/s ± 1%   +7.64%  (p=0.000 n=10+10)
512_8-2   5.78MB/s ± 1%  6.20MB/s ± 1%   +7.32%  (p=0.000 n=10+10)
512_1k-2  54.9MB/s ± 0%  60.1MB/s ± 0%   +9.53%   (p=0.000 n=9+10)
512_8k-2  56.1MB/s ± 1%  63.5MB/s ± 0%  +13.26%   (p=0.000 n=10+9)
---
 sponge.go | 61 ++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 40 insertions(+), 21 deletions(-)

diff --git a/sponge.go b/sponge.go
index e1839ac..f1c786d 100644
--- a/sponge.go
+++ b/sponge.go
@@ -11,7 +11,8 @@ func round(a *[25]uint64) { roundGo(a) }
 // digest implements hash.Hash
 type digest struct {
 	a      [25]uint64 // a[y][x][z]
-	buf    [200]byte
+	buf    [8]byte    // buf[0:ulen] holds a partial uint64
+	ulen   int8
 	dsbyte byte
 	len    int
 	size   int
@@ -29,34 +30,46 @@ func (d *digest) BlockSize() int { return 200 - d.size*2 }
 func (d *digest) Reset() {
 	//fmt.Println("resetting")
 	d.a = [25]uint64{}
+	d.buf, d.ulen = [8]byte{}, 0 // also clear ulen, else the next hash starts with ulen phantom zero bytes
 	d.len = 0
 }
 
 func (d *digest) Write(b []byte) (int, error) {
 	written := len(b)
-	bs := d.BlockSize()
-	for len(b) > 0 {
-		n := copy(d.buf[d.len:bs], b)
-		d.len += n
+	bs := d.BlockSize() / 8 // block size in 64-bit words; d.len indexes words of d.a, not bytes
+	// top up a partial word left over from an earlier Write, if there is one
+	if d.ulen > 0 {
+		n := copy(d.buf[d.ulen:], b)
 		b = b[n:]
+		d.ulen += int8(n)
+		// buf now holds a full 8 bytes? absorb it as one word
+		if int(d.ulen) == len(d.buf) {
+			d.a[d.len] ^= le64dec(d.buf[:])
+			d.len += 1
+			d.ulen = 0
+			if d.len == bs {
+				d.flush()
+			}
+		}
+	}
+	// xor whole 8-byte chunks straight into the state, permuting whenever a block fills up
+	for len(b) >= 8 {
+		d.a[d.len] ^= le64dec(b)
+		b = b[8:]
+		d.len += 1
 		if d.len == bs {
 			d.flush()
 		}
+	} // len(b) < 8
+	// stash the trailing 1-7 bytes in buf for the next Write or Sum
+	if len(b) > 0 {
+		d.ulen = int8(copy(d.buf[:], b))
 	}
 	return written, nil
 }
 
 func (d *digest) flush() {
-	//fmt.Printf("Flushing with %d bytes\n", d.len)
-	b := d.buf[:d.len]
-	for i := range d.a {
-		if len(b) == 0 {
-			break
-		}
-		d.a[i] ^= le64dec(b)
-		b = b[8:]
-	}
+	//fmt.Printf("Flushing with %d bytes\n", d.len*8 + int(d.ulen))
 	keccakf(&d.a)
 	d.len = 0
 }
@@ -75,13 +88,19 @@ func (d *digest) clone() *digest {
 
 func (d *digest) Sum(b []byte) []byte {
 	d = d.clone()
-	d.buf[d.len] = d.dsbyte
-	bs := d.BlockSize()
-	for i := d.len + 1; i < bs; i++ {
-		d.buf[i] = 0
+	if d.ulen == 0 {
+		d.a[d.len] ^= uint64(d.dsbyte) // word-aligned input: dsbyte is the low byte of the next word
+	} else {
+		d.buf[d.ulen] = d.dsbyte // dsbyte goes right after the buffered partial word
+		for i := int(d.ulen) + 1; i < len(d.buf); i++ {
+			d.buf[i] = 0
+		}
+		d.a[d.len] ^= le64dec(d.buf[:])
 	}
-	d.buf[bs-1] |= 0x80
-	d.len = bs
+	// The closing pad bit is the high bit of the last byte of the block, i.e. bit 63 of word bs-1.
+	bs := d.BlockSize() / 8
+	d.a[bs-1] ^= 0x80 << 56 // must be xor: d.a already holds absorbed state, so |= would fail to flip a set bit
+	// d.len is left as is; flush resets it to 0.
 	d.flush()
 
 	for i := 0; i < d.size/8; i++ {