split on conjunctions and prepositions (some of them)

This commit is contained in:
nate smith 2024-04-21 20:02:48 -07:00
parent b49138d0ad
commit d29a80817d
2 changed files with 134 additions and 1 deletions

View File

@ -14,6 +14,26 @@ import (
"strings" "strings"
) )
// conjPrep reports whether phraseBuff currently ends in one of a fixed set of
// conjunctions or prepositions, so that the word can be treated as a phrase
// boundary.
//
// The check only runs when r, the rune about to follow the buffer, is a space;
// any other rune yields -1. A word matches only if it is the final word of the
// buffer and is itself preceded by a space, so "wakkabarblurpfrom" and a buffer
// consisting solely of "from" do not match.
//
// It returns the byte length of the matched word (not counting the preceding
// space), or -1 when nothing matches.
func conjPrep(phraseBuff []byte, r rune) int {
	if r != ' ' {
		return -1
	}
	words := []string{"from", "at", "but", "however", "yet", "though", "and", "to", "on", "or"}
	for _, w := range words {
		// start is where w would begin if the buffer ends with it. Comparing
		// each word against the buffer's tail directly removes the need for a
		// hand-maintained "longest word" lookback window.
		start := len(phraseBuff) - len(w)
		// Require at least one byte before the word, and that byte must be a
		// space, so only whole words match.
		if start < 1 || phraseBuff[start-1] != ' ' {
			continue
		}
		if string(phraseBuff[start:]) == w {
			return len(w)
		}
	}
	return -1
}
func main() { func main() {
phraseMarkers := map[rune]bool{ phraseMarkers := map[rune]bool{
';': true, ';': true,
@ -39,13 +59,34 @@ func main() {
'>': true, '>': true,
} }
// I want to experiment with treating prepositions and conjunctions as phrase
// markers.
// to do this i would need to check the phraseBuff when I check phraseMarkers and then split accordingly
s := bufio.NewScanner(os.Stdin) s := bufio.NewScanner(os.Stdin)
phraseBuff := []byte{} phraseBuff := []byte{}
printed := false printed := false
for s.Scan() { for s.Scan() {
text := strings.TrimSpace(s.Text()) text := strings.TrimSpace(s.Text())
for i, r := range text { for i, r := range text {
if ok, val := phraseMarkers[r]; ok && val { if ok := phraseMarkers[r]; ok {
if len(phraseBuff) >= 10 {
cleaned := clean(phraseBuff)
if len(cleaned) > 0 {
fmt.Println(cleaned)
printed = true
}
}
if !printed {
//fmt.Fprintf(os.Stderr, "SKIP: %s\n", string(phraseBuff))
}
printed = false
phraseBuff = []byte{}
} else if v := conjPrep(phraseBuff, r); v > 0 {
// TODO erase or keep? starting with erase.
phraseBuff = phraseBuff[0 : len(phraseBuff)-v]
// TODO this pasta is copied
if len(phraseBuff) >= 10 { if len(phraseBuff) >= 10 {
cleaned := clean(phraseBuff) cleaned := clean(phraseBuff)
if len(cleaned) > 0 { if len(cleaned) > 0 {

View File

@ -2,6 +2,98 @@ package main
import "testing" import "testing"
// Test_conjPrep is a table-driven test for conjPrep. Each row supplies the
// buffer contents, the rune that follows the buffer, and the value conjPrep is
// expected to return: the length of the trailing conjunction/preposition, or
// -1 when there is no match.
func Test_conjPrep(t *testing.T) {
	table := []struct {
		name string
		buff string
		r    rune
		want int
	}{
		{name: "empty buffer", buff: "", r: ' ', want: -1},
		{name: "not a space yet", buff: "saccharine juice from", r: 'x', want: -1},
		{name: "from", buff: "i will eat from", r: ' ', want: 4},
		{name: "no preceding space", buff: "wakkabarblurpfrom", r: ' ', want: -1},
		{name: "however", buff: "my eyes are hollow, however", r: ' ', want: 7},
		{name: "at", buff: "there will be no more joy at", r: ' ', want: 2},
		{name: "but", buff: "i buried him, but", r: ' ', want: 3},
		{name: "yet", buff: "the echoes quited yet", r: ' ', want: 3},
		{name: "though", buff: "my eyes were closed though", r: ' ', want: 6},
		{name: "and", buff: "i raised the torch and", r: ' ', want: 3},
		{name: "to", buff: "thousands more to", r: ' ', want: 2},
		{name: "on", buff: "bringing rain down on", r: ' ', want: 2},
		{name: "no match", buff: "i raised the torch", r: ' ', want: -1},
		{name: "or", buff: "whether good or", r: ' ', want: 2},
	}

	for i := range table {
		tc := table[i]
		t.Run(tc.name, func(t *testing.T) {
			got := conjPrep([]byte(tc.buff), tc.r)
			if got == tc.want {
				return
			}
			t.Errorf("got '%v', expected '%v'", got, tc.want)
		})
	}
}
func Test_isAlpha(t *testing.T) { func Test_isAlpha(t *testing.T) {
cs := []struct { cs := []struct {
arg rune arg rune