trunkless/cutup/cutup.go
2024-04-21 21:46:16 -07:00

176 lines
3.4 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package cutup
import (
"bufio"
"fmt"
"io"
"strings"
)
func conjPrep(phraseBuff []byte, r rune) int {
if r != ' ' {
return -1
}
suffices := []string{"from", "at", "but", "however", "yet", "though", "and", "to", "on", "or"}
maxLen := 8 // TODO magic number based on longest suffix
offset := len(phraseBuff) - maxLen
if offset < 0 {
offset = 0
}
end := string(phraseBuff[offset:])
for _, s := range suffices {
if strings.HasSuffix(end, " "+s) {
return len(s)
}
}
return -1
}
func Cutup(ins io.Reader) {
phraseMarkers := map[rune]bool{
';': true,
',': true,
':': true,
'.': true,
'?': true,
'!': true,
//'(': true,
')': true,
//'{': true,
'}': true,
//'[': true,
']': true,
//'\'': true,
//'"': true,
//'“': true,
'”': true,
'=': true,
'`': true,
'-': true,
'|': true,
'>': true,
}
// I want to experiment with treating prepositions and conjunctions as phrase
// markers.
// to do this i would need to check the phraseBuff when I check phraseMarkers and then split accordingly
s := bufio.NewScanner(ins)
phraseBuff := []byte{}
printed := false
for s.Scan() {
text := strings.TrimSpace(s.Text())
for i, r := range text {
if ok := phraseMarkers[r]; ok {
if len(phraseBuff) >= 10 {
cleaned := clean(phraseBuff)
if len(cleaned) > 0 {
fmt.Println(cleaned)
printed = true
}
}
if !printed {
//fmt.Fprintf(os.Stderr, "SKIP: %s\n", string(phraseBuff))
}
printed = false
phraseBuff = []byte{}
} else if v := conjPrep(phraseBuff, r); v > 0 {
// TODO erase or keep? starting with erase.
phraseBuff = phraseBuff[0 : len(phraseBuff)-v]
// TODO this pasta is copied
if len(phraseBuff) >= 10 {
cleaned := clean(phraseBuff)
if len(cleaned) > 0 {
fmt.Println(cleaned)
printed = true
}
}
if !printed {
//fmt.Fprintf(os.Stderr, "SKIP: %s\n", string(phraseBuff))
}
printed = false
phraseBuff = []byte{}
} else {
asStr := string(phraseBuff)
if r == ' ' && strings.HasSuffix(asStr, " ") {
continue
}
if i == 0 && len(phraseBuff) > 0 && phraseBuff[len(phraseBuff)-1] != ' ' && r != ' ' {
phraseBuff = append(phraseBuff, byte(' '))
}
phraseBuff = append(phraseBuff, byte(r))
}
}
}
}
func isAlpha(r rune) bool {
alphaChars := map[rune]bool{
'a': true,
'b': true,
'c': true,
'd': true,
'e': true,
'f': true,
'g': true,
'h': true,
'i': true,
'j': true,
'k': true,
'l': true,
'm': true,
'n': true,
'o': true,
'p': true,
'q': true,
'r': true,
's': true,
't': true,
'u': true,
'v': true,
'w': true,
'x': true,
'y': true,
'z': true,
}
lookup := strings.ToLower(string(r))
return alphaChars[rune(lookup[0])]
}
func alphaPercent(s string) float64 {
total := 0.0
alpha := 0.0
for _, r := range s {
total++
if isAlpha(r) {
alpha++
}
}
return 100 * (alpha / total)
}
func clean(bs []byte) string {
s := string(bs)
s = strings.ReplaceAll(s, "", "'")
s = strings.ReplaceAll(s, "\"", "")
s = strings.ReplaceAll(s, "(", "")
s = strings.ReplaceAll(s, "[", "")
s = strings.ReplaceAll(s, "{", "")
s = strings.ReplaceAll(s, "<", "")
s = strings.ReplaceAll(s, "_", "")
s = strings.ReplaceAll(s, "*", "")
s = strings.TrimLeft(s, "'\"")
s = strings.TrimSpace(s)
s = strings.ToLower(s)
if alphaPercent(s) < 50.0 {
return ""
}
return s
}