fairly happy with this on some random files, want to try doing a phrase db

This commit is contained in:
nate smith 2024-02-01 23:17:26 -08:00
parent c508d152d2
commit f301777fc3

View File

@ -16,56 +16,68 @@ import (
// main reads standard input line by line and cuts the text into phrases at
// the runes listed in phraseMarkers. Accepted phrases are written to stdout
// with fmt.Println; the branch that uses clean also reports rejected buffers
// on stderr with a "SKIP: " prefix.
//
// NOTE(review): this listing looks like a rendered diff whose +/- markers were
// dropped, so lines from two revisions appear back to back (repeated map keys,
// two emit conditions, two end-of-line space rules). Which lines belong to the
// current revision cannot be established from here -- confirm against the
// repository before changing any logic.
func main() {
// Set of runes that end the phrase currently being accumulated.
phraseMarkers := map[rune]bool{
';': true,
',': true,
':': true,
'.': true,
'?': true,
'!': true,
'(': true,
')': true,
'\'': true,
'{': true,
'}': true,
'[': true,
']': true,
'“': true,
'”': true,
'=': true,
'`': true,
// NOTE(review): from here on the keys repeat the ones above (presumably the
// newer list, with opening brackets/quotes commented out and '-' added).
// Duplicate constant keys in a single map literal are rejected by the Go
// compiler, so only one of the two lists can be live.
';': true,
',': true,
':': true,
'.': true,
'?': true,
'!': true,
//'(': true,
')': true,
//'{': true,
'}': true,
//'[': true,
']': true,
//'\'': true,
//'"': true,
//'“': true,
'”': true,
'=': true,
'`': true,
'-': true,
}
s := bufio.NewScanner(os.Stdin)
// phraseBuff lives outside the scan loop, so a phrase may continue across
// input lines; it is only reset when a marker rune is reached.
phraseBuff := []byte{}
// printed records whether the buffer flushed at the current marker was emitted.
printed := false
for s.Scan() {
text := strings.TrimSpace(s.Text())
// seenSpace is set once the current line has contained a space; it is only
// read by the `>= 20 && seenSpace` check below.
seenSpace := false
// Ranging over a string yields byte offsets in i and decoded runes in r.
for i, r := range text {
if r == ' ' {
seenSpace = true
}
// A map index expression returns (value, present), so `ok` actually holds
// the stored bool and `val` the presence flag. Both must be true, so the
// swapped names do not change the outcome.
if ok, val := phraseMarkers[r]; ok && val {
// NOTE(review): two different emit rules follow (length >= 20 plus a space,
// printing the trimmed buffer; then length >= 10, printing the output of
// clean). They look like the old and new variants of the same step.
if len(phraseBuff) >= 20 && seenSpace {
// TODO QA check for alphabetic content
fmt.Println(strings.TrimSpace(string(phraseBuff)))
if len(phraseBuff) >= 10 {
cleaned := clean(phraseBuff)
if len(cleaned) > 0 {
fmt.Println(cleaned)
printed = true
}
}
// Buffers that were not printed are reported on stderr.
if !printed {
fmt.Fprintf(os.Stderr, "SKIP: %s\n", string(phraseBuff))
}
printed = false
// Start a fresh phrase after every marker rune.
phraseBuff = []byte{}
} else {
asStr := string(phraseBuff)
// Collapse runs of spaces: skip a space when the buffer already ends in one.
if r == ' ' && strings.HasSuffix(asStr, " ") {
continue
}
// NOTE(review): byte(r) keeps only the low 8 bits of the rune, so any
// character above U+00FF is stored as a different byte value.
phraseBuff = append(phraseBuff, byte(r))
// NOTE(review): the next two conditions both guard the same "append a
// separating space" body -- one at the last byte offset of the line, the
// other at offset 0 of a line when the carried-over buffer does not end in
// a space. Presumably only one of them is current; confirm.
if i == len(text)-1 && len(phraseBuff) > 0 && !strings.HasSuffix(asStr, " ") {
if i == 0 && len(phraseBuff) > 0 && phraseBuff[len(phraseBuff)-1] != ' ' && r != ' ' {
phraseBuff = append(phraseBuff, byte(' '))
}
phraseBuff = append(phraseBuff, byte(r))
}
}
}
// NOTE(review): s.Err() is not checked after the loop, and whatever is still in
// phraseBuff when input ends is never printed.
}
// clean normalizes a raw phrase buffer before it is printed.
//
// It converts bs to a string, rewrites the typographic apostrophe (U+2019) to
// an ASCII apostrophe, removes ASCII double quotes, trims surrounding
// whitespace and lowercases the result, in that order. An empty return value
// means nothing printable was left.
//
// NOTE(review): in the reviewed listing the pattern of the first replacement
// was an empty string literal. strings.ReplaceAll with an empty pattern
// inserts the replacement before every rune and at the end of the string,
// which would scatter apostrophes through every phrase. U+2019 is assumed to
// be the character that was meant -- confirm against the repository. The
// escape form is used so the literal survives re-encoding of the file.
func clean(bs []byte) string {
	s := string(bs)
	s = strings.ReplaceAll(s, "\u2019", "'")
	s = strings.ReplaceAll(s, "\"", "")
	s = strings.TrimSpace(s)
	s = strings.ToLower(s)
	// TODO QA check for alphabetism
	return s
}