fairly happy with this on some random files, want to try doing a phrase db

This commit is contained in:
nate smith 2024-02-01 23:17:26 -08:00
parent c508d152d2
commit f301777fc3

View File

@ -22,50 +22,62 @@ func main() {
'.': true, '.': true,
'?': true, '?': true,
'!': true, '!': true,
'(': true, //'(': true,
')': true, ')': true,
'\'': true, //'{': true,
'{': true,
'}': true, '}': true,
'[': true, //'[': true,
']': true, ']': true,
'“': true, //'\'': true,
//'"': true,
//'“': true,
'”': true, '”': true,
'=': true, '=': true,
'`': true, '`': true,
'-': true,
} }
s := bufio.NewScanner(os.Stdin) s := bufio.NewScanner(os.Stdin)
phraseBuff := []byte{} phraseBuff := []byte{}
printed := false
for s.Scan() { for s.Scan() {
text := strings.TrimSpace(s.Text()) text := strings.TrimSpace(s.Text())
seenSpace := false
for i, r := range text { for i, r := range text {
if r == ' ' {
seenSpace = true
}
if ok, val := phraseMarkers[r]; ok && val { if ok, val := phraseMarkers[r]; ok && val {
if len(phraseBuff) >= 20 && seenSpace { if len(phraseBuff) >= 10 {
// TODO QA check for alphabetic content cleaned := clean(phraseBuff)
fmt.Println(strings.TrimSpace(string(phraseBuff))) if len(cleaned) > 0 {
fmt.Println(cleaned)
printed = true
} }
}
if !printed {
fmt.Fprintf(os.Stderr, "SKIP: %s\n", string(phraseBuff))
}
printed = false
phraseBuff = []byte{} phraseBuff = []byte{}
} else { } else {
asStr := string(phraseBuff) asStr := string(phraseBuff)
if r == ' ' && strings.HasSuffix(asStr, " ") { if r == ' ' && strings.HasSuffix(asStr, " ") {
continue continue
} }
phraseBuff = append(phraseBuff, byte(r)) if i == 0 && len(phraseBuff) > 0 && phraseBuff[len(phraseBuff)-1] != ' ' && r != ' ' {
if i == len(text)-1 && len(phraseBuff) > 0 && !strings.HasSuffix(asStr, " ") {
phraseBuff = append(phraseBuff, byte(' ')) phraseBuff = append(phraseBuff, byte(' '))
} }
phraseBuff = append(phraseBuff, byte(r))
} }
} }
} }
} }
// clean normalizes a raw phrase buffer into the string form that gets printed.
//
// It converts bs to a string and then, in order:
//   - replaces the typographic apostrophe (U+2019) with an ASCII apostrophe,
//   - removes every ASCII double quote,
//   - trims leading and trailing whitespace,
//   - lowercases the result.
//
// The returned string may be empty (for example when bs holds only whitespace
// or quotes); callers are expected to check for that before using it.
func clean(bs []byte) string {
	s := string(bs)
	// NOTE(review): the search string here was empty in the reviewed text.
	// With an empty search string, strings.ReplaceAll inserts the replacement
	// before every rune and at the end of the string, which would corrupt
	// every phrase. A right single quotation mark was presumably intended and
	// lost to an encoding problem -- confirm. It is written as an escape so the
	// literal cannot be silently stripped again.
	s = strings.ReplaceAll(s, "\u2019", "'")
	s = strings.ReplaceAll(s, "\"", "")
	s = strings.TrimSpace(s)
	s = strings.ToLower(s)
	// TODO QA check for alphabetism
	return s
}