Fairly happy with this on some random files; want to try building a phrase DB next
This commit is contained in:
parent
c508d152d2
commit
f301777fc3
@ -16,56 +16,68 @@ import (
|
|||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
phraseMarkers := map[rune]bool{
|
phraseMarkers := map[rune]bool{
|
||||||
';': true,
|
';': true,
|
||||||
',': true,
|
',': true,
|
||||||
':': true,
|
':': true,
|
||||||
'.': true,
|
'.': true,
|
||||||
'?': true,
|
'?': true,
|
||||||
'!': true,
|
'!': true,
|
||||||
'(': true,
|
//'(': true,
|
||||||
')': true,
|
')': true,
|
||||||
'\'': true,
|
//'{': true,
|
||||||
'{': true,
|
'}': true,
|
||||||
'}': true,
|
//'[': true,
|
||||||
'[': true,
|
']': true,
|
||||||
']': true,
|
//'\'': true,
|
||||||
'“': true,
|
//'"': true,
|
||||||
'”': true,
|
//'“': true,
|
||||||
'=': true,
|
'”': true,
|
||||||
'`': true,
|
'=': true,
|
||||||
|
'`': true,
|
||||||
|
'-': true,
|
||||||
}
|
}
|
||||||
|
|
||||||
s := bufio.NewScanner(os.Stdin)
|
s := bufio.NewScanner(os.Stdin)
|
||||||
phraseBuff := []byte{}
|
phraseBuff := []byte{}
|
||||||
|
printed := false
|
||||||
for s.Scan() {
|
for s.Scan() {
|
||||||
text := strings.TrimSpace(s.Text())
|
text := strings.TrimSpace(s.Text())
|
||||||
seenSpace := false
|
|
||||||
for i, r := range text {
|
for i, r := range text {
|
||||||
if r == ' ' {
|
|
||||||
seenSpace = true
|
|
||||||
}
|
|
||||||
if ok, val := phraseMarkers[r]; ok && val {
|
if ok, val := phraseMarkers[r]; ok && val {
|
||||||
if len(phraseBuff) >= 20 && seenSpace {
|
if len(phraseBuff) >= 10 {
|
||||||
// TODO QA check for alphabetic content
|
cleaned := clean(phraseBuff)
|
||||||
fmt.Println(strings.TrimSpace(string(phraseBuff)))
|
if len(cleaned) > 0 {
|
||||||
|
fmt.Println(cleaned)
|
||||||
|
printed = true
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
if !printed {
|
||||||
|
fmt.Fprintf(os.Stderr, "SKIP: %s\n", string(phraseBuff))
|
||||||
|
}
|
||||||
|
printed = false
|
||||||
phraseBuff = []byte{}
|
phraseBuff = []byte{}
|
||||||
} else {
|
} else {
|
||||||
asStr := string(phraseBuff)
|
asStr := string(phraseBuff)
|
||||||
if r == ' ' && strings.HasSuffix(asStr, " ") {
|
if r == ' ' && strings.HasSuffix(asStr, " ") {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
phraseBuff = append(phraseBuff, byte(r))
|
if i == 0 && len(phraseBuff) > 0 && phraseBuff[len(phraseBuff)-1] != ' ' && r != ' ' {
|
||||||
if i == len(text)-1 && len(phraseBuff) > 0 && !strings.HasSuffix(asStr, " ") {
|
|
||||||
phraseBuff = append(phraseBuff, byte(' '))
|
phraseBuff = append(phraseBuff, byte(' '))
|
||||||
}
|
}
|
||||||
|
phraseBuff = append(phraseBuff, byte(r))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func clean(s string) string {
|
func clean(bs []byte) string {
|
||||||
|
s := string(bs)
|
||||||
s = strings.ReplaceAll(s, "’", "'")
|
s = strings.ReplaceAll(s, "’", "'")
|
||||||
|
s = strings.ReplaceAll(s, "\"", "")
|
||||||
|
s = strings.TrimSpace(s)
|
||||||
|
s = strings.ToLower(s)
|
||||||
|
|
||||||
|
// TODO QA check for alphabetism
|
||||||
|
|
||||||
return s
|
return s
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user