cleanup
This commit is contained in:
parent
f55e1482d2
commit
000946c175
152
cutup/cutup.go
152
cutup/cutup.go
@ -2,48 +2,17 @@ package cutup
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"crypto/sha1"
|
||||
"fmt"
|
||||
"os"
|
||||
"path"
|
||||
"strings"
|
||||
|
||||
"github.com/vilmibm/trunkless/db"
|
||||
)
|
||||
|
||||
const (
|
||||
srcDir = "/home/vilmibm/pg_plaintext/files"
|
||||
tgtDir = "/home/vilmibm/pg_plaintext/cutup"
|
||||
workers = 10
|
||||
)
|
||||
|
||||
// TODO configurable src/tgt dir
|
||||
// TODO generalize so it's not gutenberg specific
|
||||
|
||||
func worker(paths <-chan string, sources chan<- string) {
|
||||
// TODO generalize to n character phrase markers, write new function
|
||||
phraseMarkers := map[rune]bool{
|
||||
';': true,
|
||||
',': true,
|
||||
':': true,
|
||||
'.': true,
|
||||
'?': true,
|
||||
'!': true,
|
||||
//'(': true,
|
||||
')': true,
|
||||
//'{': true,
|
||||
'}': true,
|
||||
//'[': true,
|
||||
']': true,
|
||||
//'\'': true,
|
||||
//'"': true,
|
||||
//'“': true,
|
||||
'”': true,
|
||||
'=': true,
|
||||
'`': true,
|
||||
'-': true,
|
||||
'|': true,
|
||||
'>': true,
|
||||
}
|
||||
|
||||
func worker(opts CutupOpts, paths <-chan string, sources chan<- string) {
|
||||
for p := range paths {
|
||||
f, err := os.Open(p)
|
||||
if err != nil {
|
||||
@ -59,7 +28,6 @@ func worker(paths <-chan string, sources chan<- string) {
|
||||
|
||||
var of *os.File
|
||||
var cleaned string
|
||||
var ok bool
|
||||
var asStr string
|
||||
var text string
|
||||
var prefix string
|
||||
@ -85,28 +53,17 @@ func worker(paths <-chan string, sources chan<- string) {
|
||||
break
|
||||
}
|
||||
if sourceid == "" {
|
||||
sourceid = fmt.Sprintf("%x", sha1.Sum([]byte(title)))[0:6]
|
||||
sourceid = db.StrToID(title)
|
||||
prefix = sourceid + "\t"
|
||||
of, err = os.Create(path.Join(tgtDir, sourceid))
|
||||
of, err = os.Create(path.Join(opts.CutupDir, sourceid))
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "could not open '%s' for writing: %s", sourceid, err.Error())
|
||||
break
|
||||
}
|
||||
}
|
||||
for i, r := range text {
|
||||
if ok = phraseMarkers[r]; ok {
|
||||
if len(phraseBuff) >= 10 {
|
||||
cleaned = clean(phraseBuff)
|
||||
if len(cleaned) > 0 {
|
||||
fmt.Fprintln(of, prefix+cleaned)
|
||||
written++
|
||||
}
|
||||
}
|
||||
phraseBuff = []byte{}
|
||||
} else if v := conjPrep(phraseBuff, r); v > 0 {
|
||||
// TODO erase or keep? starting with erase.
|
||||
if v := shouldBreak(phraseBuff, r); v > 0 {
|
||||
phraseBuff = phraseBuff[0 : len(phraseBuff)-v]
|
||||
// TODO this pasta is copied
|
||||
if len(phraseBuff) >= 10 {
|
||||
cleaned = clean(phraseBuff)
|
||||
if len(cleaned) > 0 {
|
||||
@ -138,14 +95,38 @@ func worker(paths <-chan string, sources chan<- string) {
|
||||
}
|
||||
}
|
||||
|
||||
func conjPrep(phraseBuff []byte, r rune) int {
|
||||
var phraseMarkers = map[rune]bool{
|
||||
';': true,
|
||||
',': true,
|
||||
':': true,
|
||||
'.': true,
|
||||
'?': true,
|
||||
'!': true,
|
||||
')': true,
|
||||
'}': true,
|
||||
']': true,
|
||||
'”': true,
|
||||
'=': true,
|
||||
'`': true,
|
||||
'-': true,
|
||||
'|': true,
|
||||
'>': true,
|
||||
}
|
||||
|
||||
var suffices = []string{"from", "at", "but", "however", "yet", "though", "and", "to", "on", "or"}
|
||||
|
||||
const maxSuffixLen = 8 // magic number based on longest suffix
|
||||
|
||||
func shouldBreak(phraseBuff []byte, r rune) int {
|
||||
if ok := phraseMarkers[r]; ok {
|
||||
return 1
|
||||
}
|
||||
|
||||
if r != ' ' {
|
||||
return -1
|
||||
}
|
||||
|
||||
suffices := []string{"from", "at", "but", "however", "yet", "though", "and", "to", "on", "or"}
|
||||
maxLen := 8 // TODO magic number based on longest suffix
|
||||
offset := len(phraseBuff) - maxLen
|
||||
offset := len(phraseBuff) - maxSuffixLen
|
||||
if offset < 0 {
|
||||
offset = 0
|
||||
}
|
||||
@ -155,76 +136,17 @@ func conjPrep(phraseBuff []byte, r rune) int {
|
||||
return len(s)
|
||||
}
|
||||
}
|
||||
|
||||
return -1
|
||||
}
|
||||
|
||||
func isAlpha(r rune) bool {
|
||||
// TODO use rune numerical ranges for this
|
||||
switch strings.ToLower(string(r)) {
|
||||
case "a":
|
||||
return true
|
||||
case "b":
|
||||
return true
|
||||
case "c":
|
||||
return true
|
||||
case "d":
|
||||
return true
|
||||
case "e":
|
||||
return true
|
||||
case "f":
|
||||
return true
|
||||
case "g":
|
||||
return true
|
||||
case "h":
|
||||
return true
|
||||
case "i":
|
||||
return true
|
||||
case "j":
|
||||
return true
|
||||
case "k":
|
||||
return true
|
||||
case "l":
|
||||
return true
|
||||
case "m":
|
||||
return true
|
||||
case "n":
|
||||
return true
|
||||
case "o":
|
||||
return true
|
||||
case "p":
|
||||
return true
|
||||
case "q":
|
||||
return true
|
||||
case "r":
|
||||
return true
|
||||
case "s":
|
||||
return true
|
||||
case "t":
|
||||
return true
|
||||
case "u":
|
||||
return true
|
||||
case "v":
|
||||
return true
|
||||
case "w":
|
||||
return true
|
||||
case "x":
|
||||
return true
|
||||
case "y":
|
||||
return true
|
||||
case "z":
|
||||
return true
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func alphaPercent(s string) float64 {
|
||||
total := 0.0
|
||||
alpha := 0.0
|
||||
|
||||
for _, r := range s {
|
||||
total++
|
||||
if isAlpha(r) {
|
||||
if (r >= 'A' && r <= 'Z') || (r >= 'a' && r <= 'z') {
|
||||
alpha++
|
||||
}
|
||||
}
|
||||
@ -321,7 +243,7 @@ func Cutup(opts CutupOpts) error {
|
||||
sources := make(chan string, len(entries))
|
||||
|
||||
for x := 0; x < opts.NumWorkers; x++ {
|
||||
go worker(paths, sources)
|
||||
go worker(opts, paths, sources)
|
||||
}
|
||||
|
||||
for _, e := range entries {
|
||||
|
@ -2,7 +2,7 @@ package cutup
|
||||
|
||||
import "testing"
|
||||
|
||||
func Test_conjPrep(t *testing.T) {
|
||||
func Test_shouldBreak(t *testing.T) {
|
||||
type args struct {
|
||||
buff []byte
|
||||
r rune
|
||||
@ -82,61 +82,12 @@ func Test_conjPrep(t *testing.T) {
|
||||
args: args{[]byte("whether good or"), ' '},
|
||||
expected: 2,
|
||||
},
|
||||
// TODO test phrasemarkers
|
||||
}
|
||||
|
||||
for _, c := range cs {
|
||||
t.Run(c.name, func(t *testing.T) {
|
||||
result := conjPrep(c.args.buff, c.args.r)
|
||||
if result != c.expected {
|
||||
t.Errorf("got '%v', expected '%v'", result, c.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func Test_isAlpha(t *testing.T) {
|
||||
cs := []struct {
|
||||
arg rune
|
||||
expected bool
|
||||
}{
|
||||
{arg: 'a', expected: true},
|
||||
{arg: 'b', expected: true},
|
||||
{arg: 'c', expected: true},
|
||||
{arg: 'd', expected: true},
|
||||
{arg: 'e', expected: true},
|
||||
{arg: 'f', expected: true},
|
||||
{arg: 'g', expected: true},
|
||||
{arg: 'h', expected: true},
|
||||
{arg: 'i', expected: true},
|
||||
{arg: 'j', expected: true},
|
||||
{arg: 'k', expected: true},
|
||||
{arg: 'l', expected: true},
|
||||
{arg: 'm', expected: true},
|
||||
{arg: 'n', expected: true},
|
||||
{arg: 'o', expected: true},
|
||||
{arg: 'p', expected: true},
|
||||
{arg: 'q', expected: true},
|
||||
{arg: 'r', expected: true},
|
||||
{arg: 's', expected: true},
|
||||
{arg: 't', expected: true},
|
||||
{arg: 'u', expected: true},
|
||||
{arg: 'v', expected: true},
|
||||
{arg: 'w', expected: true},
|
||||
{arg: 'x', expected: true},
|
||||
{arg: 'y', expected: true},
|
||||
{arg: 'z', expected: true},
|
||||
{arg: '1'},
|
||||
{arg: '2'},
|
||||
{arg: '3'},
|
||||
{arg: '\''},
|
||||
{arg: '"'},
|
||||
{arg: '#'},
|
||||
{arg: '%'},
|
||||
}
|
||||
|
||||
for _, c := range cs {
|
||||
t.Run(string(c.arg), func(t *testing.T) {
|
||||
result := isAlpha(c.arg)
|
||||
result := shouldBreak(c.args.buff, c.args.r)
|
||||
if result != c.expected {
|
||||
t.Errorf("got '%v', expected '%v'", result, c.expected)
|
||||
}
|
||||
@ -150,7 +101,11 @@ func Test_alphaPercent(t *testing.T) {
|
||||
expected float64
|
||||
}{
|
||||
{
|
||||
arg: "abcd",
|
||||
arg: "abcdefghijklmnopqrstuvwxyz",
|
||||
expected: 100.0,
|
||||
},
|
||||
{
|
||||
arg: "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
|
||||
expected: 100.0,
|
||||
},
|
||||
{
|
||||
|
Loading…
x
Reference in New Issue
Block a user