From 000946c1757bca7facfbb2fb8d618653a6d0130f Mon Sep 17 00:00:00 2001 From: nate smith Date: Sun, 28 Apr 2024 21:13:12 -0700 Subject: [PATCH] cleanup --- cutup/cutup.go | 152 +++++++++++--------------------------------- cutup/cutup_test.go | 61 +++--------------- 2 files changed, 45 insertions(+), 168 deletions(-) diff --git a/cutup/cutup.go b/cutup/cutup.go index 3e1b799..7d9ad44 100644 --- a/cutup/cutup.go +++ b/cutup/cutup.go @@ -2,48 +2,17 @@ package cutup import ( "bufio" - "crypto/sha1" "fmt" "os" "path" "strings" + + "github.com/vilmibm/trunkless/db" ) -const ( - srcDir = "/home/vilmibm/pg_plaintext/files" - tgtDir = "/home/vilmibm/pg_plaintext/cutup" - workers = 10 -) - -// TODO configurable src/tgt dir // TODO generalize so it's not gutenberg specific -func worker(paths <-chan string, sources chan<- string) { - // TODO generalize to n character phrase markers, write new function - phraseMarkers := map[rune]bool{ - ';': true, - ',': true, - ':': true, - '.': true, - '?': true, - '!': true, - //'(': true, - ')': true, - //'{': true, - '}': true, - //'[': true, - ']': true, - //'\'': true, - //'"': true, - //'“': true, - '”': true, - '=': true, - '`': true, - '-': true, - '|': true, - '>': true, - } - +func worker(opts CutupOpts, paths <-chan string, sources chan<- string) { for p := range paths { f, err := os.Open(p) if err != nil { @@ -59,7 +28,6 @@ func worker(paths <-chan string, sources chan<- string) { var of *os.File var cleaned string - var ok bool var asStr string var text string var prefix string @@ -85,28 +53,17 @@ func worker(paths <-chan string, sources chan<- string) { break } if sourceid == "" { - sourceid = fmt.Sprintf("%x", sha1.Sum([]byte(title)))[0:6] + sourceid = db.StrToID(title) prefix = sourceid + "\t" - of, err = os.Create(path.Join(tgtDir, sourceid)) + of, err = os.Create(path.Join(opts.CutupDir, sourceid)) if err != nil { fmt.Fprintf(os.Stderr, "could not open '%s' for writing: %s", sourceid, err.Error()) break } } for i, r := range text { - if ok = phraseMarkers[r]; ok { - if len(phraseBuff) >= 10 { - cleaned = clean(phraseBuff) - if len(cleaned) > 0 { - fmt.Fprintln(of, prefix+cleaned) - written++ - } - } - phraseBuff = []byte{} - } else if v := conjPrep(phraseBuff, r); v > 0 { - // TODO erase or keep? starting with erase. + if v := shouldBreak(phraseBuff, r); v > 0 { phraseBuff = phraseBuff[0 : len(phraseBuff)-v] - // TODO this pasta is copied if len(phraseBuff) >= 10 { cleaned = clean(phraseBuff) if len(cleaned) > 0 { @@ -138,14 +95,38 @@ func worker(paths <-chan string, sources chan<- string) { } } -func conjPrep(phraseBuff []byte, r rune) int { +var phraseMarkers = map[rune]bool{ + ';': true, + ',': true, + ':': true, + '.': true, + '?': true, + '!': true, + ')': true, + '}': true, + ']': true, + '”': true, + '=': true, + '`': true, + '-': true, + '|': true, + '>': true, +} + +var suffices = []string{"from", "at", "but", "however", "yet", "though", "and", "to", "on", "or"} + +const maxSuffixLen = 8 // magic number based on longest suffix + +func shouldBreak(phraseBuff []byte, r rune) int { + if ok := phraseMarkers[r]; ok { + return 1 + } + if r != ' ' { return -1 } - suffices := []string{"from", "at", "but", "however", "yet", "though", "and", "to", "on", "or"} - maxLen := 8 // TODO magic number based on longest suffix - offset := len(phraseBuff) - maxLen + offset := len(phraseBuff) - maxSuffixLen if offset < 0 { offset = 0 } @@ -155,76 +136,17 @@ func conjPrep(phraseBuff []byte, r rune) int { return len(s) } } + return -1 } -func isAlpha(r rune) bool { - // TODO use rune numerical ranges for this - switch strings.ToLower(string(r)) { - case "a": - return true - case "b": - return true - case "c": - return true - case "d": - return true - case "e": - return true - case "f": - return true - case "g": - return true - case "h": - return true - case "i": - return true - case "j": - return true - case "k": - return true - case "l": - return true - case "m": - return true - case "n": - return true - case "o": - return true - case "p": - return true - case "q": - return true - case "r": - return true - case "s": - return true - case "t": - return true - case "u": - return true - case "v": - return true - case "w": - return true - case "x": - return true - case "y": - return true - case "z": - return true - } - - return false -} - func alphaPercent(s string) float64 { total := 0.0 alpha := 0.0 for _, r := range s { total++ - if isAlpha(r) { + if (r >= 'A' && r <= 'Z') || (r >= 'a' && r <= 'z') { alpha++ } } @@ -321,7 +243,7 @@ func Cutup(opts CutupOpts) error { sources := make(chan string, len(entries)) for x := 0; x < opts.NumWorkers; x++ { - go worker(paths, sources) + go worker(opts, paths, sources) } for _, e := range entries { diff --git a/cutup/cutup_test.go b/cutup/cutup_test.go index 961a6eb..ce2700c 100644 --- a/cutup/cutup_test.go +++ b/cutup/cutup_test.go @@ -2,7 +2,7 @@ package cutup import "testing" -func Test_conjPrep(t *testing.T) { +func Test_shouldBreak(t *testing.T) { type args struct { buff []byte r rune @@ -82,61 +82,12 @@ func Test_conjPrep(t *testing.T) { args: args{[]byte("whether good or"), ' '}, expected: 2, }, + // TODO test phrasemarkers } for _, c := range cs { t.Run(c.name, func(t *testing.T) { - result := conjPrep(c.args.buff, c.args.r) - if result != c.expected { - t.Errorf("got '%v', expected '%v'", result, c.expected) - } - }) - } -} - -func Test_isAlpha(t *testing.T) { - cs := []struct { - arg rune - expected bool - }{ - {arg: 'a', expected: true}, - {arg: 'b', expected: true}, - {arg: 'c', expected: true}, - {arg: 'd', expected: true}, - {arg: 'e', expected: true}, - {arg: 'f', expected: true}, - {arg: 'g', expected: true}, - {arg: 'h', expected: true}, - {arg: 'i', expected: true}, - {arg: 'j', expected: true}, - {arg: 'k', expected: true}, - {arg: 'l', expected: true}, - {arg: 'm', expected: true}, - {arg: 'n', expected: true}, - {arg: 'o', expected: true}, - {arg: 'p', expected: true}, - {arg: 'q', expected: true}, - {arg: 'r', expected: true}, - {arg: 's', expected: true}, - {arg: 't', expected: true}, - {arg: 'u', expected: true}, - {arg: 'v', expected: true}, - {arg: 'w', expected: true}, - {arg: 'x', expected: true}, - {arg: 'y', expected: true}, - {arg: 'z', expected: true}, - {arg: '1'}, - {arg: '2'}, - {arg: '3'}, - {arg: '\''}, - {arg: '"'}, - {arg: '#'}, - {arg: '%'}, - } - - for _, c := range cs { - t.Run(string(c.arg), func(t *testing.T) { - result := isAlpha(c.arg) + result := shouldBreak(c.args.buff, c.args.r) if result != c.expected { t.Errorf("got '%v', expected '%v'", result, c.expected) } @@ -150,7 +101,11 @@ func Test_alphaPercent(t *testing.T) { expected float64 }{ { - arg: "abcd", + arg: "abcdefghijklmnopqrstuvwxyz", + expected: 100.0, + }, + { + arg: "ABCDEFGHIJKLMNOPQRSTUVWXYZ", expected: 100.0, }, {