cleanup

2024-04-28 21:13:12 -07:00 · 2024-04-28 21:13:12 -07:00 · 000946c175
commit 000946c175
parent f55e1482d2
2 changed files with 45 additions and 168 deletions
--- a/cutup/cutup.go
+++ b/cutup/cutup.go
@ -2,48 +2,17 @@ package cutup
 import (
 	"bufio"
 	"crypto/sha1"
 	"fmt"
 	"os"
 	"path"
 	"strings"
 	"github.com/vilmibm/trunkless/db"
 )
 // Hard-coded paths and worker count for the Gutenberg-specific cutup run.
 const (
 	// srcDir is an absolute path under pg_plaintext; presumably the directory of
 	// input plaintext files. Its use is not visible here; confirm against callers.
 	srcDir  = "/home/vilmibm/pg_plaintext/files"
 	// tgtDir is an absolute path under pg_plaintext for cutup output.
 	// NOTE(review): worker creates its output files under opts.CutupDir, so
 	// confirm whether this constant is still referenced.
 	tgtDir  = "/home/vilmibm/pg_plaintext/cutup"
 	// workers is presumably a default goroutine count. NOTE(review): Cutup
 	// starts opts.NumWorkers workers, so confirm this constant is still used.
 	workers = 10
 )
 // TODO configurable src/tgt dir
 // TODO generalize so it's not gutenberg specific
-func worker(paths <-chan string, sources chan<- string) {
+func worker(opts CutupOpts, paths <-chan string, sources chan<- string) {
 	// TODO generalize to n character phrase markers, write new function
 	phraseMarkers := map[rune]bool{
 		';': true,
 		',': true,
 		':': true,
 		'.': true,
 		'?': true,
 		'!': true,
 		//'(':  true,
 		')': true,
 		//'{':  true,
 		'}': true,
 		//'[':  true,
 		']': true,
 		//'\'': true,
 		//'"':  true,
 		//'“':  true,
 		'”': true,
 		'=': true,
 		'`': true,
 		'-': true,
 		'|': true,
 		'>': true,
 	}
 	for p := range paths {
 		f, err := os.Open(p)
 		if err != nil {
@ -59,7 +28,6 @@ func worker(paths <-chan string, sources chan<- string) {
 		var of *os.File
 		var cleaned string
 		var ok bool
 		var asStr string
 		var text string
 		var prefix string
@ -85,28 +53,17 @@ func worker(paths <-chan string, sources chan<- string) {
 				break
 			}
 			if sourceid == "" {
-				sourceid = fmt.Sprintf("%x", sha1.Sum([]byte(title)))[0:6]
+				sourceid = db.StrToID(title)
 				prefix = sourceid + "\t"
-				of, err = os.Create(path.Join(tgtDir, sourceid))
+				of, err = os.Create(path.Join(opts.CutupDir, sourceid))
 				if err != nil {
 					fmt.Fprintf(os.Stderr, "could not open '%s' for writing: %s", sourceid, err.Error())
 					break
 				}
 			}
 			for i, r := range text {
-				if ok = phraseMarkers[r]; ok {
+				if v := shouldBreak(phraseBuff, r); v > 0 {
 					if len(phraseBuff) >= 10 {
 						cleaned = clean(phraseBuff)
 						if len(cleaned) > 0 {
 							fmt.Fprintln(of, prefix+cleaned)
 							written++
 						}
 					}
 					phraseBuff = []byte{}
 				} else if v := conjPrep(phraseBuff, r); v > 0 {
 					// TODO erase or keep? starting with erase.
 					phraseBuff = phraseBuff[0 : len(phraseBuff)-v]
 					// TODO this pasta is copied
 					if len(phraseBuff) >= 10 {
 						cleaned = clean(phraseBuff)
 						if len(cleaned) > 0 {
@ -138,14 +95,38 @@ func worker(paths <-chan string, sources chan<- string) {
 	}
 }
-func conjPrep(phraseBuff []byte, r rune) int {
+var phraseMarkers = map[rune]bool{
 	';': true,
 	',': true,
 	':': true,
 	'.': true,
 	'?': true,
 	'!': true,
 	')': true,
 	'}': true,
 	']': true,
 	'”': true,
 	'=': true,
 	'`': true,
 	'-': true,
 	'|': true,
 	'>': true,
 }
 // suffices holds the words consulted by shouldBreak once a space rune has been
 // read. NOTE(review): the matching loop is outside this view; presumably it
 // tests whether the tail of phraseBuff ends with one of these words and
 // returns that word's length. Confirm against the full function.
 var suffices = []string{"from", "at", "but", "however", "yet", "though", "and", "to", "on", "or"}
 // maxSuffixLen is subtracted from len(phraseBuff) in shouldBreak to pick the
 // starting offset (clamped to 0) of the tail that gets inspected.
 const maxSuffixLen = 8 // magic number based on longest suffix
 func shouldBreak(phraseBuff []byte, r rune) int {
 	if ok := phraseMarkers[r]; ok {
 		return 1
 	}
 	if r != ' ' {
 		return -1
 	}
-	suffices := []string{"from", "at", "but", "however", "yet", "though", "and", "to", "on", "or"}
+	offset := len(phraseBuff) - maxSuffixLen
 	maxLen := 8 // TODO magic number based on longest suffix
 	offset := len(phraseBuff) - maxLen
 	if offset < 0 {
 		offset = 0
 	}
@ -155,76 +136,17 @@ func conjPrep(phraseBuff []byte, r rune) int {
 			return len(s)
 		}
 	}
 	return -1
 }
 // isAlpha reports whether r is an ASCII letter, i.e. in 'A'-'Z' or 'a'-'z'.
 //
 // The rune is compared directly against the two ASCII ranges. This avoids
 // building and lowercasing a string for every rune, and it keeps non-ASCII
 // runes whose lowercase mapping is an ASCII letter (for example U+212A KELVIN
 // SIGN, which lowercases to 'k') from being counted as alphabetic.
 func isAlpha(r rune) bool {
 	return (r >= 'A' && r <= 'Z') || (r >= 'a' && r <= 'z')
 }
 func alphaPercent(s string) float64 {
 	total := 0.0
 	alpha := 0.0
 	for _, r := range s {
 		total++
-		if isAlpha(r) {
+		if (r >= 'A' && r <= 'Z') || (r >= 'a' && r <= 'z') {
 			alpha++
 		}
 	}
@ -321,7 +243,7 @@ func Cutup(opts CutupOpts) error {
 	sources := make(chan string, len(entries))
 	for x := 0; x < opts.NumWorkers; x++ {
-		go worker(paths, sources)
+		go worker(opts, paths, sources)
 	}
 	for _, e := range entries {
--- a/cutup/cutup_test.go
+++ b/cutup/cutup_test.go
@ -2,7 +2,7 @@ package cutup
 import "testing"
-func Test_conjPrep(t *testing.T) {
+func Test_shouldBreak(t *testing.T) {
 	type args struct {
 		buff []byte
 		r    rune
@ -82,61 +82,12 @@ func Test_conjPrep(t *testing.T) {
 			args:     args{[]byte("whether good or"), ' '},
 			expected: 2,
 		},
 		// TODO test phrasemarkers
 	}
 	for _, c := range cs {
 		t.Run(c.name, func(t *testing.T) {
-			result := conjPrep(c.args.buff, c.args.r)
+			result := shouldBreak(c.args.buff, c.args.r)
 			if result != c.expected {
 				t.Errorf("got '%v', expected '%v'", result, c.expected)
 			}
 		})
 	}
 }
 func Test_isAlpha(t *testing.T) {
 	cs := []struct {
 		arg      rune
 		expected bool
 	}{
 		{arg: 'a', expected: true},
 		{arg: 'b', expected: true},
 		{arg: 'c', expected: true},
 		{arg: 'd', expected: true},
 		{arg: 'e', expected: true},
 		{arg: 'f', expected: true},
 		{arg: 'g', expected: true},
 		{arg: 'h', expected: true},
 		{arg: 'i', expected: true},
 		{arg: 'j', expected: true},
 		{arg: 'k', expected: true},
 		{arg: 'l', expected: true},
 		{arg: 'm', expected: true},
 		{arg: 'n', expected: true},
 		{arg: 'o', expected: true},
 		{arg: 'p', expected: true},
 		{arg: 'q', expected: true},
 		{arg: 'r', expected: true},
 		{arg: 's', expected: true},
 		{arg: 't', expected: true},
 		{arg: 'u', expected: true},
 		{arg: 'v', expected: true},
 		{arg: 'w', expected: true},
 		{arg: 'x', expected: true},
 		{arg: 'y', expected: true},
 		{arg: 'z', expected: true},
 		{arg: '1'},
 		{arg: '2'},
 		{arg: '3'},
 		{arg: '\''},
 		{arg: '"'},
 		{arg: '#'},
 		{arg: '%'},
 	}
 	for _, c := range cs {
 		t.Run(string(c.arg), func(t *testing.T) {
 			result := isAlpha(c.arg)
 			if result != c.expected {
 				t.Errorf("got '%v', expected '%v'", result, c.expected)
 			}
@ -150,7 +101,11 @@ func Test_alphaPercent(t *testing.T) {
 		expected float64
 	}{
 		{
-			arg:      "abcd",
+			arg:      "abcdefghijklmnopqrstuvwxyz",
 			expected: 100.0,
 		},
 		{
 			arg:      "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
 			expected: 100.0,
 		},
 		{