This commit is contained in:
nate smith 2024-04-28 21:13:12 -07:00
parent f55e1482d2
commit 000946c175
2 changed files with 45 additions and 168 deletions

View File

@ -2,48 +2,17 @@ package cutup
import (
"bufio"
"crypto/sha1"
"fmt"
"os"
"path"
"strings"
"github.com/vilmibm/trunkless/db"
)
const (
srcDir = "/home/vilmibm/pg_plaintext/files"
tgtDir = "/home/vilmibm/pg_plaintext/cutup"
workers = 10
)
// TODO: make the source and target directories configurable instead of hard-coded constants.
// TODO: generalize the cutup logic so it is not specific to Gutenberg texts.
func worker(paths <-chan string, sources chan<- string) {
// TODO generalize to n character phrase markers, write new function
phraseMarkers := map[rune]bool{
';': true,
',': true,
':': true,
'.': true,
'?': true,
'!': true,
//'(': true,
')': true,
//'{': true,
'}': true,
//'[': true,
']': true,
//'\'': true,
//'"': true,
//'“': true,
'”': true,
'=': true,
'`': true,
'-': true,
'|': true,
'>': true,
}
func worker(opts CutupOpts, paths <-chan string, sources chan<- string) {
for p := range paths {
f, err := os.Open(p)
if err != nil {
@ -59,7 +28,6 @@ func worker(paths <-chan string, sources chan<- string) {
var of *os.File
var cleaned string
var ok bool
var asStr string
var text string
var prefix string
@ -85,28 +53,17 @@ func worker(paths <-chan string, sources chan<- string) {
break
}
if sourceid == "" {
sourceid = fmt.Sprintf("%x", sha1.Sum([]byte(title)))[0:6]
sourceid = db.StrToID(title)
prefix = sourceid + "\t"
of, err = os.Create(path.Join(tgtDir, sourceid))
of, err = os.Create(path.Join(opts.CutupDir, sourceid))
if err != nil {
fmt.Fprintf(os.Stderr, "could not open '%s' for writing: %s", sourceid, err.Error())
break
}
}
for i, r := range text {
if ok = phraseMarkers[r]; ok {
if len(phraseBuff) >= 10 {
cleaned = clean(phraseBuff)
if len(cleaned) > 0 {
fmt.Fprintln(of, prefix+cleaned)
written++
}
}
phraseBuff = []byte{}
} else if v := conjPrep(phraseBuff, r); v > 0 {
// TODO erase or keep? starting with erase.
if v := shouldBreak(phraseBuff, r); v > 0 {
phraseBuff = phraseBuff[0 : len(phraseBuff)-v]
// TODO this pasta is copied
if len(phraseBuff) >= 10 {
cleaned = clean(phraseBuff)
if len(cleaned) > 0 {
@ -138,14 +95,38 @@ func worker(paths <-chan string, sources chan<- string) {
}
}
func conjPrep(phraseBuff []byte, r rune) int {
// phraseMarkers is the set of runes treated as phrase boundaries. Every rune
// in the marker string below maps to true; any other rune yields the map's
// zero value (false) on lookup.
var phraseMarkers = func() map[rune]bool {
	const markers = ";,:.?!)}]”=`-|>"

	set := make(map[rune]bool)
	// Ranging over a string yields whole runes, so the multi-byte '”'
	// is inserted as a single key.
	for _, marker := range markers {
		set[marker] = true
	}
	return set
}()
// suffices lists short connective words ("and", "or", "but", ...).
// NOTE(review): presumably shouldBreak compares these against the tail of the
// phrase buffer and returns the length of the one that matches — confirm
// against the full body of shouldBreak. The name looks like a misspelling of
// "suffixes"; it is left unchanged because other code refers to it.
var suffices = []string{"from", "at", "but", "however", "yet", "though", "and", "to", "on", "or"}
// maxSuffixLen is subtracted from len(phraseBuff) in shouldBreak to decide
// how far back from the end of the buffer to look. The longest entry in
// suffices ("however") is 7 bytes; the extra byte presumably accounts for a
// leading space — TODO confirm.
const maxSuffixLen = 8 // magic number based on longest suffix
func shouldBreak(phraseBuff []byte, r rune) int {
if ok := phraseMarkers[r]; ok {
return 1
}
if r != ' ' {
return -1
}
suffices := []string{"from", "at", "but", "however", "yet", "though", "and", "to", "on", "or"}
maxLen := 8 // TODO magic number based on longest suffix
offset := len(phraseBuff) - maxLen
offset := len(phraseBuff) - maxSuffixLen
if offset < 0 {
offset = 0
}
@ -155,76 +136,17 @@ func conjPrep(phraseBuff []byte, r rune) int {
return len(s)
}
}
return -1
}
// isAlpha reports whether r is an ASCII letter, upper or lower case
// ('A'-'Z' or 'a'-'z').
//
// The check is a direct rune-range comparison. Unlike lower-casing the rune
// through the strings package, this allocates nothing per call and does not
// accept non-ASCII runes whose Unicode lower-case mapping happens to be an
// ASCII letter (for example U+212A KELVIN SIGN, which lower-cases to 'k').
func isAlpha(r rune) bool {
	return (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z')
}
func alphaPercent(s string) float64 {
total := 0.0
alpha := 0.0
for _, r := range s {
total++
if isAlpha(r) {
if (r >= 'A' && r <= 'Z') || (r >= 'a' && r <= 'z') {
alpha++
}
}
@ -321,7 +243,7 @@ func Cutup(opts CutupOpts) error {
sources := make(chan string, len(entries))
for x := 0; x < opts.NumWorkers; x++ {
go worker(paths, sources)
go worker(opts, paths, sources)
}
for _, e := range entries {

View File

@ -2,7 +2,7 @@ package cutup
import "testing"
func Test_conjPrep(t *testing.T) {
func Test_shouldBreak(t *testing.T) {
type args struct {
buff []byte
r rune
@ -82,61 +82,12 @@ func Test_conjPrep(t *testing.T) {
args: args{[]byte("whether good or"), ' '},
expected: 2,
},
// TODO test phrasemarkers
}
for _, c := range cs {
t.Run(c.name, func(t *testing.T) {
result := conjPrep(c.args.buff, c.args.r)
if result != c.expected {
t.Errorf("got '%v', expected '%v'", result, c.expected)
}
})
}
}
func Test_isAlpha(t *testing.T) {
cs := []struct {
arg rune
expected bool
}{
{arg: 'a', expected: true},
{arg: 'b', expected: true},
{arg: 'c', expected: true},
{arg: 'd', expected: true},
{arg: 'e', expected: true},
{arg: 'f', expected: true},
{arg: 'g', expected: true},
{arg: 'h', expected: true},
{arg: 'i', expected: true},
{arg: 'j', expected: true},
{arg: 'k', expected: true},
{arg: 'l', expected: true},
{arg: 'm', expected: true},
{arg: 'n', expected: true},
{arg: 'o', expected: true},
{arg: 'p', expected: true},
{arg: 'q', expected: true},
{arg: 'r', expected: true},
{arg: 's', expected: true},
{arg: 't', expected: true},
{arg: 'u', expected: true},
{arg: 'v', expected: true},
{arg: 'w', expected: true},
{arg: 'x', expected: true},
{arg: 'y', expected: true},
{arg: 'z', expected: true},
{arg: '1'},
{arg: '2'},
{arg: '3'},
{arg: '\''},
{arg: '"'},
{arg: '#'},
{arg: '%'},
}
for _, c := range cs {
t.Run(string(c.arg), func(t *testing.T) {
result := isAlpha(c.arg)
result := shouldBreak(c.args.buff, c.args.r)
if result != c.expected {
t.Errorf("got '%v', expected '%v'", result, c.expected)
}
@ -150,7 +101,11 @@ func Test_alphaPercent(t *testing.T) {
expected float64
}{
{
arg: "abcd",
arg: "abcdefghijklmnopqrstuvwxyz",
expected: 100.0,
},
{
arg: "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
expected: 100.0,
},
{