This commit is contained in:
nate smith 2024-04-28 21:13:12 -07:00
parent f55e1482d2
commit 000946c175
2 changed files with 45 additions and 168 deletions

View File

@ -2,48 +2,17 @@ package cutup
import ( import (
"bufio" "bufio"
"crypto/sha1"
"fmt" "fmt"
"os" "os"
"path" "path"
"strings" "strings"
"github.com/vilmibm/trunkless/db"
) )
const (
srcDir = "/home/vilmibm/pg_plaintext/files"
tgtDir = "/home/vilmibm/pg_plaintext/cutup"
workers = 10
)
// TODO configurable src/tgt dir
// TODO generalize so it's not gutenberg specific // TODO generalize so it's not gutenberg specific
func worker(paths <-chan string, sources chan<- string) { func worker(opts CutupOpts, paths <-chan string, sources chan<- string) {
// TODO generalize to n character phrase markers, write new function
phraseMarkers := map[rune]bool{
';': true,
',': true,
':': true,
'.': true,
'?': true,
'!': true,
//'(': true,
')': true,
//'{': true,
'}': true,
//'[': true,
']': true,
//'\'': true,
//'"': true,
//'“': true,
'”': true,
'=': true,
'`': true,
'-': true,
'|': true,
'>': true,
}
for p := range paths { for p := range paths {
f, err := os.Open(p) f, err := os.Open(p)
if err != nil { if err != nil {
@ -59,7 +28,6 @@ func worker(paths <-chan string, sources chan<- string) {
var of *os.File var of *os.File
var cleaned string var cleaned string
var ok bool
var asStr string var asStr string
var text string var text string
var prefix string var prefix string
@ -85,28 +53,17 @@ func worker(paths <-chan string, sources chan<- string) {
break break
} }
if sourceid == "" { if sourceid == "" {
sourceid = fmt.Sprintf("%x", sha1.Sum([]byte(title)))[0:6] sourceid = db.StrToID(title)
prefix = sourceid + "\t" prefix = sourceid + "\t"
of, err = os.Create(path.Join(tgtDir, sourceid)) of, err = os.Create(path.Join(opts.CutupDir, sourceid))
if err != nil { if err != nil {
fmt.Fprintf(os.Stderr, "could not open '%s' for writing: %s", sourceid, err.Error()) fmt.Fprintf(os.Stderr, "could not open '%s' for writing: %s", sourceid, err.Error())
break break
} }
} }
for i, r := range text { for i, r := range text {
if ok = phraseMarkers[r]; ok { if v := shouldBreak(phraseBuff, r); v > 0 {
if len(phraseBuff) >= 10 {
cleaned = clean(phraseBuff)
if len(cleaned) > 0 {
fmt.Fprintln(of, prefix+cleaned)
written++
}
}
phraseBuff = []byte{}
} else if v := conjPrep(phraseBuff, r); v > 0 {
// TODO erase or keep? starting with erase.
phraseBuff = phraseBuff[0 : len(phraseBuff)-v] phraseBuff = phraseBuff[0 : len(phraseBuff)-v]
// TODO this pasta is copied
if len(phraseBuff) >= 10 { if len(phraseBuff) >= 10 {
cleaned = clean(phraseBuff) cleaned = clean(phraseBuff)
if len(cleaned) > 0 { if len(cleaned) > 0 {
@ -138,14 +95,38 @@ func worker(paths <-chan string, sources chan<- string) {
} }
} }
func conjPrep(phraseBuff []byte, r rune) int { var phraseMarkers = map[rune]bool{
';': true,
',': true,
':': true,
'.': true,
'?': true,
'!': true,
')': true,
'}': true,
']': true,
'”': true,
'=': true,
'`': true,
'-': true,
'|': true,
'>': true,
}
// suffices lists short connecting words (conjunctions and prepositions).
// NOTE(review): presumably shouldBreak compares these against the tail of the
// phrase buffer when it sees a space; the matching loop is not visible in this
// view, so confirm against the full function.
var suffices = []string{"from", "at", "but", "however", "yet", "though", "and", "to", "on", "or"}
// maxSuffixLen is subtracted from the phrase buffer's length in shouldBreak to
// get the starting offset (clamped to 0) for the region it examines.
// NOTE(review): the longest entry in suffices ("however") is 7 bytes, so 8 is
// presumably that plus one for a preceding space; keep in sync if suffices
// gains a longer word.
const maxSuffixLen = 8 // magic number based on longest suffix
func shouldBreak(phraseBuff []byte, r rune) int {
if ok := phraseMarkers[r]; ok {
return 1
}
if r != ' ' { if r != ' ' {
return -1 return -1
} }
suffices := []string{"from", "at", "but", "however", "yet", "though", "and", "to", "on", "or"} offset := len(phraseBuff) - maxSuffixLen
maxLen := 8 // TODO magic number based on longest suffix
offset := len(phraseBuff) - maxLen
if offset < 0 { if offset < 0 {
offset = 0 offset = 0
} }
@ -155,76 +136,17 @@ func conjPrep(phraseBuff []byte, r rune) int {
return len(s) return len(s)
} }
} }
return -1 return -1
} }
// isAlpha reports whether r is an ASCII letter, i.e. in 'a'..'z' or 'A'..'Z'.
//
// The check is done directly on the rune's numeric value, so it performs no
// string conversion or allocation. Only ASCII letters count: digits,
// punctuation, whitespace and every non-ASCII rune return false. That includes
// non-ASCII runes whose lowercase mapping happens to be an ASCII letter
// (for example U+212A KELVIN SIGN), which are not treated as letters.
func isAlpha(r rune) bool {
	return (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z')
}
func alphaPercent(s string) float64 { func alphaPercent(s string) float64 {
total := 0.0 total := 0.0
alpha := 0.0 alpha := 0.0
for _, r := range s { for _, r := range s {
total++ total++
if isAlpha(r) { if (r >= 'A' && r <= 'Z') || (r >= 'a' && r <= 'z') {
alpha++ alpha++
} }
} }
@ -321,7 +243,7 @@ func Cutup(opts CutupOpts) error {
sources := make(chan string, len(entries)) sources := make(chan string, len(entries))
for x := 0; x < opts.NumWorkers; x++ { for x := 0; x < opts.NumWorkers; x++ {
go worker(paths, sources) go worker(opts, paths, sources)
} }
for _, e := range entries { for _, e := range entries {

View File

@ -2,7 +2,7 @@ package cutup
import "testing" import "testing"
func Test_conjPrep(t *testing.T) { func Test_shouldBreak(t *testing.T) {
type args struct { type args struct {
buff []byte buff []byte
r rune r rune
@ -82,61 +82,12 @@ func Test_conjPrep(t *testing.T) {
args: args{[]byte("whether good or"), ' '}, args: args{[]byte("whether good or"), ' '},
expected: 2, expected: 2,
}, },
// TODO test phrasemarkers
} }
for _, c := range cs { for _, c := range cs {
t.Run(c.name, func(t *testing.T) { t.Run(c.name, func(t *testing.T) {
result := conjPrep(c.args.buff, c.args.r) result := shouldBreak(c.args.buff, c.args.r)
if result != c.expected {
t.Errorf("got '%v', expected '%v'", result, c.expected)
}
})
}
}
func Test_isAlpha(t *testing.T) {
cs := []struct {
arg rune
expected bool
}{
{arg: 'a', expected: true},
{arg: 'b', expected: true},
{arg: 'c', expected: true},
{arg: 'd', expected: true},
{arg: 'e', expected: true},
{arg: 'f', expected: true},
{arg: 'g', expected: true},
{arg: 'h', expected: true},
{arg: 'i', expected: true},
{arg: 'j', expected: true},
{arg: 'k', expected: true},
{arg: 'l', expected: true},
{arg: 'm', expected: true},
{arg: 'n', expected: true},
{arg: 'o', expected: true},
{arg: 'p', expected: true},
{arg: 'q', expected: true},
{arg: 'r', expected: true},
{arg: 's', expected: true},
{arg: 't', expected: true},
{arg: 'u', expected: true},
{arg: 'v', expected: true},
{arg: 'w', expected: true},
{arg: 'x', expected: true},
{arg: 'y', expected: true},
{arg: 'z', expected: true},
{arg: '1'},
{arg: '2'},
{arg: '3'},
{arg: '\''},
{arg: '"'},
{arg: '#'},
{arg: '%'},
}
for _, c := range cs {
t.Run(string(c.arg), func(t *testing.T) {
result := isAlpha(c.arg)
if result != c.expected { if result != c.expected {
t.Errorf("got '%v', expected '%v'", result, c.expected) t.Errorf("got '%v', expected '%v'", result, c.expected)
} }
@ -150,7 +101,11 @@ func Test_alphaPercent(t *testing.T) {
expected float64 expected float64
}{ }{
{ {
arg: "abcd", arg: "abcdefghijklmnopqrstuvwxyz",
expected: 100.0,
},
{
arg: "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
expected: 100.0, expected: 100.0,
}, },
{ {