trunkless/cutup/cutup.go

package cutup

import (
	"bufio"
	"fmt"
	"os"
	"path"
	"strings"

	"github.com/vilmibm/trunkless/db"
)

// TODO generalize so it's not gutenberg specific
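// worker consumes file paths, strips the Project Gutenberg header and footer
// from each text, cuts the body into short phrases, and writes them as
// "sourceid<TAB>phrase" lines to a per-source file in opts.CutupDir. It sends
// one "sourceid<TAB>title" record on sources for every path it receives.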
func worker(opts CutupOpts, paths <-chan string, sources chan<- string) {
	for p := range paths {
		f, err := os.Open(p)
		if err != nil {
			fmt.Fprintf(os.Stderr, "failed to open '%s': %s\n", p, err.Error())
			// still send a record so Cutup's index loop receives one line per path
			sources <- "\t"
			continue
		}
		s := bufio.NewScanner(f)
		phraseBuff := []byte{}
		written := 0
		inHeader := true
		title := ""
		sourceid := ""
		var of *os.File
		var cleaned string
		var asStr string
		var text string
		var prefix string
		for s.Scan() {
			text = strings.TrimSpace(s.Text())
			// the "*** START ..." marker ends the Gutenberg header and carries the title
			if strings.HasPrefix(text, "*** START") {
				title, _ = strings.CutPrefix(text, "*** START OF THE PROJECT GUTENBERG")
				title, _ = strings.CutPrefix(title, " EBOOK")
				title = strings.Map(rep, title)
				title = strings.TrimSpace(title)
				inHeader = false
				continue
			}
			if inHeader {
				continue
			}
			// the "*** END ..." marker starts the Gutenberg license footer
			if strings.HasPrefix(text, "*** END") {
				break
			}
			if title == "" {
				fmt.Fprintf(os.Stderr, "got to cutup phase with no title: '%s'\n", p)
				break
			}
			if sourceid == "" {
				sourceid = db.StrToID(title)
				prefix = sourceid + "\t"
				of, err = os.Create(path.Join(opts.CutupDir, sourceid))
				if err != nil {
					fmt.Fprintf(os.Stderr, "could not open '%s' for writing: %s\n", sourceid, err.Error())
					break
				}
			}
			// walk the line rune by rune, flushing the buffer at each phrase boundary
			for i, r := range text {
				if v := shouldBreak(phraseBuff, r); v > 0 {
					phraseBuff = phraseBuff[0 : len(phraseBuff)-v]
					if len(phraseBuff) >= 10 {
						cleaned = clean(phraseBuff)
						if len(cleaned) > 0 {
							fmt.Fprintln(of, prefix+cleaned)
							written++
						}
					}
					phraseBuff = []byte{}
				} else {
					asStr = string(phraseBuff)
					if r == ' ' && strings.HasSuffix(asStr, " ") {
						continue
					}
					// a new input line continues the current phrase; join it with a space
					if i == 0 && len(phraseBuff) > 0 && phraseBuff[len(phraseBuff)-1] != ' ' && r != ' ' {
						phraseBuff = append(phraseBuff, byte(' '))
					}
					// append the rune's full UTF-8 encoding; byte(r) would mangle multibyte runes
					phraseBuff = append(phraseBuff, string(r)...)
				}
			}
		}
		f.Close()
		// of is nil when the file never had a recognizable start marker
		if of != nil {
			of.Close()
		}
		if written == 0 {
			// there are a bunch of empty books in gutenberg :( these are text files
			// that just have start and end markers with nothing in between. nothing
			// i can do about it.
			fmt.Fprintf(os.Stderr, "WARN: no content found in '%s' '%s'\n", sourceid, p)
		}
		sources <- fmt.Sprintf("%s\t%s", sourceid, title)
	}
}
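
// phraseMarkers are runes that always end the phrase currently being built.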
var phraseMarkers = map[rune]bool{
	';': true,
	',': true,
	':': true,
	'.': true,
	'?': true,
	'!': true,
	')': true,
	'}': true,
	']': true,
	'”': true,
	'=': true,
	'`': true,
	'-': true,
	'|': true,
	'>': true,
}
var suffices = []string{"from", "at", "but", "however", "yet", "though", "and", "to", "on", "or"}

const maxSuffixLen = 8 // magic number based on longest suffix ("however") plus its leading space
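
// shouldBreak reports whether seeing r means the phrase accumulated in
// phraseBuff should end. A positive return is the number of trailing bytes the
// caller trims from phraseBuff before flushing it; -1 means keep accumulating.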
func shouldBreak(phraseBuff []byte, r rune) int {
	if ok := phraseMarkers[r]; ok {
		return 1
	}
	if r != ' ' {
		return -1
	}
	offset := len(phraseBuff) - maxSuffixLen
	if offset < 0 {
		offset = 0
	}
	end := string(phraseBuff[offset:])
	for _, s := range suffices {
		if strings.HasSuffix(end, " "+s) {
			return len(s)
		}
	}
	return -1
}
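
// alphaPercent returns the percentage of runes in s that are ASCII letters.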
func alphaPercent(s string) float64 {
	total := 0.0
	alpha := 0.0
	for _, r := range s {
		total++
		if (r >= 'A' && r <= 'Z') || (r >= 'a' && r <= 'z') {
			alpha++
		}
	}
	return 100 * (alpha / total)
}
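
// rep normalizes or strips troublesome runes for use with strings.Map: curly
// quotes are straightened, while straight double quotes, opening brackets,
// markup characters, and stray control bytes are dropped (-1), and backslashes
// become forward slashes.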
func rep(r rune) (s rune) {
	s = r
	switch s {
	case '’':
		return '\''
	case '“':
		return '"'
	case '”':
		return '"'
	case '"':
		return -1
	case '(':
		return -1
	case '[':
		return -1
	case '{':
		return -1
	case '<':
		return -1
	case '_':
		return -1
	case '*':
		return -1
	case '\r':
		return -1
	case '\t':
		return -1
	case '\n': // should not need this but stray \n ending up in output...
		return -1
	case 0x1c:
		return -1
	case 0x19:
		return -1
	case 0x01:
		return -1
	case 0x0f:
		return -1
	case 0x00:
		return -1
	case 0xb0:
		return -1
	case 0x1b:
		return -1
	case '\\':
		return '/'
	}
	return
}
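
// clean lowercases a phrase, trims whitespace and leading/trailing quotes,
// strips invalid UTF-8 and the runes rep rejects, and returns "" for anything
// that is less than half ASCII letters.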
func clean(bs []byte) string {
	s := strings.ToLower(
		strings.TrimSpace(
			strings.TrimRight(
				strings.TrimLeft(
					strings.Map(rep, strings.ToValidUTF8(string(bs), "")), "'\""), "'\"")))
	if alphaPercent(s) < 50.0 {
		return ""
	}
	return s
}
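
// CutupOpts configures a Cutup run.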
type CutupOpts struct {
	SrcDir     string // directory of Project Gutenberg plain text files to read
	CutupDir   string // directory where per-source phrase files and the title index are written
	NumWorkers int    // number of concurrent worker goroutines
}
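
// Cutup creates opts.CutupDir, lists every file in opts.SrcDir, fans the paths
// out to opts.NumWorkers worker goroutines, and writes a "_title_index.csv" of
// tab-separated sourceid/title pairs alongside the per-source phrase files.
// A minimal sketch of a caller (the directory names and worker count below are
// placeholders, not part of this package):
//
//	opts := CutupOpts{
//		SrcDir:     "gutenberg-texts",
//		CutupDir:   "cutup-output",
//		NumWorkers: 8,
//	}
//	if err := Cutup(opts); err != nil {
//		log.Fatal(err)
//	}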
func Cutup(opts CutupOpts) error {
	err := os.Mkdir(opts.CutupDir, 0775)
	if err != nil {
		return fmt.Errorf("could not make '%s': %w", opts.CutupDir, err)
	}
	src, err := os.Open(opts.SrcDir)
	if err != nil {
		return fmt.Errorf("could not open '%s' for reading: %w", opts.SrcDir, err)
	}
	entries, err := src.Readdirnames(-1)
	src.Close()
	if err != nil {
		return fmt.Errorf("could not read '%s': %w", opts.SrcDir, err)
	}
	paths := make(chan string, len(entries))
	sources := make(chan string, len(entries))
	for x := 0; x < opts.NumWorkers; x++ {
		go worker(opts, paths, sources)
	}
	for _, e := range entries {
		paths <- path.Join(opts.SrcDir, e)
	}
	close(paths)
	ixPath := path.Join(opts.CutupDir, "_title_index.csv")
	ixFile, err := os.Create(ixPath)
	if err != nil {
		return fmt.Errorf("could not open '%s': %w", ixPath, err)
	}
	defer ixFile.Close()
	for i := 0; i < len(entries); i++ {
		l := <-sources
		fmt.Printf("%d/%d\r", i+1, len(entries))
		fmt.Fprintln(ixFile, l)
	}
	close(sources)
	return nil
}