trunkless/cutup/cutup.go

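// Package cutup splits a directory of source texts into phrase files: each
// input file is cut into short phrases, written one per line (prefixed with a
// source id and a tab) into CutupDir, alongside a _title_index.csv that maps
// source ids to titles.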
package cutup

import (
	"bufio"
	"fmt"
	"os"
	"path"
	"strings"

	"github.com/vilmibm/trunkless/db"
)

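// CutupOpts configures a cutup run. SrcDir is read for input files, CutupDir
// is created to hold the output, NumWorkers controls concurrency, and Flavor
// selects header/footer handling ("gutenberg" strips Project Gutenberg
// boilerplate and pulls titles from the START marker; anything else treats the
// whole file as content and uses the file name as the title). The two check
// functions are set internally by Cutup based on Flavor.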
type CutupOpts struct {
	SrcDir           string
	CutupDir         string
	NumWorkers       int
	Flavor           string
	headerEndCheck   func(string) bool
	footerBeginCheck func(string) bool
}

func defaultHeaderEndCheck(string) bool { return true }

func defaultFooterBeginCheck(string) bool { return false }

func gutenbergHeaderEndCheck(s string) bool {
	return strings.HasPrefix(s, "*** START")
}

func gutenbergFooterBeginCheck(s string) bool {
	return strings.HasPrefix(s, "*** END")
}

func extractGutenbergTitle(s string) string {
	title, _ := strings.CutPrefix(s, "*** START OF THE PROJECT GUTENBERG")
	title, _ = strings.CutPrefix(title, " EBOOK")
	return strings.TrimSpace(strings.Map(rep, title))
}

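// Cutup fans the files in opts.SrcDir out to opts.NumWorkers workers, collects
// one "sourceid<TAB>title" line per file, and writes those lines to
// _title_index.csv in opts.CutupDir.
//
// A minimal usage sketch; the directory names and worker count below are
// illustrative, not taken from elsewhere in the repository:
//
//	err := cutup.Cutup(cutup.CutupOpts{
//		SrcDir:     "texts",
//		CutupDir:   "cutup",
//		NumWorkers: 8,
//		Flavor:     "gutenberg",
//	})
//	if err != nil {
//		log.Fatal(err)
//	}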
func Cutup(opts CutupOpts) error {
	if opts.Flavor == "gutenberg" {
		opts.headerEndCheck = gutenbergHeaderEndCheck
		opts.footerBeginCheck = gutenbergFooterBeginCheck
	} else {
		opts.headerEndCheck = defaultHeaderEndCheck
		opts.footerBeginCheck = defaultFooterBeginCheck
	}

	err := os.Mkdir(opts.CutupDir, 0775)
	if err != nil {
		return fmt.Errorf("could not make '%s': %w", opts.CutupDir, err)
	}

	src, err := os.Open(opts.SrcDir)
	if err != nil {
		return fmt.Errorf("could not open '%s' for reading: %w", opts.SrcDir, err)
	}

	entries, err := src.Readdirnames(-1)
	if err != nil {
		return fmt.Errorf("could not read '%s': %w", opts.SrcDir, err)
	}

	paths := make(chan string, len(entries))
	sources := make(chan string, len(entries))

	for x := 0; x < opts.NumWorkers; x++ {
		go worker(opts, paths, sources)
	}

	for _, e := range entries {
		paths <- path.Join(opts.SrcDir, e)
	}
	close(paths)

	ixPath := path.Join(opts.CutupDir, "_title_index.csv")
	ixFile, err := os.Create(ixPath)
	if err != nil {
		return fmt.Errorf("could not open '%s': %w", ixPath, err)
	}
	defer ixFile.Close()

	for i := 0; i < len(entries); i++ {
		l := <-sources
		fmt.Printf("%d/%d\r", i+1, len(entries))
		fmt.Fprintln(ixFile, l)
	}
	close(sources)

	return nil
}

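// worker consumes paths from the paths channel, cuts each file into phrases,
// and writes them (prefixed with the source id and a tab) to a per-source
// file in opts.CutupDir. It sends exactly one "sourceid<TAB>title" line per
// path on sources, even when a file cannot be processed, which is what lets
// Cutup count replies to know when every entry has been handled.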
func worker(opts CutupOpts, paths <-chan string, sources chan<- string) {
	for p := range paths {
		f, err := os.Open(p)
		if err != nil {
			fmt.Fprintf(os.Stderr, "failed to open '%s': %s\n", p, err.Error())
		}
		s := bufio.NewScanner(f)

		phraseBuff := []byte{}
		written := 0
		inHeader := true
		title := ""
		sourceid := ""
		var of *os.File
		var cleaned string
		var asStr string
		var text string
		var prefix string

		for s.Scan() {
			text = strings.TrimSpace(s.Text())
			if opts.headerEndCheck(text) {
				// mark the header as finished before the gutenberg branch
				// continues past the marker line, so the body gets processed
				inHeader = false
				if opts.Flavor == "gutenberg" {
					title = extractGutenbergTitle(text)
					continue
				} else {
					title = path.Base(p)
				}
			}
			if inHeader {
				continue
			}
			if opts.footerBeginCheck(text) {
				break
			}
			if title == "" {
				fmt.Fprintf(os.Stderr, "got to cutup phase with no title: '%s'\n", p)
				break
			}
			if sourceid == "" {
				sourceid = db.StrToID(title)
				prefix = sourceid + "\t"
				of, err = os.Create(path.Join(opts.CutupDir, sourceid))
				if err != nil {
					fmt.Fprintf(os.Stderr, "could not open '%s' for writing: %s\n", sourceid, err.Error())
					break
				}
			}
			for i, r := range text {
				if v := shouldBreak(phraseBuff, r); v > 0 {
					phraseBuff = phraseBuff[0 : len(phraseBuff)-v]
					if len(phraseBuff) >= 10 {
						cleaned = clean(phraseBuff)
						if len(cleaned) > 0 {
							fmt.Fprintln(of, prefix+cleaned)
							written++
						}
					}
					phraseBuff = []byte{}
				} else {
					asStr = string(phraseBuff)
					if r == ' ' && strings.HasSuffix(asStr, " ") {
						continue
					}
					if i == 0 && len(phraseBuff) > 0 && phraseBuff[len(phraseBuff)-1] != ' ' && r != ' ' {
						phraseBuff = append(phraseBuff, byte(' '))
					}
					phraseBuff = append(phraseBuff, byte(r))
				}
			}
		}

		of.Close()
		if written == 0 {
			// there are a bunch of empty books in gutenberg :( these are text files
			// that just have start and end markers with nothing in between. nothing
			// i can do about it.
			fmt.Fprintf(os.Stderr, "WARN: no content found in '%s' '%s'\n", sourceid, p)
		}
		sources <- fmt.Sprintf("%s\t%s", sourceid, title)
	}
}

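// phraseMarkers are runes that terminate the phrase currently being
// accumulated; the marker itself is never written to the output.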
var phraseMarkers = map[rune]bool{
	';': true,
	',': true,
	':': true,
	'.': true,
	'?': true,
	'!': true,
	')': true,
	'}': true,
	']': true,
	'”': true,
	'=': true,
	'`': true,
	'-': true,
	'|': true,
	'>': true,
}

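// suffices lists dangling connectives that get trimmed off the end of a
// phrase when a space follows them; maxSuffixLen bounds how far back
// shouldBreak has to look (len(" however")).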
var suffices = []string{"from", "at", "but", "however", "yet", "though", "and", "to", "on", "or"}

const maxSuffixLen = 8 // magic number based on longest suffix

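// shouldBreak decides whether rune r ends the current phrase. It returns -1
// to keep accumulating; otherwise it returns how many trailing bytes the
// caller trims from phraseBuff before flushing: 1 for a phrase marker, or
// len(s) when a space follows one of the suffices (so the dangling "and" in
// "night and" is dropped).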
func shouldBreak(phraseBuff []byte, r rune) int {
	if ok := phraseMarkers[r]; ok {
		return 1
	}
	if r != ' ' {
		return -1
	}
	offset := len(phraseBuff) - maxSuffixLen
	if offset < 0 {
		offset = 0
	}
	end := string(phraseBuff[offset:])
	for _, s := range suffices {
		if strings.HasSuffix(end, " "+s) {
			return len(s)
		}
	}
	return -1
}

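// alphaPercent returns the percentage of characters in s that are ASCII
// letters.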
func alphaPercent(s string) float64 {
	total := 0.0
	alpha := 0.0
	for _, r := range s {
		total++
		if (r >= 'A' && r <= 'Z') || (r >= 'a' && r <= 'z') {
			alpha++
		}
	}
	return 100 * (alpha / total)
}

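// rep normalizes a single rune for output: curly quotes become their ASCII
// equivalents, backslashes become forward slashes, and brackets, markup, and
// a handful of stray control and symbol characters are dropped (returning -1
// removes the rune under strings.Map).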
func rep(r rune) (s rune) {
	s = r
	switch s {
	case '’':
		return '\''
	case '“':
		return '"'
	case '”':
		return '"'
	case '"':
		return -1
	case '(':
		return -1
	case '[':
		return -1
	case '{':
		return -1
	case '<':
		return -1
	case '_':
		return -1
	case '*':
		return -1
	case '\r':
		return -1
	case '\t':
		return -1
	case '\n': // should not need this but stray \n ending up in output...
		return -1
	case 0x1c:
		return -1
	case 0x19:
		return -1
	case 0x01:
		return -1
	case 0x0f:
		return -1
	case 0x00:
		return -1
	case 0xb0:
		return -1
	case 0x1b:
		return -1
	case '\\':
		return '/'
	}
	return
}

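// clean lowercases a phrase, strips invalid UTF-8, applies rep to every rune,
// trims surrounding quotes and whitespace, and throws the phrase away
// (returns "") when less than half of it is ASCII letters.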
func clean(bs []byte) string {
	s := strings.ToLower(
		strings.TrimSpace(
			strings.TrimRight(
				strings.TrimLeft(
					strings.Map(rep, strings.ToValidUTF8(string(bs), "")), "'\""), "'\"")))
	if alphaPercent(s) < 50.0 {
		return ""
	}
	return s
}