generalize cutup code

This commit is contained in:
nate smith 2024-04-28 21:43:16 -07:00
parent 000946c175
commit c66dbaf013
2 changed files with 104 additions and 59 deletions

View File

@ -1,6 +1,8 @@
package cmd
import (
"fmt"
"github.com/spf13/cobra"
"github.com/vilmibm/trunkless/cutup"
)
@ -9,6 +11,7 @@ func init() {
rootCmd.Flags().StringP("cutupdir", "d", "", "directory in which to write phrase files")
rootCmd.Flags().StringP("srcdir", "s", "", "directory of files to cut up")
rootCmd.Flags().IntP("workers", "w", 10, "number of workers to use when cutting up")
rootCmd.Flags().StringP("flavor", "f", "", "set of adapters to use when cutting up")
rootCmd.MarkFlagRequired("cutupdir")
rootCmd.MarkFlagRequired("srcdir")
@ -16,6 +19,8 @@ func init() {
rootCmd.AddCommand(cutupCmd)
}
// validFlavors enumerates the values accepted for the "flavor" flag; an
// empty flavor is also allowed and bypasses this list.
var validFlavors = []string{
	"gutenberg",
}
var cutupCmd = &cobra.Command{
Use: "cutup",
Args: cobra.MaximumNArgs(1),
@ -23,10 +28,24 @@ var cutupCmd = &cobra.Command{
cutupdir, _ := cmd.Flags().GetString("cutupdir")
srcdir, _ := cmd.Flags().GetString("srcdir")
workers, _ := cmd.Flags().GetInt("workers")
flavor, _ := cmd.Flags().GetString("flavor")
if flavor != "" {
valid := false
for _, f := range validFlavors {
if flavor == f {
valid = true
}
}
if !valid {
return fmt.Errorf("invalid flavor '%s'; valid flavors: %v", flavor, validFlavors)
}
}
opts := cutup.CutupOpts{
CutupDir: cutupdir,
SrcDir: srcdir,
NumWorkers: workers,
Flavor: flavor,
}
return cutup.Cutup(opts)
},

View File

@ -10,7 +10,83 @@ import (
"github.com/vilmibm/trunkless/db"
)
// TODO generalize so it's not gutenberg specific
// CutupOpts carries the settings for one Cutup run.
type CutupOpts struct {
	// SrcDir is the directory whose entries are cut up; CutupDir is the
	// directory Cutup creates and writes its title index into.
	SrcDir, CutupDir string
	// NumWorkers is the number of worker goroutines Cutup starts.
	NumWorkers int
	// Flavor selects the header/footer checks: "gutenberg" picks the
	// Project Gutenberg ones, any other value picks the defaults.
	Flavor string

	// Line predicates installed by Cutup according to Flavor; any value set
	// before calling Cutup is overwritten.
	headerEndCheck, footerBeginCheck func(string) bool
}
// defaultHeaderEndCheck is the header-end predicate used when no specific
// flavor applies. It ignores the line and always reports true.
func defaultHeaderEndCheck(_ string) bool {
	return true
}
// defaultFooterBeginCheck is the footer-start predicate used when no specific
// flavor applies. It ignores the line and always reports false.
func defaultFooterBeginCheck(_ string) bool {
	return false
}
// gutenbergHeaderEndCheck reports whether s begins with the "*** START"
// marker. The match is exact and case-sensitive; leading whitespace in s is
// not skipped.
func gutenbergHeaderEndCheck(s string) bool {
	const marker = "*** START"
	return strings.HasPrefix(s, marker)
}
// gutenbergFooterBeginCheck reports whether s begins with the "*** END"
// marker. The match is exact and case-sensitive; leading whitespace in s is
// not skipped.
func gutenbergFooterBeginCheck(s string) bool {
	const marker = "*** END"
	return strings.HasPrefix(s, marker)
}
// extractGutenbergTitle derives a title from a Project Gutenberg start-marker
// line. It strips the leading "*** START OF THE PROJECT GUTENBERG" text and
// then a following " EBOOK" (each only when present), runs the remainder
// through strings.Map with rep, and trims surrounding whitespace.
//
// rep is declared outside this block; which runes it rewrites or drops is not
// visible here.
func extractGutenbergTitle(s string) string {
	// TrimPrefix leaves the string untouched when the prefix is absent, so a
	// line with different wording simply keeps its text.
	rest := strings.TrimPrefix(s, "*** START OF THE PROJECT GUTENBERG")
	rest = strings.TrimPrefix(rest, " EBOOK")
	mapped := strings.Map(rep, rest)
	return strings.TrimSpace(mapped)
}
// Cutup processes every entry of opts.SrcDir with a pool of worker goroutines
// and records what the workers report in "_title_index.csv" inside
// opts.CutupDir.
//
// Steps:
//   - install the header/footer line checks matching opts.Flavor
//     ("gutenberg" or, for anything else, the defaults);
//   - create opts.CutupDir with os.Mkdir, so an existing directory or a
//     missing parent is an error;
//   - list opts.SrcDir and queue one path per entry;
//   - start opts.NumWorkers workers and write each string they send as one
//     line of the index file, printing "done/total" progress to stdout.
//
// It returns an error when opts.NumWorkers is less than 1, when the output
// directory or index file cannot be created, when the source directory cannot
// be opened or read, or when the index file cannot be written or closed.
func Cutup(opts CutupOpts) error {
	// With no workers nothing would ever arrive on the results channel and
	// the collection loop below would block forever.
	if opts.NumWorkers < 1 {
		return fmt.Errorf("invalid number of workers %d; need at least 1", opts.NumWorkers)
	}

	switch opts.Flavor {
	case "gutenberg":
		opts.headerEndCheck = gutenbergHeaderEndCheck
		opts.footerBeginCheck = gutenbergFooterBeginCheck
	default:
		opts.headerEndCheck = defaultHeaderEndCheck
		opts.footerBeginCheck = defaultFooterBeginCheck
	}

	if err := os.Mkdir(opts.CutupDir, 0775); err != nil {
		return fmt.Errorf("could not make '%s': %w", opts.CutupDir, err)
	}

	src, err := os.Open(opts.SrcDir)
	if err != nil {
		return fmt.Errorf("could not open '%s' for reading: %w", opts.SrcDir, err)
	}
	entries, err := src.Readdirnames(-1)
	// The handle is only needed for the listing; release it before the long
	// running work. A close error on a read-only directory handle carries no
	// actionable information, so it is deliberately ignored.
	_ = src.Close()
	if err != nil {
		return fmt.Errorf("could not read '%s': %w", opts.SrcDir, err)
	}

	// Both channels are buffered to len(entries) so that neither queuing the
	// paths nor the workers' sends can block.
	paths := make(chan string, len(entries))
	sources := make(chan string, len(entries))

	for x := 0; x < opts.NumWorkers; x++ {
		go worker(opts, paths, sources)
	}

	for _, e := range entries {
		paths <- path.Join(opts.SrcDir, e)
	}
	close(paths)

	ixPath := path.Join(opts.CutupDir, "_title_index.csv")
	ixFile, err := os.Create(ixPath)
	if err != nil {
		return fmt.Errorf("could not open '%s': %w", ixPath, err)
	}
	// Safety net for the early returns below; after the explicit Close at the
	// end this second Close just reports "already closed", which is ignored.
	defer ixFile.Close()

	// Exactly len(entries) values are received, which relies on the workers
	// sending one value per path (worker is defined outside this block).
	for i := 0; i < len(entries); i++ {
		l := <-sources
		fmt.Printf("%d/%d\r", i+1, len(entries))
		if _, err := fmt.Fprintln(ixFile, l); err != nil {
			return fmt.Errorf("could not write to '%s': %w", ixPath, err)
		}
	}
	close(sources)

	if err := ixFile.Close(); err != nil {
		return fmt.Errorf("could not close '%s': %w", ixPath, err)
	}

	return nil
}
func worker(opts CutupOpts, paths <-chan string, sources chan<- string) {
for p := range paths {
@ -34,18 +110,19 @@ func worker(opts CutupOpts, paths <-chan string, sources chan<- string) {
for s.Scan() {
text = strings.TrimSpace(s.Text())
if strings.HasPrefix(text, "*** START") {
title, _ = strings.CutPrefix(text, "*** START OF THE PROJECT GUTENBERG")
title, _ = strings.CutPrefix(title, " EBOOK")
title = strings.Map(rep, title)
title = strings.TrimSpace(title)
if opts.headerEndCheck(text) {
if opts.Flavor == "gutenberg" {
title = extractGutenbergTitle(text)
continue
} else {
title = path.Base(p)
}
inHeader = false
continue
}
if inHeader {
continue
}
if strings.HasPrefix(text, "*** END") {
if opts.footerBeginCheck(text) {
break
}
if title == "" {
@ -216,54 +293,3 @@ func clean(bs []byte) string {
return s
}
// CutupOpts carries the settings for one Cutup run.
type CutupOpts struct {
	// SrcDir is the directory whose entries are cut up; CutupDir is the
	// directory Cutup creates and writes its title index into.
	SrcDir, CutupDir string
	// NumWorkers is the number of worker goroutines Cutup starts.
	NumWorkers int
}
// Cutup processes every entry of opts.SrcDir with a pool of worker goroutines
// and records what the workers report in "_title_index.csv" inside
// opts.CutupDir.
//
// It creates opts.CutupDir with os.Mkdir (so an existing directory or a
// missing parent is an error), lists opts.SrcDir, queues one path per entry
// for opts.NumWorkers workers, and writes each string they send as one line
// of the index file while printing "done/total" progress to stdout.
//
// It returns an error when opts.NumWorkers is less than 1, when the output
// directory or index file cannot be created, when the source directory cannot
// be opened or read, or when the index file cannot be written or closed.
func Cutup(opts CutupOpts) error {
	// With no workers nothing would ever arrive on the results channel and
	// the collection loop below would block forever.
	if opts.NumWorkers < 1 {
		return fmt.Errorf("invalid number of workers %d; need at least 1", opts.NumWorkers)
	}

	if err := os.Mkdir(opts.CutupDir, 0775); err != nil {
		return fmt.Errorf("could not make '%s': %w", opts.CutupDir, err)
	}

	src, err := os.Open(opts.SrcDir)
	if err != nil {
		return fmt.Errorf("could not open '%s' for reading: %w", opts.SrcDir, err)
	}
	entries, err := src.Readdirnames(-1)
	// The handle is only needed for the listing; a close error on a
	// read-only directory handle is not actionable, so it is ignored.
	_ = src.Close()
	if err != nil {
		return fmt.Errorf("could not read '%s': %w", opts.SrcDir, err)
	}

	// Both channels are buffered to len(entries) so that neither queuing the
	// paths nor the workers' sends can block.
	paths := make(chan string, len(entries))
	sources := make(chan string, len(entries))

	for x := 0; x < opts.NumWorkers; x++ {
		go worker(opts, paths, sources)
	}

	for _, e := range entries {
		paths <- path.Join(opts.SrcDir, e)
	}
	close(paths)

	ixPath := path.Join(opts.CutupDir, "_title_index.csv")
	ixFile, err := os.Create(ixPath)
	if err != nil {
		return fmt.Errorf("could not open '%s': %w", ixPath, err)
	}
	// Safety net for the early returns below; after the explicit Close at the
	// end this second Close just reports "already closed", which is ignored.
	defer ixFile.Close()

	// Exactly len(entries) values are received, which relies on the workers
	// sending one value per path (worker is defined outside this block).
	for i := 0; i < len(entries); i++ {
		l := <-sources
		fmt.Printf("%d/%d\r", i+1, len(entries))
		if _, err := fmt.Fprintln(ixFile, l); err != nil {
			return fmt.Errorf("could not write to '%s': %w", ixPath, err)
		}
	}
	close(sources)

	if err := ixFile.Close(); err != nil {
		return fmt.Errorf("could not close '%s': %w", ixPath, err)
	}

	return nil
}