generalize cutup code

This commit is contained in:
nate smith 2024-04-28 21:43:16 -07:00
parent 000946c175
commit c66dbaf013
2 changed files with 104 additions and 59 deletions

View File

@ -1,6 +1,8 @@
package cmd package cmd
import ( import (
"fmt"
"github.com/spf13/cobra" "github.com/spf13/cobra"
"github.com/vilmibm/trunkless/cutup" "github.com/vilmibm/trunkless/cutup"
) )
@ -9,6 +11,7 @@ func init() {
rootCmd.Flags().StringP("cutupdir", "d", "", "directory in which to write phrase files") rootCmd.Flags().StringP("cutupdir", "d", "", "directory in which to write phrase files")
rootCmd.Flags().StringP("srcdir", "s", "", "directory of files to cut up") rootCmd.Flags().StringP("srcdir", "s", "", "directory of files to cut up")
rootCmd.Flags().IntP("workers", "w", 10, "number of workers to use when cutting up") rootCmd.Flags().IntP("workers", "w", 10, "number of workers to use when cutting up")
rootCmd.Flags().StringP("flavor", "f", "", "set of adapters to use when cutting up")
rootCmd.MarkFlagRequired("cutupdir") rootCmd.MarkFlagRequired("cutupdir")
rootCmd.MarkFlagRequired("srcdir") rootCmd.MarkFlagRequired("srcdir")
@ -16,6 +19,8 @@ func init() {
rootCmd.AddCommand(cutupCmd) rootCmd.AddCommand(cutupCmd)
} }
// validFlavors lists the values accepted by the --flavor flag.
var validFlavors = []string{
	"gutenberg",
}
var cutupCmd = &cobra.Command{ var cutupCmd = &cobra.Command{
Use: "cutup", Use: "cutup",
Args: cobra.MaximumNArgs(1), Args: cobra.MaximumNArgs(1),
@ -23,10 +28,24 @@ var cutupCmd = &cobra.Command{
cutupdir, _ := cmd.Flags().GetString("cutupdir") cutupdir, _ := cmd.Flags().GetString("cutupdir")
srcdir, _ := cmd.Flags().GetString("srcdir") srcdir, _ := cmd.Flags().GetString("srcdir")
workers, _ := cmd.Flags().GetInt("workers") workers, _ := cmd.Flags().GetInt("workers")
flavor, _ := cmd.Flags().GetString("flavor")
if flavor != "" {
valid := false
for _, f := range validFlavors {
if flavor == f {
valid = true
}
}
if !valid {
return fmt.Errorf("invalid flavor '%s'; valid flavors: %v", flavor, validFlavors)
}
}
opts := cutup.CutupOpts{ opts := cutup.CutupOpts{
CutupDir: cutupdir, CutupDir: cutupdir,
SrcDir: srcdir, SrcDir: srcdir,
NumWorkers: workers, NumWorkers: workers,
Flavor: flavor,
} }
return cutup.Cutup(opts) return cutup.Cutup(opts)
}, },

View File

@ -10,7 +10,83 @@ import (
"github.com/vilmibm/trunkless/db" "github.com/vilmibm/trunkless/db"
) )
// CutupOpts configures a call to Cutup.
type CutupOpts struct {
	// SrcDir is the directory whose entries are cut up.
	SrcDir string
	// CutupDir is the directory that Cutup creates and writes its output to.
	CutupDir string
	// NumWorkers is the number of worker goroutines Cutup starts.
	NumWorkers int
	// Flavor selects the header/footer detection; "gutenberg" enables the
	// Project Gutenberg checks and any other value uses the defaults.
	Flavor string

	// headerEndCheck and footerBeginCheck are filled in by Cutup based on
	// Flavor; callers outside the package cannot set them.
	headerEndCheck   func(string) bool
	footerBeginCheck func(string) bool
}
// defaultHeaderEndCheck is the header check used when no flavor-specific one
// applies. It reports true for every line, whatever its content.
func defaultHeaderEndCheck(_ string) bool {
	return true
}
// defaultFooterBeginCheck is the footer check used when no flavor-specific one
// applies. It reports false for every line, whatever its content.
func defaultFooterBeginCheck(_ string) bool {
	return false
}
// gutenbergHeaderEndCheck reports whether s begins with the "*** START"
// marker.
func gutenbergHeaderEndCheck(s string) bool {
	_, hasMarker := strings.CutPrefix(s, "*** START")
	return hasMarker
}
// gutenbergFooterBeginCheck reports whether s begins with the "*** END"
// marker.
func gutenbergFooterBeginCheck(s string) bool {
	_, hasMarker := strings.CutPrefix(s, "*** END")
	return hasMarker
}
// extractGutenbergTitle derives a title from a start-marker line. It drops a
// leading "*** START OF THE PROJECT GUTENBERG" and then a leading " EBOOK"
// when present, runs the remainder through strings.Map with rep (defined
// elsewhere in this package), and trims surrounding whitespace.
func extractGutenbergTitle(s string) string {
	rest := strings.TrimPrefix(s, "*** START OF THE PROJECT GUTENBERG")
	rest = strings.TrimPrefix(rest, " EBOOK")
	mapped := strings.Map(rep, rest)
	return strings.TrimSpace(mapped)
}
// Cutup creates opts.CutupDir, hands every entry of opts.SrcDir to a pool of
// worker goroutines, and writes each line the workers report to
// "_title_index.csv" inside opts.CutupDir.
//
// opts.Flavor selects the header/footer checks handed to the workers:
// "gutenberg" installs the Project Gutenberg checks, any other value installs
// the defaults. opts.NumWorkers values below 1 are treated as 1.
//
// An error is returned if opts.CutupDir cannot be created (os.Mkdir is used,
// so an already existing directory is an error), if opts.SrcDir cannot be
// opened or listed, or if the index file cannot be created, written or
// closed.
func Cutup(opts CutupOpts) error {
	if opts.Flavor == "gutenberg" {
		opts.headerEndCheck = gutenbergHeaderEndCheck
		opts.footerBeginCheck = gutenbergFooterBeginCheck
	} else {
		opts.headerEndCheck = defaultHeaderEndCheck
		opts.footerBeginCheck = defaultFooterBeginCheck
	}

	// With no workers nothing would ever send on sources and the collection
	// loop below would block forever, so always run at least one.
	numWorkers := opts.NumWorkers
	if numWorkers < 1 {
		numWorkers = 1
	}

	if err := os.Mkdir(opts.CutupDir, 0775); err != nil {
		return fmt.Errorf("could not make '%s': %w", opts.CutupDir, err)
	}

	src, err := os.Open(opts.SrcDir)
	if err != nil {
		return fmt.Errorf("could not open '%s' for reading: %w", opts.SrcDir, err)
	}
	entries, err := src.Readdirnames(-1)
	// The handle is only needed for the listing; release it right away
	// instead of leaking it for the rest of the run.
	src.Close()
	if err != nil {
		return fmt.Errorf("could not read '%s': %w", opts.SrcDir, err)
	}

	// Create the index before starting any workers so that a failure here
	// does not leave goroutines running behind a returned error.
	ixPath := path.Join(opts.CutupDir, "_title_index.csv")
	ixFile, err := os.Create(ixPath)
	if err != nil {
		return fmt.Errorf("could not open '%s': %w", ixPath, err)
	}
	// Safety net for early returns; the success path closes explicitly below
	// so that a failed close is reported.
	defer ixFile.Close()

	// Both channels are buffered to len(entries) so that neither queuing the
	// paths nor reporting results can block.
	paths := make(chan string, len(entries))
	sources := make(chan string, len(entries))

	for x := 0; x < numWorkers; x++ {
		go worker(opts, paths, sources)
	}

	for _, e := range entries {
		paths <- path.Join(opts.SrcDir, e)
	}
	close(paths)

	// NOTE(review): this relies on worker sending exactly one line on sources
	// per path it receives — confirm against worker.
	for i := 0; i < len(entries); i++ {
		l := <-sources
		fmt.Printf("%d/%d\r", i+1, len(entries))
		if _, err := fmt.Fprintln(ixFile, l); err != nil {
			return fmt.Errorf("could not write to '%s': %w", ixPath, err)
		}
	}
	// sources is deliberately not closed: this function is its only receiver
	// and nothing ranges over it.

	if err := ixFile.Close(); err != nil {
		return fmt.Errorf("could not close '%s': %w", ixPath, err)
	}

	return nil
}
func worker(opts CutupOpts, paths <-chan string, sources chan<- string) { func worker(opts CutupOpts, paths <-chan string, sources chan<- string) {
for p := range paths { for p := range paths {
@ -34,18 +110,19 @@ func worker(opts CutupOpts, paths <-chan string, sources chan<- string) {
for s.Scan() { for s.Scan() {
text = strings.TrimSpace(s.Text()) text = strings.TrimSpace(s.Text())
if strings.HasPrefix(text, "*** START") { if opts.headerEndCheck(text) {
title, _ = strings.CutPrefix(text, "*** START OF THE PROJECT GUTENBERG") if opts.Flavor == "gutenberg" {
title, _ = strings.CutPrefix(title, " EBOOK") title = extractGutenbergTitle(text)
title = strings.Map(rep, title) continue
title = strings.TrimSpace(title) } else {
title = path.Base(p)
}
inHeader = false inHeader = false
continue
} }
if inHeader { if inHeader {
continue continue
} }
if strings.HasPrefix(text, "*** END") { if opts.footerBeginCheck(text) {
break break
} }
if title == "" { if title == "" {
@ -216,54 +293,3 @@ func clean(bs []byte) string {
return s return s
} }
// CutupOpts configures a call to Cutup.
type CutupOpts struct {
	// SrcDir is the directory whose entries are cut up; CutupDir is the
	// directory that Cutup creates and writes its output to.
	SrcDir, CutupDir string
	// NumWorkers is the number of worker goroutines Cutup starts.
	NumWorkers int
}
// Cutup creates opts.CutupDir, hands every entry of opts.SrcDir to a pool of
// worker goroutines, and writes each line the workers report to
// "_title_index.csv" inside opts.CutupDir.
//
// opts.NumWorkers values below 1 are treated as 1.
//
// An error is returned if opts.CutupDir cannot be created (os.Mkdir is used,
// so an already existing directory is an error), if opts.SrcDir cannot be
// opened or listed, or if the index file cannot be created, written or
// closed.
func Cutup(opts CutupOpts) error {
	// With no workers nothing would ever send on sources and the collection
	// loop below would block forever, so always run at least one.
	numWorkers := opts.NumWorkers
	if numWorkers < 1 {
		numWorkers = 1
	}

	if err := os.Mkdir(opts.CutupDir, 0775); err != nil {
		return fmt.Errorf("could not make '%s': %w", opts.CutupDir, err)
	}

	src, err := os.Open(opts.SrcDir)
	if err != nil {
		return fmt.Errorf("could not open '%s' for reading: %w", opts.SrcDir, err)
	}
	entries, err := src.Readdirnames(-1)
	// The handle is only needed for the listing; release it right away
	// instead of leaking it for the rest of the run.
	src.Close()
	if err != nil {
		return fmt.Errorf("could not read '%s': %w", opts.SrcDir, err)
	}

	// Create the index before starting any workers so that a failure here
	// does not leave goroutines running behind a returned error.
	ixPath := path.Join(opts.CutupDir, "_title_index.csv")
	ixFile, err := os.Create(ixPath)
	if err != nil {
		return fmt.Errorf("could not open '%s': %w", ixPath, err)
	}
	// Safety net for early returns; the success path closes explicitly below
	// so that a failed close is reported.
	defer ixFile.Close()

	// Both channels are buffered to len(entries) so that neither queuing the
	// paths nor reporting results can block.
	paths := make(chan string, len(entries))
	sources := make(chan string, len(entries))

	for x := 0; x < numWorkers; x++ {
		go worker(opts, paths, sources)
	}

	for _, e := range entries {
		paths <- path.Join(opts.SrcDir, e)
	}
	close(paths)

	// NOTE(review): this relies on worker sending exactly one line on sources
	// per path it receives — confirm against worker.
	for i := 0; i < len(entries); i++ {
		l := <-sources
		fmt.Printf("%d/%d\r", i+1, len(entries))
		if _, err := fmt.Fprintln(ixFile, l); err != nil {
			return fmt.Errorf("could not write to '%s': %w", ixPath, err)
		}
	}
	// sources is deliberately not closed: this function is its only receiver
	// and nothing ranges over it.

	if err := ixFile.Close(); err != nil {
		return fmt.Errorf("could not close '%s': %w", ixPath, err)
	}

	return nil
}