diff --git a/cmd/cutup.go b/cmd/cutup.go index fdd43c8..9b99fea 100644 --- a/cmd/cutup.go +++ b/cmd/cutup.go @@ -1,6 +1,8 @@ package cmd import ( + "fmt" + "github.com/spf13/cobra" "github.com/vilmibm/trunkless/cutup" ) @@ -9,6 +11,7 @@ func init() { rootCmd.Flags().StringP("cutupdir", "d", "", "directory in which to write phrase files") rootCmd.Flags().StringP("srcdir", "s", "", "directory of files to cut up") rootCmd.Flags().IntP("workers", "w", 10, "number of workers to use when cutting up") + rootCmd.Flags().StringP("flavor", "f", "", "set of adapters to use when cutting up") rootCmd.MarkFlagRequired("cutupdir") rootCmd.MarkFlagRequired("srcdir") @@ -16,6 +19,8 @@ func init() { rootCmd.AddCommand(cutupCmd) } +var validFlavors = []string{"gutenberg"} + var cutupCmd = &cobra.Command{ Use: "cutup", Args: cobra.MaximumNArgs(1), @@ -23,10 +28,24 @@ var cutupCmd = &cobra.Command{ cutupdir, _ := cmd.Flags().GetString("cutupdir") srcdir, _ := cmd.Flags().GetString("srcdir") workers, _ := cmd.Flags().GetInt("workers") + flavor, _ := cmd.Flags().GetString("flavor") + + if flavor != "" { + valid := false + for _, f := range validFlavors { + if flavor == f { + valid = true + } + } + if !valid { + return fmt.Errorf("invalid flavor '%s'; valid flavors: %v", flavor, validFlavors) + } + } opts := cutup.CutupOpts{ CutupDir: cutupdir, SrcDir: srcdir, NumWorkers: workers, + Flavor: flavor, } return cutup.Cutup(opts) }, diff --git a/cutup/cutup.go b/cutup/cutup.go index 7d9ad44..c8c8556 100644 --- a/cutup/cutup.go +++ b/cutup/cutup.go @@ -10,7 +10,83 @@ import ( "github.com/vilmibm/trunkless/db" ) -// TODO generalize so it's not gutenberg specific +type CutupOpts struct { + SrcDir string + CutupDir string + NumWorkers int + Flavor string + headerEndCheck func(string) bool + footerBeginCheck func(string) bool +} + +func defaultHeaderEndCheck(string) bool { return true } +func defaultFooterBeginCheck(string) bool { return false } + +func gutenbergHeaderEndCheck(s string) bool { + return strings.HasPrefix(s, "*** START") +} + +func gutenbergFooterBeginCheck(s string) bool { + return strings.HasPrefix(s, "*** END") +} + +func extractGutenbergTitle(s string) string { + title, _ := strings.CutPrefix(s, "*** START OF THE PROJECT GUTENBERG") + title, _ = strings.CutPrefix(title, " EBOOK") + return strings.TrimSpace(strings.Map(rep, title)) +} + +func Cutup(opts CutupOpts) error { + if opts.Flavor == "gutenberg" { + opts.headerEndCheck = gutenbergHeaderEndCheck + opts.footerBeginCheck = gutenbergFooterBeginCheck + } else { + opts.headerEndCheck = defaultHeaderEndCheck + opts.footerBeginCheck = defaultFooterBeginCheck + } + err := os.Mkdir(opts.CutupDir, 0775) + if err != nil { + return fmt.Errorf("could not make '%s': %w", opts.CutupDir, err) + } + + src, err := os.Open(opts.SrcDir) + if err != nil { + return fmt.Errorf("could not open '%s' for reading: %w", opts.SrcDir, err) + } + + entries, err := src.Readdirnames(-1) + if err != nil { + return fmt.Errorf("could not read '%s': %w", opts.SrcDir, err) + } + + paths := make(chan string, len(entries)) + sources := make(chan string, len(entries)) + + for x := 0; x < opts.NumWorkers; x++ { + go worker(opts, paths, sources) + } + + for _, e := range entries { + paths <- path.Join(opts.SrcDir, e) + } + close(paths) + + ixPath := path.Join(opts.CutupDir, "_title_index.csv") + ixFile, err := os.Create(ixPath) + if err != nil { + return fmt.Errorf("could not open '%s': %w", ixPath, err) + } + defer ixFile.Close() + + for i := 0; i < len(entries); i++ { + l := <-sources + fmt.Printf("%d/%d\r", i+1, len(entries)) + fmt.Fprintln(ixFile, l) + } + close(sources) + + return nil +} func worker(opts CutupOpts, paths <-chan string, sources chan<- string) { for p := range paths { @@ -34,18 +110,19 @@ func worker(opts CutupOpts, paths <-chan string, sources chan<- string) { for s.Scan() { text = strings.TrimSpace(s.Text()) - if strings.HasPrefix(text, "*** START") { - title, _ = strings.CutPrefix(text, "*** START OF THE PROJECT GUTENBERG") - title, _ = strings.CutPrefix(title, " EBOOK") - title = strings.Map(rep, title) - title = strings.TrimSpace(title) + if opts.headerEndCheck(text) { + if opts.Flavor == "gutenberg" { + title = extractGutenbergTitle(text) + continue + } else { + title = path.Base(p) + } inHeader = false - continue } if inHeader { continue } - if strings.HasPrefix(text, "*** END") { + if opts.footerBeginCheck(text) { break } if title == "" { @@ -216,54 +293,3 @@ func clean(bs []byte) string { return s } - -type CutupOpts struct { - SrcDir string - CutupDir string - NumWorkers int -} - -func Cutup(opts CutupOpts) error { - err := os.Mkdir(opts.CutupDir, 0775) - if err != nil { - return fmt.Errorf("could not make '%s': %w", opts.CutupDir, err) - } - - src, err := os.Open(opts.SrcDir) - if err != nil { - return fmt.Errorf("could not open '%s' for reading: %w", opts.SrcDir, err) - } - - entries, err := src.Readdirnames(-1) - if err != nil { - return fmt.Errorf("could not read '%s': %w", opts.SrcDir, err) - } - - paths := make(chan string, len(entries)) - sources := make(chan string, len(entries)) - - for x := 0; x < opts.NumWorkers; x++ { - go worker(opts, paths, sources) - } - - for _, e := range entries { - paths <- path.Join(opts.SrcDir, e) - } - close(paths) - - ixPath := path.Join(opts.CutupDir, "_title_index.csv") - ixFile, err := os.Create(ixPath) - if err != nil { - return fmt.Errorf("could not open '%s': %w", ixPath, err) - } - defer ixFile.Close() - - for i := 0; i < len(entries); i++ { - l := <-sources - fmt.Printf("%d/%d\r", i+1, len(entries)) - fmt.Fprintln(ixFile, l) - } - close(sources) - - return nil -}