generalize cutup code
This commit is contained in:
parent
000946c175
commit
c66dbaf013
19
cmd/cutup.go
19
cmd/cutup.go
@ -1,6 +1,8 @@
|
||||
package cmd
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/spf13/cobra"
|
||||
"github.com/vilmibm/trunkless/cutup"
|
||||
)
|
||||
@ -9,6 +11,7 @@ func init() {
|
||||
rootCmd.Flags().StringP("cutupdir", "d", "", "directory in which to write phrase files")
|
||||
rootCmd.Flags().StringP("srcdir", "s", "", "directory of files to cut up")
|
||||
rootCmd.Flags().IntP("workers", "w", 10, "number of workers to use when cutting up")
|
||||
rootCmd.Flags().StringP("flavor", "f", "", "set of adapters to use when cutting up")
|
||||
|
||||
rootCmd.MarkFlagRequired("cutupdir")
|
||||
rootCmd.MarkFlagRequired("srcdir")
|
||||
@ -16,6 +19,8 @@ func init() {
|
||||
rootCmd.AddCommand(cutupCmd)
|
||||
}
|
||||
|
||||
// validFlavors lists the non-empty values accepted by the "flavor" flag.
// An empty flavor is not checked against this list.
var validFlavors = []string{
	"gutenberg",
}
|
||||
|
||||
var cutupCmd = &cobra.Command{
|
||||
Use: "cutup",
|
||||
Args: cobra.MaximumNArgs(1),
|
||||
@ -23,10 +28,24 @@ var cutupCmd = &cobra.Command{
|
||||
cutupdir, _ := cmd.Flags().GetString("cutupdir")
|
||||
srcdir, _ := cmd.Flags().GetString("srcdir")
|
||||
workers, _ := cmd.Flags().GetInt("workers")
|
||||
flavor, _ := cmd.Flags().GetString("flavor")
|
||||
|
||||
if flavor != "" {
|
||||
valid := false
|
||||
for _, f := range validFlavors {
|
||||
if flavor == f {
|
||||
valid = true
|
||||
}
|
||||
}
|
||||
if !valid {
|
||||
return fmt.Errorf("invalid flavor '%s'; valid flavors: %v", flavor, validFlavors)
|
||||
}
|
||||
}
|
||||
opts := cutup.CutupOpts{
|
||||
CutupDir: cutupdir,
|
||||
SrcDir: srcdir,
|
||||
NumWorkers: workers,
|
||||
Flavor: flavor,
|
||||
}
|
||||
return cutup.Cutup(opts)
|
||||
},
|
||||
|
144
cutup/cutup.go
144
cutup/cutup.go
@ -10,7 +10,83 @@ import (
|
||||
"github.com/vilmibm/trunkless/db"
|
||||
)
|
||||
|
||||
// TODO generalize so it's not gutenberg specific
|
||||
// CutupOpts configures a call to Cutup.
type CutupOpts struct {
	// SrcDir is the directory whose entries are cut up.
	SrcDir string
	// CutupDir is the output directory. Cutup creates it with os.Mkdir, so
	// it must not already exist.
	CutupDir string
	// NumWorkers is the number of worker goroutines Cutup starts.
	NumWorkers int
	// Flavor selects the header/footer detection used by the workers.
	// "gutenberg" enables the Project Gutenberg checks; any other value
	// (including "") uses the defaults.
	Flavor string

	// headerEndCheck reports whether a line marks the end of a file's
	// header. Cutup sets it from Flavor; callers cannot set it directly.
	headerEndCheck func(string) bool
	// footerBeginCheck reports whether a line marks the start of a file's
	// footer. Cutup sets it from Flavor; callers cannot set it directly.
	footerBeginCheck func(string) bool
}
|
||||
|
||||
// defaultHeaderEndCheck is the header check Cutup installs for every flavor
// other than "gutenberg". It reports true for any line, regardless of its
// content.
func defaultHeaderEndCheck(_ string) bool {
	return true
}
|
||||
// defaultFooterBeginCheck is the footer check Cutup installs for every flavor
// other than "gutenberg". It reports false for any line, regardless of its
// content.
func defaultFooterBeginCheck(_ string) bool {
	return false
}
|
||||
|
||||
// gutenbergHeaderEndCheck reports whether s is the line that ends a Project
// Gutenberg header, i.e. whether it begins with "*** START". The match is
// exact: leading whitespace or different casing does not match.
func gutenbergHeaderEndCheck(s string) bool {
	return strings.HasPrefix(s, "*** START")
}
|
||||
|
||||
// gutenbergFooterBeginCheck reports whether s is the line that begins a
// Project Gutenberg footer, i.e. whether it begins with "*** END". The match
// is exact: leading whitespace or different casing does not match.
func gutenbergFooterBeginCheck(s string) bool {
	return strings.HasPrefix(s, "*** END")
}
|
||||
|
||||
func extractGutenbergTitle(s string) string {
|
||||
title, _ := strings.CutPrefix(s, "*** START OF THE PROJECT GUTENBERG")
|
||||
title, _ = strings.CutPrefix(title, " EBOOK")
|
||||
return strings.TrimSpace(strings.Map(rep, title))
|
||||
}
|
||||
|
||||
func Cutup(opts CutupOpts) error {
|
||||
if opts.Flavor == "gutenberg" {
|
||||
opts.headerEndCheck = gutenbergHeaderEndCheck
|
||||
opts.footerBeginCheck = gutenbergFooterBeginCheck
|
||||
} else {
|
||||
opts.headerEndCheck = defaultHeaderEndCheck
|
||||
opts.footerBeginCheck = defaultFooterBeginCheck
|
||||
}
|
||||
err := os.Mkdir(opts.CutupDir, 0775)
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not make '%s': %w", opts.CutupDir, err)
|
||||
}
|
||||
|
||||
src, err := os.Open(opts.SrcDir)
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not open '%s' for reading: %w", opts.SrcDir, err)
|
||||
}
|
||||
|
||||
entries, err := src.Readdirnames(-1)
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not read '%s': %w", opts.SrcDir, err)
|
||||
}
|
||||
|
||||
paths := make(chan string, len(entries))
|
||||
sources := make(chan string, len(entries))
|
||||
|
||||
for x := 0; x < opts.NumWorkers; x++ {
|
||||
go worker(opts, paths, sources)
|
||||
}
|
||||
|
||||
for _, e := range entries {
|
||||
paths <- path.Join(opts.SrcDir, e)
|
||||
}
|
||||
close(paths)
|
||||
|
||||
ixPath := path.Join(opts.CutupDir, "_title_index.csv")
|
||||
ixFile, err := os.Create(ixPath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not open '%s': %w", ixPath, err)
|
||||
}
|
||||
defer ixFile.Close()
|
||||
|
||||
for i := 0; i < len(entries); i++ {
|
||||
l := <-sources
|
||||
fmt.Printf("%d/%d\r", i+1, len(entries))
|
||||
fmt.Fprintln(ixFile, l)
|
||||
}
|
||||
close(sources)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func worker(opts CutupOpts, paths <-chan string, sources chan<- string) {
|
||||
for p := range paths {
|
||||
@ -34,18 +110,19 @@ func worker(opts CutupOpts, paths <-chan string, sources chan<- string) {
|
||||
|
||||
for s.Scan() {
|
||||
text = strings.TrimSpace(s.Text())
|
||||
if strings.HasPrefix(text, "*** START") {
|
||||
title, _ = strings.CutPrefix(text, "*** START OF THE PROJECT GUTENBERG")
|
||||
title, _ = strings.CutPrefix(title, " EBOOK")
|
||||
title = strings.Map(rep, title)
|
||||
title = strings.TrimSpace(title)
|
||||
inHeader = false
|
||||
if opts.headerEndCheck(text) {
|
||||
if opts.Flavor == "gutenberg" {
|
||||
title = extractGutenbergTitle(text)
|
||||
continue
|
||||
} else {
|
||||
title = path.Base(p)
|
||||
}
|
||||
inHeader = false
|
||||
}
|
||||
if inHeader {
|
||||
continue
|
||||
}
|
||||
if strings.HasPrefix(text, "*** END") {
|
||||
if opts.footerBeginCheck(text) {
|
||||
break
|
||||
}
|
||||
if title == "" {
|
||||
@ -216,54 +293,3 @@ func clean(bs []byte) string {
|
||||
|
||||
return s
|
||||
}
|
||||
|
||||
// CutupOpts configures a call to Cutup.
type CutupOpts struct {
	// SrcDir is the directory whose entries are cut up.
	SrcDir string
	// CutupDir is the output directory. Cutup creates it with os.Mkdir, so
	// it must not already exist.
	CutupDir string
	// NumWorkers is the number of worker goroutines Cutup starts.
	NumWorkers int
}
|
||||
|
||||
func Cutup(opts CutupOpts) error {
|
||||
err := os.Mkdir(opts.CutupDir, 0775)
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not make '%s': %w", opts.CutupDir, err)
|
||||
}
|
||||
|
||||
src, err := os.Open(opts.SrcDir)
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not open '%s' for reading: %w", opts.SrcDir, err)
|
||||
}
|
||||
|
||||
entries, err := src.Readdirnames(-1)
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not read '%s': %w", opts.SrcDir, err)
|
||||
}
|
||||
|
||||
paths := make(chan string, len(entries))
|
||||
sources := make(chan string, len(entries))
|
||||
|
||||
for x := 0; x < opts.NumWorkers; x++ {
|
||||
go worker(opts, paths, sources)
|
||||
}
|
||||
|
||||
for _, e := range entries {
|
||||
paths <- path.Join(opts.SrcDir, e)
|
||||
}
|
||||
close(paths)
|
||||
|
||||
ixPath := path.Join(opts.CutupDir, "_title_index.csv")
|
||||
ixFile, err := os.Create(ixPath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not open '%s': %w", ixPath, err)
|
||||
}
|
||||
defer ixFile.Close()
|
||||
|
||||
for i := 0; i < len(entries); i++ {
|
||||
l := <-sources
|
||||
fmt.Printf("%d/%d\r", i+1, len(entries))
|
||||
fmt.Fprintln(ixFile, l)
|
||||
}
|
||||
close(sources)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user