generalize cutup code
This commit is contained in:
parent
000946c175
commit
c66dbaf013
19
cmd/cutup.go
19
cmd/cutup.go
@ -1,6 +1,8 @@
|
|||||||
package cmd
|
package cmd
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"fmt"
|
||||||
|
|
||||||
"github.com/spf13/cobra"
|
"github.com/spf13/cobra"
|
||||||
"github.com/vilmibm/trunkless/cutup"
|
"github.com/vilmibm/trunkless/cutup"
|
||||||
)
|
)
|
||||||
@ -9,6 +11,7 @@ func init() {
|
|||||||
rootCmd.Flags().StringP("cutupdir", "d", "", "directory in which to write phrase files")
|
rootCmd.Flags().StringP("cutupdir", "d", "", "directory in which to write phrase files")
|
||||||
rootCmd.Flags().StringP("srcdir", "s", "", "directory of files to cut up")
|
rootCmd.Flags().StringP("srcdir", "s", "", "directory of files to cut up")
|
||||||
rootCmd.Flags().IntP("workers", "w", 10, "number of workers to use when cutting up")
|
rootCmd.Flags().IntP("workers", "w", 10, "number of workers to use when cutting up")
|
||||||
|
rootCmd.Flags().StringP("flavor", "f", "", "set of adapters to use when cutting up")
|
||||||
|
|
||||||
rootCmd.MarkFlagRequired("cutupdir")
|
rootCmd.MarkFlagRequired("cutupdir")
|
||||||
rootCmd.MarkFlagRequired("srcdir")
|
rootCmd.MarkFlagRequired("srcdir")
|
||||||
@ -16,6 +19,8 @@ func init() {
|
|||||||
rootCmd.AddCommand(cutupCmd)
|
rootCmd.AddCommand(cutupCmd)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// validFlavors enumerates the values accepted for the "flavor" flag; any other
// non-empty value is rejected before the cutup starts.
var validFlavors = []string{
	"gutenberg",
}
|
||||||
|
|
||||||
var cutupCmd = &cobra.Command{
|
var cutupCmd = &cobra.Command{
|
||||||
Use: "cutup",
|
Use: "cutup",
|
||||||
Args: cobra.MaximumNArgs(1),
|
Args: cobra.MaximumNArgs(1),
|
||||||
@ -23,10 +28,24 @@ var cutupCmd = &cobra.Command{
|
|||||||
cutupdir, _ := cmd.Flags().GetString("cutupdir")
|
cutupdir, _ := cmd.Flags().GetString("cutupdir")
|
||||||
srcdir, _ := cmd.Flags().GetString("srcdir")
|
srcdir, _ := cmd.Flags().GetString("srcdir")
|
||||||
workers, _ := cmd.Flags().GetInt("workers")
|
workers, _ := cmd.Flags().GetInt("workers")
|
||||||
|
flavor, _ := cmd.Flags().GetString("flavor")
|
||||||
|
|
||||||
|
if flavor != "" {
|
||||||
|
valid := false
|
||||||
|
for _, f := range validFlavors {
|
||||||
|
if flavor == f {
|
||||||
|
valid = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !valid {
|
||||||
|
return fmt.Errorf("invalid flavor '%s'; valid flavors: %v", flavor, validFlavors)
|
||||||
|
}
|
||||||
|
}
|
||||||
opts := cutup.CutupOpts{
|
opts := cutup.CutupOpts{
|
||||||
CutupDir: cutupdir,
|
CutupDir: cutupdir,
|
||||||
SrcDir: srcdir,
|
SrcDir: srcdir,
|
||||||
NumWorkers: workers,
|
NumWorkers: workers,
|
||||||
|
Flavor: flavor,
|
||||||
}
|
}
|
||||||
return cutup.Cutup(opts)
|
return cutup.Cutup(opts)
|
||||||
},
|
},
|
||||||
|
144
cutup/cutup.go
144
cutup/cutup.go
@ -10,7 +10,83 @@ import (
|
|||||||
"github.com/vilmibm/trunkless/db"
|
"github.com/vilmibm/trunkless/db"
|
||||||
)
|
)
|
||||||
|
|
||||||
// TODO generalize so it's not gutenberg specific
|
// CutupOpts configures a call to Cutup.
type CutupOpts struct {
	// SrcDir is the directory whose entries are read and cut up.
	SrcDir string
	// CutupDir is the directory that Cutup creates and writes its output to.
	CutupDir string
	// NumWorkers is the number of worker goroutines Cutup starts.
	NumWorkers int
	// Flavor selects the header/footer checks. "gutenberg" is the only value
	// Cutup treats specially; every other value gets the default checks.
	Flavor string

	// headerEndCheck reports whether a line marks the end of a file's header.
	// Cutup assigns it from Flavor.
	headerEndCheck func(string) bool
	// footerBeginCheck reports whether a line marks the start of a file's
	// footer. Cutup assigns it from Flavor.
	footerBeginCheck func(string) bool
}
|
||||||
|
|
||||||
|
// defaultHeaderEndCheck is the header check used for any flavor other than
// "gutenberg". It reports true for every line.
func defaultHeaderEndCheck(_ string) bool {
	return true
}
|
||||||
|
// defaultFooterBeginCheck is the footer check used for any flavor other than
// "gutenberg". It reports false for every line, so no line is ever treated as
// the start of a footer.
func defaultFooterBeginCheck(_ string) bool {
	return false
}
|
||||||
|
|
||||||
|
// gutenbergHeaderEndCheck reports whether s is the line that ends a Project
// Gutenberg header, i.e. whether it begins with "*** START". The match is
// exact and case-sensitive; s is not trimmed here.
func gutenbergHeaderEndCheck(s string) bool {
	return strings.HasPrefix(s, "*** START")
}
|
||||||
|
|
||||||
|
// gutenbergFooterBeginCheck reports whether s is the line that starts a
// Project Gutenberg footer, i.e. whether it begins with "*** END". The match
// is exact and case-sensitive; s is not trimmed here.
func gutenbergFooterBeginCheck(s string) bool {
	return strings.HasPrefix(s, "*** END")
}
|
||||||
|
|
||||||
|
func extractGutenbergTitle(s string) string {
|
||||||
|
title, _ := strings.CutPrefix(s, "*** START OF THE PROJECT GUTENBERG")
|
||||||
|
title, _ = strings.CutPrefix(title, " EBOOK")
|
||||||
|
return strings.TrimSpace(strings.Map(rep, title))
|
||||||
|
}
|
||||||
|
|
||||||
|
func Cutup(opts CutupOpts) error {
|
||||||
|
if opts.Flavor == "gutenberg" {
|
||||||
|
opts.headerEndCheck = gutenbergHeaderEndCheck
|
||||||
|
opts.footerBeginCheck = gutenbergFooterBeginCheck
|
||||||
|
} else {
|
||||||
|
opts.headerEndCheck = defaultHeaderEndCheck
|
||||||
|
opts.footerBeginCheck = defaultFooterBeginCheck
|
||||||
|
}
|
||||||
|
err := os.Mkdir(opts.CutupDir, 0775)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("could not make '%s': %w", opts.CutupDir, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
src, err := os.Open(opts.SrcDir)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("could not open '%s' for reading: %w", opts.SrcDir, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
entries, err := src.Readdirnames(-1)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("could not read '%s': %w", opts.SrcDir, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
paths := make(chan string, len(entries))
|
||||||
|
sources := make(chan string, len(entries))
|
||||||
|
|
||||||
|
for x := 0; x < opts.NumWorkers; x++ {
|
||||||
|
go worker(opts, paths, sources)
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, e := range entries {
|
||||||
|
paths <- path.Join(opts.SrcDir, e)
|
||||||
|
}
|
||||||
|
close(paths)
|
||||||
|
|
||||||
|
ixPath := path.Join(opts.CutupDir, "_title_index.csv")
|
||||||
|
ixFile, err := os.Create(ixPath)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("could not open '%s': %w", ixPath, err)
|
||||||
|
}
|
||||||
|
defer ixFile.Close()
|
||||||
|
|
||||||
|
for i := 0; i < len(entries); i++ {
|
||||||
|
l := <-sources
|
||||||
|
fmt.Printf("%d/%d\r", i+1, len(entries))
|
||||||
|
fmt.Fprintln(ixFile, l)
|
||||||
|
}
|
||||||
|
close(sources)
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func worker(opts CutupOpts, paths <-chan string, sources chan<- string) {
|
func worker(opts CutupOpts, paths <-chan string, sources chan<- string) {
|
||||||
for p := range paths {
|
for p := range paths {
|
||||||
@ -34,18 +110,19 @@ func worker(opts CutupOpts, paths <-chan string, sources chan<- string) {
|
|||||||
|
|
||||||
for s.Scan() {
|
for s.Scan() {
|
||||||
text = strings.TrimSpace(s.Text())
|
text = strings.TrimSpace(s.Text())
|
||||||
if strings.HasPrefix(text, "*** START") {
|
if opts.headerEndCheck(text) {
|
||||||
title, _ = strings.CutPrefix(text, "*** START OF THE PROJECT GUTENBERG")
|
if opts.Flavor == "gutenberg" {
|
||||||
title, _ = strings.CutPrefix(title, " EBOOK")
|
title = extractGutenbergTitle(text)
|
||||||
title = strings.Map(rep, title)
|
continue
|
||||||
title = strings.TrimSpace(title)
|
} else {
|
||||||
|
title = path.Base(p)
|
||||||
|
}
|
||||||
inHeader = false
|
inHeader = false
|
||||||
continue
|
|
||||||
}
|
}
|
||||||
if inHeader {
|
if inHeader {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if strings.HasPrefix(text, "*** END") {
|
if opts.footerBeginCheck(text) {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
if title == "" {
|
if title == "" {
|
||||||
@ -216,54 +293,3 @@ func clean(bs []byte) string {
|
|||||||
|
|
||||||
return s
|
return s
|
||||||
}
|
}
|
||||||
|
|
||||||
type CutupOpts struct {
|
|
||||||
SrcDir string
|
|
||||||
CutupDir string
|
|
||||||
NumWorkers int
|
|
||||||
}
|
|
||||||
|
|
||||||
func Cutup(opts CutupOpts) error {
|
|
||||||
err := os.Mkdir(opts.CutupDir, 0775)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("could not make '%s': %w", opts.CutupDir, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
src, err := os.Open(opts.SrcDir)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("could not open '%s' for reading: %w", opts.SrcDir, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
entries, err := src.Readdirnames(-1)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("could not read '%s': %w", opts.SrcDir, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
paths := make(chan string, len(entries))
|
|
||||||
sources := make(chan string, len(entries))
|
|
||||||
|
|
||||||
for x := 0; x < opts.NumWorkers; x++ {
|
|
||||||
go worker(opts, paths, sources)
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, e := range entries {
|
|
||||||
paths <- path.Join(opts.SrcDir, e)
|
|
||||||
}
|
|
||||||
close(paths)
|
|
||||||
|
|
||||||
ixPath := path.Join(opts.CutupDir, "_title_index.csv")
|
|
||||||
ixFile, err := os.Create(ixPath)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("could not open '%s': %w", ixPath, err)
|
|
||||||
}
|
|
||||||
defer ixFile.Close()
|
|
||||||
|
|
||||||
for i := 0; i < len(entries); i++ {
|
|
||||||
l := <-sources
|
|
||||||
fmt.Printf("%d/%d\r", i+1, len(entries))
|
|
||||||
fmt.Fprintln(ixFile, l)
|
|
||||||
}
|
|
||||||
close(sources)
|
|
||||||
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user