cmd stuff

2024-04-28 20:56:28 -07:00 · 2024-04-28 20:56:28 -07:00 · f55e1482d2
commit f55e1482d2
parent 591d169fc0
3 changed files with 18 additions and 77 deletions
--- a/cmd/cutup.go
+++ b/cmd/cutup.go
@@ -6,15 +6,28 @@ import (
 )

 func init() {
+	cutupCmd.Flags().StringP("cutupdir", "d", "", "directory in which to write phrase files")
+	cutupCmd.Flags().StringP("srcdir", "s", "", "directory of files to cut up")
+	cutupCmd.Flags().IntP("workers", "w", 10, "number of workers to use when cutting up")
+	// The flags above live on cutupCmd itself: its RunE reads them via cmd.Flags(), which does not include rootCmd's local flags.
+	cutupCmd.MarkFlagRequired("cutupdir")
+	cutupCmd.MarkFlagRequired("srcdir")
+
 	rootCmd.AddCommand(cutupCmd)
 }

 var cutupCmd = &cobra.Command{
-	Use:  "cutup [prefix]",
+	Use:  "cutup",
 	Args: cobra.MaximumNArgs(1),
 	RunE: func(cmd *cobra.Command, args []string) error {
-		// TODO arg for source file path
-		// TODO arg for target path
-		return cutup.CutupFiles()
+		flags := cmd.Flags() // lookup errors from the accessors below are discarded
+		dst, _ := flags.GetString("cutupdir")
+		src, _ := flags.GetString("srcdir")
+		nWorkers, _ := flags.GetInt("workers")
+		return cutup.Cutup(cutup.CutupOpts{
+			CutupDir:   dst,
+			SrcDir:     src,
+			NumWorkers: nWorkers,
+		})
 	},
 }
--- a/cmd/ingest.go
+++ b/cmd/ingest.go
@@ -7,8 +7,6 @@ import (
 )

 func init() {
-	// TODO option for cutupDir
-
 	ingestCmd.Flags().StringP("cutupdir", "d", "", "directory to files produced by cutup cmd")
 	ingestCmd.MarkFlagRequired("cutupdir")
 	rootCmd.AddCommand(ingestCmd)
@@ -19,7 +17,7 @@ var ingestCmd = &cobra.Command{
 	Short: "ingest already cut-up corpora from disk into database",
 	Args:  cobra.ExactArgs(1),
 	RunE: func(cmd *cobra.Command, args []string) error {
-		cutupDir := cmd.Flags().Lookup("cutupdir").Value.String()
+		cutupDir, _ := cmd.Flags().GetString("cutupdir")
 		corpus := args[0]

 		conn, err := db.Connect()
@@ -36,30 +34,3 @@ var ingestCmd = &cobra.Command{
 		return ingest.Ingest(opts)
 	},
 }
-
-// thoughts
-//
-// having multitenancy in the db makes phrase selection harder. i need to determine the ID offsets for each corpus's phrase list.
-// currently waiting on an explain analyze for:
-// explain analyze select min(p.id),max(p.id) from phrases p join sources s on s.id = p.sourceid and s.corpusid='cb20c3e';
-// planning time 12ms
-// exec time 91s
-// trying again with inner join which was fast but not noticeably; the explain looks the same (which makes sense--no rows with null allowed are involved).
-
-// if i stick with this i can expect several minutes(!) of startup time to the server; however, since i'm generating ID lookups outside of sql, my lookup should still be O(1).
-// some options:
-// - change everything so every corpus is in its own table:
-//   ${corpus}_phrases: id, sourceid, text
-//   corpora: id, name
-//   sources: id, corpusid, name
-// - cache the result of the min/max id analysis. i could do this to disk or in the db...i would probably do it in the db:
-//   id_ranges: corpusid, minid, maxid
-
-// thinking about this more, as i add corpora the phrases table is going to
-// grow into the billions (assuming other sources are similar in scale to
-// gutenberg). turns out postgresql has table partitioning but idk if that will
-// help me since the ID space will be shared.
-
-// having a table per corpus's phrases will also make tearing down corpora easier -- otherwise i have to regen the entire phrases table to remove gaps in ID space.
-
-// so it's settled; I'm going to retool for table-per-corpus.
--- a/cutup/cutup.go
+++ b/cutup/cutup.go
@@ -138,49 +138,6 @@ func worker(paths <-chan string, sources chan<- string) {
 	}
 }

-func CutupFiles() error {
-	err := os.Mkdir(tgtDir, 0770)
-	if err != nil {
-		return err
-	}
-
-	dir, err := os.Open(srcDir)
-	if err != nil {
-		return fmt.Errorf("could not open %s: %w", srcDir, err)
-	}
-	entries, err := dir.Readdirnames(-1)
-	if err != nil {
-		return fmt.Errorf("could not read %s: %w", srcDir, err)
-	}
-
-	paths := make(chan string, len(entries))
-	sources := make(chan string, len(entries))
-
-	for x := 0; x < workers; x++ {
-		go worker(paths, sources)
-	}
-
-	for _, e := range entries {
-		paths <- path.Join(srcDir, e)
-	}
-	close(paths)
-
-	ixFile, err := os.Create(path.Join(tgtDir, "_title_index.tsv"))
-	if err != nil {
-		return fmt.Errorf("could not open index file: %w", err)
-	}
-	defer ixFile.Close()
-
-	for i := 0; i < len(entries); i++ {
-		l := <-sources
-		fmt.Printf("%d/%d\r", i+1, len(entries))
-		fmt.Fprintln(ixFile, l)
-	}
-	close(sources)
-
-	return nil
-}
-
 func conjPrep(phraseBuff []byte, r rune) int {
 	if r != ' ' {
 		return -1