From f55e1482d2210555c7246bf08098654526c6189c Mon Sep 17 00:00:00 2001
From: nate smith
Date: Sun, 28 Apr 2024 20:56:28 -0700
Subject: [PATCH] cmd: add cutup flags and call cutup.Cutup with CutupOpts

---
 cmd/cutup.go   | 21 +++++++++++++++++----
 cmd/ingest.go  | 31 +------------------------------
 cutup/cutup.go | 43 -------------------------------------------
 3 files changed, 18 insertions(+), 77 deletions(-)

diff --git a/cmd/cutup.go b/cmd/cutup.go
index 65fd996..fdd43c8 100644
--- a/cmd/cutup.go
+++ b/cmd/cutup.go
@@ -6,15 +6,28 @@ import (
 )
 
 func init() {
+    cutupCmd.Flags().StringP("cutupdir", "d", "", "directory in which to write phrase files")
+    cutupCmd.Flags().StringP("srcdir", "s", "", "directory of files to cut up")
+    cutupCmd.Flags().IntP("workers", "w", 10, "number of workers to use when cutting up")
+
+    cutupCmd.MarkFlagRequired("cutupdir")
+    cutupCmd.MarkFlagRequired("srcdir")
+
     rootCmd.AddCommand(cutupCmd)
 }
 
 var cutupCmd = &cobra.Command{
-    Use:   "cutup [prefix]",
+    Use:   "cutup",
     Args:  cobra.MaximumNArgs(1),
     RunE: func(cmd *cobra.Command, args []string) error {
-        // TODO arg for source file path
-        // TODO arg for target path
-        return cutup.CutupFiles()
+        cutupdir, _ := cmd.Flags().GetString("cutupdir")
+        srcdir, _ := cmd.Flags().GetString("srcdir")
+        workers, _ := cmd.Flags().GetInt("workers")
+        opts := cutup.CutupOpts{
+            CutupDir:   cutupdir,
+            SrcDir:     srcdir,
+            NumWorkers: workers,
+        }
+        return cutup.Cutup(opts)
     },
 }
diff --git a/cmd/ingest.go b/cmd/ingest.go
index b859db5..e0f2609 100644
--- a/cmd/ingest.go
+++ b/cmd/ingest.go
@@ -7,8 +7,6 @@ import (
 )
 
 func init() {
-    // TODO option for cutupDir
-
     ingestCmd.Flags().StringP("cutupdir", "d", "", "directory to files produced by cutup cmd")
     ingestCmd.MarkFlagRequired("cutupdir")
     rootCmd.AddCommand(ingestCmd)
@@ -19,7 +17,7 @@ var ingestCmd = &cobra.Command{
     Short: "ingest already cut-up corpora from disk into database",
     Args:  cobra.ExactArgs(1),
     RunE: func(cmd *cobra.Command, args []string) error {
-        cutupDir := cmd.Flags().Lookup("cutupdir").Value.String()
+        cutupDir, _ := cmd.Flags().GetString("cutupdir")
         corpus := args[0]
 
         conn, err := db.Connect()
@@ -36,30 +34,3 @@ var ingestCmd = &cobra.Command{
         return ingest.Ingest(opts)
     },
 }
-
-// thoughts
-//
-// having multitenancy in the db makes phrase selection harder. i need to determine the ID offsets for each corpus's phrase list.
-// currently waiting on an explain analyze for:
-// explain analyze select min(p.id),max(p.id) from phrases p join sources s on s.id = p.sourceid and s.corpusid='cb20c3e';
-// planning time 12ms
-// exec time 91s
-// trying again with inner join which was fast but not noticeably; the explain looks the same (which makes sense--no rows with null allowed are involved).
-
-// if i stick with this i can expect several minutes(!) of startup time to the server; however, since i'm generating ID lookups outside of sql, my lookup should still be O(1).
-// some options:
-// - change everything so every corpus is in its own table:
-//     ${corpus}_phrases: id, sourceid, text
-//     corpora: id, name
-//     sources: id, corpusid, name
-// - cache the result of the min/max id analysis. i could do this to disk or in the db...i would probably do it in the db:
-//     id_ranges: corpusid, minid, maxid
-
-// thinking about this more, as i add corpora the phrases table is going to
-// grow into the billions (assuming other sources are similar in scale to
-// gutenberg). turns out postgresql has table partitioning but idk if that will
-// help me since the ID space will be shared.
-
-// having a table per corpus's phrases will also make tearing down corpora easier -- otherwise i have to regen the entire phrases table to remove gaps in ID space.
-
-// so it's settled; I'm going to retool for table-per-corpus.
diff --git a/cutup/cutup.go b/cutup/cutup.go
index a14b1a0..3e1b799 100644
--- a/cutup/cutup.go
+++ b/cutup/cutup.go
@@ -138,49 +138,6 @@ func worker(paths <-chan string, sources chan<- string) {
     }
 }
 
-func CutupFiles() error {
-    err := os.Mkdir(tgtDir, 0770)
-    if err != nil {
-        return err
-    }
-
-    dir, err := os.Open(srcDir)
-    if err != nil {
-        return fmt.Errorf("could not open %s: %w", srcDir, err)
-    }
-    entries, err := dir.Readdirnames(-1)
-    if err != nil {
-        return fmt.Errorf("could not read %s: %w", srcDir, err)
-    }
-
-    paths := make(chan string, len(entries))
-    sources := make(chan string, len(entries))
-
-    for x := 0; x < workers; x++ {
-        go worker(paths, sources)
-    }
-
-    for _, e := range entries {
-        paths <- path.Join(srcDir, e)
-    }
-    close(paths)
-
-    ixFile, err := os.Create(path.Join(tgtDir, "_title_index.tsv"))
-    if err != nil {
-        return fmt.Errorf("could not open index file: %w", err)
-    }
-    defer ixFile.Close()
-
-    for i := 0; i < len(entries); i++ {
-        l := <-sources
-        fmt.Printf("%d/%d\r", i+1, len(entries))
-        fmt.Fprintln(ixFile, l)
-    }
-    close(sources)
-
-    return nil
-}
-
 func conjPrep(phraseBuff []byte, r rune) int {
     if r != ' ' {
         return -1
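
The RunE above now hands a cutup.CutupOpts to cutup.Cutup, but neither appears in this patch. A minimal sketch of the receiving side, assuming Cutup simply parameterizes the body of the removed CutupFiles (the field names CutupDir, SrcDir, and NumWorkers are taken from cmd/cutup.go above; everything else is carried over from the deleted function and is not part of this change):

// cutup/cutup.go -- sketch only, not part of this patch
package cutup

import (
    "fmt"
    "os"
    "path"
)

// CutupOpts carries the settings that cmd/cutup.go now collects from flags.
type CutupOpts struct {
    CutupDir   string // directory in which to write phrase files
    SrcDir     string // directory of files to cut up
    NumWorkers int    // number of concurrent cutup workers
}

// Cutup is an options-driven stand-in for the removed CutupFiles: the same
// worker-pool shape, but directories and worker count come from opts instead
// of package-level variables.
func Cutup(opts CutupOpts) error {
    if err := os.Mkdir(opts.CutupDir, 0770); err != nil {
        return err
    }

    dir, err := os.Open(opts.SrcDir)
    if err != nil {
        return fmt.Errorf("could not open %s: %w", opts.SrcDir, err)
    }
    entries, err := dir.Readdirnames(-1)
    if err != nil {
        return fmt.Errorf("could not read %s: %w", opts.SrcDir, err)
    }

    paths := make(chan string, len(entries))
    sources := make(chan string, len(entries))

    // worker is the existing helper kept by this patch (see the hunk above).
    for x := 0; x < opts.NumWorkers; x++ {
        go worker(paths, sources)
    }

    for _, e := range entries {
        paths <- path.Join(opts.SrcDir, e)
    }
    close(paths)

    ixFile, err := os.Create(path.Join(opts.CutupDir, "_title_index.tsv"))
    if err != nil {
        return fmt.Errorf("could not open index file: %w", err)
    }
    defer ixFile.Close()

    for i := 0; i < len(entries); i++ {
        l := <-sources
        fmt.Printf("%d/%d\r", i+1, len(entries))
        fmt.Fprintln(ixFile, l)
    }
    close(sources)

    return nil
}

Under that assumption, the subcommand would be run along the lines of: cutup -s ./corpus-src -d ./cutup -w 20 (the paths and worker count here are made-up examples).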
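
The notes removed from cmd/ingest.go end by settling on a table-per-corpus layout. Purely as an illustration of that direction (nothing below is in the patch; the column names come from the deleted notes, while the function name, the *sql.DB handle, and the exact DDL are assumptions):

// db/corpus.go -- hypothetical sketch of the "${corpus}_phrases" layout
package db

import (
    "database/sql"
    "fmt"
)

// createCorpusPhrases gives each corpus its own phrases table, so tearing a
// corpus down is a DROP TABLE instead of rewriting a shared phrases table to
// close gaps in the id space.
func createCorpusPhrases(conn *sql.DB, corpusID string) error {
    // corpusID is an internal short hash like "cb20c3e", not user input.
    table := fmt.Sprintf("%s_phrases", corpusID)
    ddl := fmt.Sprintf(`CREATE TABLE IF NOT EXISTS %q (
        id       BIGSERIAL PRIMARY KEY,
        sourceid BIGINT REFERENCES sources(id),
        text     TEXT NOT NULL
    )`, table)
    _, err := conn.Exec(ddl)
    return err
}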