cmd stuff

This commit is contained in:
nate smith 2024-04-28 20:56:28 -07:00
parent 591d169fc0
commit f55e1482d2
3 changed files with 18 additions and 77 deletions

View File

@ -6,15 +6,28 @@ import (
) )
func init() { func init() {
rootCmd.Flags().StringP("cutupdir", "d", "", "directory in which to write phrase files")
rootCmd.Flags().StringP("srcdir", "s", "", "directory of files to cut up")
rootCmd.Flags().IntP("workers", "w", 10, "number of workers to use when cutting up")
rootCmd.MarkFlagRequired("cutupdir")
rootCmd.MarkFlagRequired("srcdir")
rootCmd.AddCommand(cutupCmd) rootCmd.AddCommand(cutupCmd)
} }
var cutupCmd = &cobra.Command{ var cutupCmd = &cobra.Command{
Use: "cutup [prefix]", Use: "cutup",
Args: cobra.MaximumNArgs(1), Args: cobra.MaximumNArgs(1),
RunE: func(cmd *cobra.Command, args []string) error { RunE: func(cmd *cobra.Command, args []string) error {
// TODO arg for source file path cutupdir, _ := cmd.Flags().GetString("cutupdir")
// TODO arg for target path srcdir, _ := cmd.Flags().GetString("srcdir")
return cutup.CutupFiles() workers, _ := cmd.Flags().GetInt("workers")
opts := cutup.CutupOpts{
CutupDir: cutupdir,
SrcDir: srcdir,
NumWorkers: workers,
}
return cutup.Cutup(opts)
}, },
} }

View File

@ -7,8 +7,6 @@ import (
) )
func init() { func init() {
// TODO option for cutupDir
ingestCmd.Flags().StringP("cutupdir", "d", "", "directory to files produced by cutup cmd") ingestCmd.Flags().StringP("cutupdir", "d", "", "directory to files produced by cutup cmd")
ingestCmd.MarkFlagRequired("cutupdir") ingestCmd.MarkFlagRequired("cutupdir")
rootCmd.AddCommand(ingestCmd) rootCmd.AddCommand(ingestCmd)
@ -19,7 +17,7 @@ var ingestCmd = &cobra.Command{
Short: "ingest already cut-up corpora from disk into database", Short: "ingest already cut-up corpora from disk into database",
Args: cobra.ExactArgs(1), Args: cobra.ExactArgs(1),
RunE: func(cmd *cobra.Command, args []string) error { RunE: func(cmd *cobra.Command, args []string) error {
cutupDir := cmd.Flags().Lookup("cutupdir").Value.String() cutupDir, _ := cmd.Flags().GetString("cutupdir")
corpus := args[0] corpus := args[0]
conn, err := db.Connect() conn, err := db.Connect()
@ -36,30 +34,3 @@ var ingestCmd = &cobra.Command{
return ingest.Ingest(opts) return ingest.Ingest(opts)
}, },
} }
// thoughts
//
// having multitenancy in the db makes phrase selection harder. i need to determine the ID offsets for each corpus's phrase list.
// currently waiting on an explain analyze for:
// explain analyze select min(p.id),max(p.id) from phrases p join sources s on s.id = p.sourceid and s.corpusid='cb20c3e';
// planning time 12ms
// exec time 91s
// trying again with an explicit inner join: it was faster, but not noticeably so; the explain plan looks the same (which makes sense--no nullable rows are involved, so the planner treats both joins identically).
// if i stick with this i can expect several minutes(!) of startup time to the server; however, since i'm generating ID lookups outside of sql, my lookup should still be O(1).
// some options:
// - change everything so every corpus is in its own table:
// ${corpus}_phrases: id, sourceid, text
// corpora: id, name
// sources: id, corpusid, name
// - cache the result of the min/max id analysis. i could do this to disk or in the db...i would probably do it in the db:
// id_ranges: corpusid, minid, maxid
// thinking about this more, as i add corpora the phrases table is going to
// grow into the billions (assuming other sources are similar in scale to
// gutenberg). turns out postgresql has table partitioning but idk if that will
// help me since the ID space will be shared.
// having a table per corpus's phrases will also make tearing down corpora easier -- otherwise i have to regen the entire phrases table to remove gaps in ID space.
// so it's settled; I'm going to retool for table-per-corpus.

View File

@ -138,49 +138,6 @@ func worker(paths <-chan string, sources chan<- string) {
} }
} }
// CutupFiles cuts up every file in srcDir into phrase files under tgtDir,
// fanning the work out across `workers` goroutines, and writes a
// tab-separated title index (_title_index.tsv) into tgtDir as results
// arrive. It returns an error if tgtDir cannot be created or srcDir
// cannot be read.
//
// NOTE(review): relies on package-level tgtDir, srcDir, and workers, and
// assumes worker() sends exactly one line on sources per path it receives
// — confirm against worker's implementation.
func CutupFiles() error {
	// Fail fast if the target directory already exists (Mkdir errors).
	if err := os.Mkdir(tgtDir, 0770); err != nil {
		return err
	}
	dir, err := os.Open(srcDir)
	if err != nil {
		return fmt.Errorf("could not open %s: %w", srcDir, err)
	}
	// Fix: the directory handle was previously leaked.
	defer dir.Close()
	entries, err := dir.Readdirnames(-1)
	if err != nil {
		return fmt.Errorf("could not read %s: %w", srcDir, err)
	}
	// Buffer both channels to len(entries) so neither producers nor
	// workers ever block on channel capacity.
	paths := make(chan string, len(entries))
	sources := make(chan string, len(entries))
	for x := 0; x < workers; x++ {
		go worker(paths, sources)
	}
	for _, e := range entries {
		paths <- path.Join(srcDir, e)
	}
	close(paths)
	ixFile, err := os.Create(path.Join(tgtDir, "_title_index.tsv"))
	if err != nil {
		return fmt.Errorf("could not open index file: %w", err)
	}
	defer ixFile.Close()
	// Collect exactly one index line per entry; the count doubles as the
	// completion barrier for the worker pool.
	for i := 0; i < len(entries); i++ {
		l := <-sources
		fmt.Printf("%d/%d\r", i+1, len(entries))
		fmt.Fprintln(ixFile, l)
	}
	// Fix: do not close(sources) here — only the sender should close a
	// channel, and the workers are the senders. Leaving it open is safe;
	// it is reclaimed when the workers exit.
	return nil
}
func conjPrep(phraseBuff []byte, r rune) int { func conjPrep(phraseBuff []byte, r rune) int {
if r != ' ' { if r != ' ' {
return -1 return -1