cmd stuff
This commit is contained in:
parent
591d169fc0
commit
f55e1482d2
21
cmd/cutup.go
21
cmd/cutup.go
@ -6,15 +6,28 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func init() {
|
func init() {
|
||||||
|
rootCmd.Flags().StringP("cutupdir", "d", "", "directory in which to write phrase files")
|
||||||
|
rootCmd.Flags().StringP("srcdir", "s", "", "directory of files to cut up")
|
||||||
|
rootCmd.Flags().IntP("workers", "w", 10, "number of workers to use when cutting up")
|
||||||
|
|
||||||
|
rootCmd.MarkFlagRequired("cutupdir")
|
||||||
|
rootCmd.MarkFlagRequired("srcdir")
|
||||||
|
|
||||||
rootCmd.AddCommand(cutupCmd)
|
rootCmd.AddCommand(cutupCmd)
|
||||||
}
|
}
|
||||||
|
|
||||||
var cutupCmd = &cobra.Command{
|
var cutupCmd = &cobra.Command{
|
||||||
Use: "cutup [prefix]",
|
Use: "cutup",
|
||||||
Args: cobra.MaximumNArgs(1),
|
Args: cobra.MaximumNArgs(1),
|
||||||
RunE: func(cmd *cobra.Command, args []string) error {
|
RunE: func(cmd *cobra.Command, args []string) error {
|
||||||
// TODO arg for source file path
|
cutupdir, _ := cmd.Flags().GetString("cutupdir")
|
||||||
// TODO arg for target path
|
srcdir, _ := cmd.Flags().GetString("srcdir")
|
||||||
return cutup.CutupFiles()
|
workers, _ := cmd.Flags().GetInt("workers")
|
||||||
|
opts := cutup.CutupOpts{
|
||||||
|
CutupDir: cutupdir,
|
||||||
|
SrcDir: srcdir,
|
||||||
|
NumWorkers: workers,
|
||||||
|
}
|
||||||
|
return cutup.Cutup(opts)
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
@ -7,8 +7,6 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func init() {
|
func init() {
|
||||||
// TODO option for cutupDir
|
|
||||||
|
|
||||||
ingestCmd.Flags().StringP("cutupdir", "d", "", "directory to files produced by cutup cmd")
|
ingestCmd.Flags().StringP("cutupdir", "d", "", "directory to files produced by cutup cmd")
|
||||||
ingestCmd.MarkFlagRequired("cutupdir")
|
ingestCmd.MarkFlagRequired("cutupdir")
|
||||||
rootCmd.AddCommand(ingestCmd)
|
rootCmd.AddCommand(ingestCmd)
|
||||||
@ -19,7 +17,7 @@ var ingestCmd = &cobra.Command{
|
|||||||
Short: "ingest already cut-up corpora from disk into database",
|
Short: "ingest already cut-up corpora from disk into database",
|
||||||
Args: cobra.ExactArgs(1),
|
Args: cobra.ExactArgs(1),
|
||||||
RunE: func(cmd *cobra.Command, args []string) error {
|
RunE: func(cmd *cobra.Command, args []string) error {
|
||||||
cutupDir := cmd.Flags().Lookup("cutupdir").Value.String()
|
cutupDir, _ := cmd.Flags().GetString("cutupdir")
|
||||||
corpus := args[0]
|
corpus := args[0]
|
||||||
|
|
||||||
conn, err := db.Connect()
|
conn, err := db.Connect()
|
||||||
@ -36,30 +34,3 @@ var ingestCmd = &cobra.Command{
|
|||||||
return ingest.Ingest(opts)
|
return ingest.Ingest(opts)
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
// thoughts
|
|
||||||
//
|
|
||||||
// having multitenancy in the db makes phrase selection harder. i need to determine the ID offsets for each corpus's phrase list.
|
|
||||||
// currently waiting on an explain analyze for:
|
|
||||||
// explain analyze select min(p.id),max(p.id) from phrases p join sources s on s.id = p.sourceid and s.corpusid='cb20c3e';
|
|
||||||
// planning time 12ms
|
|
||||||
// exec time 91s
|
|
||||||
// trying again with an inner join, which was faster but not noticeably so; the explain looks the same (which makes sense -- no nullable rows are involved).
|
|
||||||
|
|
||||||
// if i stick with this i can expect several minutes(!) of startup time for the server; however, since i'm generating ID lookups outside of sql, my lookup should still be O(1).
|
|
||||||
// some options:
|
|
||||||
// - change everything so every corpus is in its own table:
|
|
||||||
// ${corpus}_phrases: id, sourceid, text
|
|
||||||
// corpora: id, name
|
|
||||||
// sources: id, corpusid, name
|
|
||||||
// - cache the result of the min/max id analysis. i could do this to disk or in the db...i would probably do it in the db:
|
|
||||||
// id_ranges: corpusid, minid, maxid
|
|
||||||
|
|
||||||
// thinking about this more, as i add corpora the phrases table is going to
|
|
||||||
// grow into the billions (assuming other sources are similar in scale to
|
|
||||||
// gutenberg). turns out postgresql has table partitioning, but i don't know if that will
|
|
||||||
// help me since the ID space will be shared.
|
|
||||||
|
|
||||||
// having a separate phrases table per corpus will also make tearing down corpora easier -- otherwise i have to regenerate the entire phrases table to remove gaps in the ID space.
|
|
||||||
|
|
||||||
// so it's settled; I'm going to retool for table-per-corpus.
|
|
||||||
|
@ -138,49 +138,6 @@ func worker(paths <-chan string, sources chan<- string) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func CutupFiles() error {
|
|
||||||
err := os.Mkdir(tgtDir, 0770)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
dir, err := os.Open(srcDir)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("could not open %s: %w", srcDir, err)
|
|
||||||
}
|
|
||||||
entries, err := dir.Readdirnames(-1)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("could not read %s: %w", srcDir, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
paths := make(chan string, len(entries))
|
|
||||||
sources := make(chan string, len(entries))
|
|
||||||
|
|
||||||
for x := 0; x < workers; x++ {
|
|
||||||
go worker(paths, sources)
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, e := range entries {
|
|
||||||
paths <- path.Join(srcDir, e)
|
|
||||||
}
|
|
||||||
close(paths)
|
|
||||||
|
|
||||||
ixFile, err := os.Create(path.Join(tgtDir, "_title_index.tsv"))
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("could not open index file: %w", err)
|
|
||||||
}
|
|
||||||
defer ixFile.Close()
|
|
||||||
|
|
||||||
for i := 0; i < len(entries); i++ {
|
|
||||||
l := <-sources
|
|
||||||
fmt.Printf("%d/%d\r", i+1, len(entries))
|
|
||||||
fmt.Fprintln(ixFile, l)
|
|
||||||
}
|
|
||||||
close(sources)
|
|
||||||
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func conjPrep(phraseBuff []byte, r rune) int {
|
func conjPrep(phraseBuff []byte, r rune) int {
|
||||||
if r != ' ' {
|
if r != ' ' {
|
||||||
return -1
|
return -1
|
||||||
|
Loading…
x
Reference in New Issue
Block a user