diff --git a/cmd/ingest.go b/cmd/ingest.go index 3673d9e..b859db5 100644 --- a/cmd/ingest.go +++ b/cmd/ingest.go @@ -1,24 +1,67 @@ package cmd import ( - "fmt" - "github.com/spf13/cobra" + "github.com/vilmibm/trunkless/db" "github.com/vilmibm/trunkless/ingest" ) func init() { + // cutupdir is mandatory: it names the directory of files written by the cutup command + + ingestCmd.Flags().StringP("cutupdir", "d", "", "directory to files produced by cutup cmd") + ingestCmd.MarkFlagRequired("cutupdir") rootCmd.AddCommand(ingestCmd) } var ingestCmd = &cobra.Command{ - Use: "ingest corpusname", - Args: cobra.ExactArgs(1), + Use: "ingest corpusname", + Short: "ingest already cut-up corpora from disk into database", + Args: cobra.ExactArgs(1), RunE: func(cmd *cobra.Command, args []string) error { - switch args[0] { - case "gutenberg": - return ingest.IngestGut() + cutupDir := cmd.Flags().Lookup("cutupdir").Value.String() + corpus := args[0] + + conn, err := db.Connect() + if err != nil { + return err } - return fmt.Errorf("corpus unknown: %s", args[0]) + // this command opened the connection, so it closes it once ingestion returns + defer conn.Close(cmd.Context()) + + opts := ingest.IngestOpts{ + Conn: conn, + CutupDir: cutupDir, + Corpus: corpus, + } + + return ingest.Ingest(opts) }, } + +// thoughts +// +// having multitenancy in the db makes phrase selection harder. i need to determine the ID offsets for each corpus's phrase list. +// currently waiting on an explain analyze for: +// explain analyze select min(p.id),max(p.id) from phrases p join sources s on s.id = p.sourceid and s.corpusid='cb20c3e'; +// planning time 12ms +// exec time 91s +// trying again with inner join which was faster but not noticeably; the explain looks the same (which makes sense--no rows with null allowed are involved). + +// if i stick with this i can expect several minutes(!) of startup time to the server; however, since i'm generating ID lookups outside of sql, my lookup should still be O(1). 
+// some options: +// - change everything so every corpus is in its own table: +// ${corpus}_phrases: id, sourceid, text +// corpora: id, name +// sources: id, corpusid, name +// - cache the result of the min/max id analysis. i could do this to disk or in the db...i would probably do it in the db: +// id_ranges: corpusid, minid, maxid + +// thinking about this more, as i add corpora the phrases table is going to +// grow into the billions (assuming other sources are similar in scale to +// gutenberg). turns out postgresql has table partitioning but idk if that will +// help me since the ID space will be shared. + +// having a table per corpus's phrases will also make tearing down corpora easier -- otherwise i have to regen the entire phrases table to remove gaps in ID space. + +// so it's settled; I'm going to retool for table-per-corpus. diff --git a/db/db.go b/db/db.go index f6a3ab5..709d8e4 100644 --- a/db/db.go +++ b/db/db.go @@ -2,6 +2,7 @@ package db import ( "context" + "crypto/sha1" "fmt" "github.com/jackc/pgx/v5" @@ -13,6 +14,11 @@ const ( MaxID = 345507789 ) +// StrToID returns the first six hex characters of the SHA-1 digest of s. +func StrToID(s string) string { + return fmt.Sprintf("%x", sha1.Sum([]byte(s)))[0:6] +} + func Connect() (*pgx.Conn, error) { conn, err := pgx.Connect(context.Background(), "") if err != nil { diff --git a/ingest/ingest.go b/ingest/ingest.go index ffff006..27ad63f 100644 --- a/ingest/ingest.go +++ b/ingest/ingest.go @@ -8,6 +8,7 @@ import ( "path" "strings" + "github.com/jackc/pgx/v5" "github.com/vilmibm/trunkless/db" ) @@ -16,7 +17,9 @@ const cutupDir = "/home/vilmibm/pg_plaintext/cutup" // TODO // - [X] finalize gutenberg ingestion // - [ ] clean up commands -// - [ ] clean up repo +// - [X] get down to just ingest/cutup/serve +// - [ ] add arguments for generalizing +// - [X] clean up repo // - [ ] push and deploy to town with new pg db // - [ ] gamefaqs extraction // - [ ] corpus selector @@ -25,24 +28,20 @@ const cutupDir = "/home/vilmibm/pg_plaintext/cutup" // - [ ] blog post // - [ ] launch 
-func IngestGut() error { - conn, err := db.Connect() - if err != nil { - return err - } - defer conn.Close(context.Background()) +type IngestOpts struct { + Conn *pgx.Conn + Corpus string + CutupDir string +} - dir, err := os.Open(cutupDir) +func Ingest(o IngestOpts) error { + conn := o.Conn + + dir, err := os.Open(o.CutupDir) if err != nil { - return fmt.Errorf("could not open %s: %w", cutupDir, err) + return fmt.Errorf("could not open %s: %w", o.CutupDir, err) } - - // echo gutenberg | sha1sum | head -c7 - corpusid := "cb20c3e" - _, err = conn.Exec(context.Background(), "INSERT INTO corpora (id, name) VALUES ($1, $2) ON CONFLICT DO NOTHING", corpusid, "gutenberg") - if err != nil { - return fmt.Errorf("failed to create gutenberg corpus: %w", err) - } + defer dir.Close() entries, err := dir.Readdirnames(-1) if err != nil { @@ -53,7 +52,15 @@ func IngestGut() error { if err != nil { return fmt.Errorf("failed to open source index: %w", err) } + defer idx.Close() + corpusid := db.StrToID(o.Corpus) + _, err = conn.Exec(context.Background(), + "INSERT INTO corpora (id, name) VALUES ($1, $2) ON CONFLICT DO NOTHING", + corpusid, o.Corpus) + if err != nil { + return fmt.Errorf("failed to create '%s' corpus: %w", o.Corpus, err) + } tx, err := conn.Begin(context.Background()) if err != nil { return fmt.Errorf("could not open transaction: %w", err) @@ -84,6 +91,5 @@ func IngestGut() error { fmt.Fprintf(os.Stderr, "failed to ingest '%s': %s\n", p, err.Error()) } } - return nil }